Merge remote-tracking branch 'occam/opencl-dev' into concedo_experimental

# Conflicts:
#	.github/workflows/build.yml
#	CMakeLists.txt
#	Makefile
#	README.md
#	ggml-opencl.cpp
#	llama.cpp
#	otherarch/ggml_v2-opencl-legacy.c
Committed by Concedo on 2023-05-22 16:16:48 +08:00 (commit 981d5ba866)
14 changed files with 941 additions and 321 deletions

BLIS.md (new file, 67 lines)

@ -0,0 +1,67 @@
BLIS Installation Manual
------------------------
BLIS is a portable software framework for high-performance BLAS-like dense linear algebra libraries. It has received awards and recognition, including the 2023 James H. Wilkinson Prize for Numerical Software and the 2020 SIAM Activity Group on Supercomputing Best Paper Prize. BLIS provides a new BLAS-like API as well as compatibility layers for traditional BLAS and CBLAS routine calls, exposed through both an object-based API and a typed API.
Project URL: https://github.com/flame/blis
### Prepare:
Compile BLIS:
```bash
git clone https://github.com/flame/blis
cd blis
./configure --enable-cblas -t openmp,pthreads auto
# will install to /usr/local/ by default.
make -j
```
Install BLIS:
```bash
sudo make install
```
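If the linker later cannot find `libblis`, refreshing the shared-library cache usually resolves it (a generic post-install step on Linux, not something BLIS-specific):
```bash
# refresh the dynamic linker cache so the freshly installed libblis in /usr/local/lib is picked up
sudo ldconfig
```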
We recommend the openmp backend, since it makes it easier to control how many cores are used.
### llama.cpp compilation
Makefile:
```bash
make LLAMA_BLIS=1 -j
# make LLAMA_BLIS=1 benchmark-matmult
```
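As a quick, informal sanity check (not part of the upstream instructions), the benchmark binary built by the commented line above can be run directly:
```bash
# run ggml's matrix multiplication benchmark and compare the reported gFLOPS with a non-BLIS build
./benchmark-matmult
```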
CMake:
```bash
mkdir build
cd build
cmake -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=FLAME ..
make -j
```
### llama.cpp execution
According to the BLIS documentation, the following environment variables can be set to control the OpenMP behavior:
```bash
export GOMP_CPU_AFFINITY="0-19"
export BLIS_NUM_THREADS=14
```
And then run the binaries as normal.
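For example, a run pinned to the first 20 cores with 14 BLIS threads might look like this (the model path is a placeholder, substitute your own):
```bash
export GOMP_CPU_AFFINITY="0-19"
export BLIS_NUM_THREADS=14
# ./models/7B/ggml-model-q4_0.bin is a hypothetical path; point -m at your own model file
./main -m ./models/7B/ggml-model-q4_0.bin -p "Hello" -t 14
```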
### Intel specific issue
Some users might get an error message saying that `libimf.so` cannot be found. If that happens, please follow this [stackoverflow page](https://stackoverflow.com/questions/70687930/intel-oneapi-2022-libimf-so-no-such-file-or-directory-during-openmpi-compila) to resolve it.
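In practice the missing `libimf.so` comes from the Intel compiler runtime not being on the loader path; assuming a default oneAPI installation, sourcing its environment script before building or running is the usual fix:
```bash
# make the Intel oneAPI runtime libraries (including libimf.so) visible to the dynamic loader
source /opt/intel/oneapi/setvars.sh
```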
### Reference:
1. https://github.com/flame/blis#getting-started
2. https://github.com/flame/blis/blob/master/docs/Multithreading.md


@ -51,14 +51,14 @@ extern "C"
//first digit is whether configured, second is platform, third is devices //first digit is whether configured, second is platform, third is devices
int parseinfo = inputs.clblast_info; int parseinfo = inputs.clblast_info;
std::string usingclblast = "GGML_CLBLAST_CONFIGURED="+std::to_string(parseinfo>0?1:0); std::string usingclblast = "GGML_OPENCL_CONFIGURED="+std::to_string(parseinfo>0?1:0);
putenv((char*)usingclblast.c_str()); putenv((char*)usingclblast.c_str());
parseinfo = parseinfo%100; //keep last 2 digits parseinfo = parseinfo%100; //keep last 2 digits
int platform = parseinfo/10; int platform = parseinfo/10;
int devices = parseinfo%10; int devices = parseinfo%10;
platformenv = "GGML_CLBLAST_PLATFORM="+std::to_string(platform); platformenv = "GGML_OPENCL_PLATFORM="+std::to_string(platform);
deviceenv = "GGML_CLBLAST_DEVICE="+std::to_string(devices); deviceenv = "GGML_OPENCL_DEVICE="+std::to_string(devices);
putenv((char*)platformenv.c_str()); putenv((char*)platformenv.c_str());
putenv((char*)deviceenv.c_str()); putenv((char*)deviceenv.c_str());
executable_path = inputs.executable_path; executable_path = inputs.executable_path;
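The hunk above only renames the environment variables that the launcher exports; when invoking the OpenCL backend by hand, the same selection can be made directly (the values below are examples, and the reworked `ggml_cl_init` further down also accepts platform/device name substrings):
```bash
# select OpenCL platform 0 and device 1 for ggml-opencl (example indices)
export GGML_OPENCL_PLATFORM=0
export GGML_OPENCL_DEVICE=1
```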


@ -83,9 +83,19 @@ typedef struct {
} block_q8_0; } block_q8_0;
static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding"); static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");
#define CUDA_MUL_BLOCK_SIZE 256
#define CUDA_DEQUANTIZE_BLOCK_SIZE 256 #define CUDA_DEQUANTIZE_BLOCK_SIZE 256
#define CUDA_DMMV_BLOCK_SIZE 32 // dmmv = dequantize_mul_mat_vec #define CUDA_DMMV_BLOCK_SIZE 32 // dmmv = dequantize_mul_mat_vec
static __global__ void mul_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
const int i = blockDim.x*blockIdx.x + threadIdx.x;
if (i >= kx) {
return;
}
dst[i] = x[i] * y[i%ky];
}
static __device__ void dequantize_q4_0(const void * vx, const int ib, const int iqs, float & v0, float & v1){ static __device__ void dequantize_q4_0(const void * vx, const int ib, const int iqs, float & v0, float & v1){
const block_q4_0 * x = (const block_q4_0 *) vx; const block_q4_0 * x = (const block_q4_0 *) vx;
@ -228,6 +238,11 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y,
} }
} }
static void mul_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
const int num_blocks = (kx + CUDA_MUL_BLOCK_SIZE - 1) / CUDA_MUL_BLOCK_SIZE;
mul_f32<<<num_blocks, CUDA_MUL_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
}
static void dequantize_row_q4_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { static void dequantize_row_q4_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE; const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
dequantize_block<QK4_0, QR4_0, dequantize_q4_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k); dequantize_block<QK4_0, QR4_0, dequantize_q4_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
@ -467,6 +482,67 @@ static cudaError_t ggml_cuda_h2d_tensor_2d(void * dst, const struct ggml_tensor
} }
} }
static void ggml_cuda_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_ASSERT(src1->backend == GGML_BACKEND_CUDA);
const int64_t ne00 = src0->ne[0];
const int64_t ne01 = src0->ne[1];
const int64_t ne02 = src0->ne[2];
const int64_t ne03 = src0->ne[2];
const int64_t ne0 = ne00 * ne01 * ne02 * ne03;
const int64_t ne10 = src1->ne[0];
const int64_t ne11 = src1->ne[1];
const int64_t ne12 = src1->ne[2];
const int64_t ne13 = src1->ne[3];
const int nb2 = dst->nb[2];
const int nb3 = dst->nb[3];
size_t x_size, d_size;
float * d_X = (float *) ggml_cuda_pool_malloc(ne0 * sizeof(float), &x_size); // src0
float * d_Y = (float *) src1->data; // src1 is already on device, broadcasted.
float * d_D = (float *) ggml_cuda_pool_malloc(ne0 * sizeof(float), &d_size); // dst
for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
const int i0 = i03*ne02 + i02;
float * c_X2 = d_X + i0*ne01*ne00;
float * c_D2 = d_D + i0*ne01*ne00;
cudaStream_t cudaStream = g_cudaStreams[i0 % GGML_CUDA_MAX_STREAMS];
cudaStream_t cudaStream2 = g_cudaStreams2[i0 % GGML_CUDA_MAX_STREAMS];
cudaEvent_t cudaEvent = g_cudaEvents[i0 % GGML_CUDA_MAX_EVENTS];
// copy src0 to device
CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_X2, src0, i03, i02, cudaStream2));
CUDA_CHECK(cudaEventRecord(cudaEvent, cudaStream2));
// wait for data
CUDA_CHECK(cudaStreamWaitEvent(cudaStream, cudaEvent, 0));
for (int64_t i01 = 0; i01 < ne01; i01++) {
const int64_t i13 = i03%ne13;
const int64_t i12 = i02%ne12;
const int64_t i11 = i01%ne11;
const int i1 = i13*ne12*ne11 + i12*ne11 + i11;
float * c_X1 = c_X2 + i01*ne00;
float * c_Y = d_Y + i1*ne10;
float * c_D1 = c_D2 + i01*ne00;
// compute
mul_f32_cuda(c_X1, c_Y, c_D1, ne00, ne10, cudaStream);
CUDA_CHECK(cudaGetLastError());
}
// copy dst to host
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
CUDA_CHECK(cudaMemcpyAsync(d, c_D2, sizeof(float)*ne00*ne01, cudaMemcpyDeviceToHost, cudaStream));
}
}
CUDA_CHECK(cudaDeviceSynchronize());
ggml_cuda_pool_free(d_X, x_size);
ggml_cuda_pool_free(d_D, d_size);
}
static void ggml_cuda_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { static void ggml_cuda_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
const int64_t ne00 = src0->ne[0]; const int64_t ne00 = src0->ne[0];
const int64_t ne01 = src0->ne[1]; const int64_t ne01 = src0->ne[1];
@ -724,6 +800,11 @@ static void ggml_cuda_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor
ggml_cuda_pool_free(d_Q, q_size); ggml_cuda_pool_free(d_Q, q_size);
} }
void ggml_cuda_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
ggml_cuda_mul_f32(src0, src1, dst);
}
bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
const int64_t ne10 = src1->ne[0]; const int64_t ne10 = src1->ne[0];
@ -797,14 +878,48 @@ void ggml_cuda_transform_tensor(ggml_tensor * tensor) {
const size_t q_sz = ggml_type_size(type) * ne0 * ne1 * ne2 * ne3 / ggml_blck_size(type); const size_t q_sz = ggml_type_size(type) * ne0 * ne1 * ne2 * ne3 / ggml_blck_size(type);
size_t q_size; size_t q_size;
char * d_Q = (char *) ggml_cuda_pool_malloc(q_sz, &q_size); char * dst = (char *) ggml_cuda_pool_malloc(q_sz, &q_size);
cudaStream_t cudaStream2 = g_cudaStreams2[0]; cudaStream_t cudaStream2 = g_cudaStreams2[0];
// copy tensor to device // copy tensor to device
CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_Q, tensor, 0, 0, cudaStream2)); for (int64_t i3 = 0; i3 < ne3; i3++) {
CUDA_CHECK(cudaDeviceSynchronize()); for (int64_t i2 = 0; i2 < ne2; i2++) {
int i = i3*ne2 + i2;
CUDA_CHECK(ggml_cuda_h2d_tensor_2d(dst + i*ne0*ne1, tensor, i3, i2, cudaStream2));
}
}
tensor->data = d_Q; tensor->data = dst;
tensor->backend = GGML_BACKEND_CUDA; tensor->backend = GGML_BACKEND_CUDA;
} }
void ggml_cuda_load_data(const char * fname, struct ggml_tensor * tensor, const size_t offset) {
FILE * fp = fopen(fname, "rb");
const size_t size = ggml_nbytes(tensor);
void * buf;
CUDA_CHECK(cudaMalloc(&buf, size));
void * buf_host = malloc(size);
#ifdef _WIN32
int ret = _fseeki64(fp, (__int64) offset, SEEK_SET);
#else
int ret = fseek(fp, (long) offset, SEEK_SET);
#endif
GGML_ASSERT(ret == 0); // same
size_t ret2 = fread(buf_host, size, 1, fp);
if (ret2 != 1) {
fprintf(stderr, "unexpectedly reached end of file");
exit(1);
}
cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice);
cudaDeviceSynchronize();
tensor->data = buf;
free(buf_host);
fclose(fp);
}


@ -6,6 +6,7 @@ extern "C" {
void ggml_init_cublas(void); void ggml_init_cublas(void);
void ggml_cuda_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
size_t ggml_cuda_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); size_t ggml_cuda_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
void ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize); void ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
@ -15,6 +16,7 @@ void * ggml_cuda_host_malloc(size_t size);
void ggml_cuda_host_free(void * ptr); void ggml_cuda_host_free(void * ptr);
void ggml_cuda_transform_tensor(struct ggml_tensor * tensor); void ggml_cuda_transform_tensor(struct ggml_tensor * tensor);
void ggml_cuda_load_data(const char * fname, struct ggml_tensor * tensors, size_t offset);
#ifdef __cplusplus #ifdef __cplusplus
} }


@ -24,38 +24,38 @@ typedef uchar uint8_t;
typedef int int32_t; typedef int int32_t;
typedef uint uint32_t; typedef uint uint32_t;
struct block_q4_0 struct __attribute__ ((packed)) block_q4_0
{ {
half d; half d;
uint8_t qs[16]; uint8_t qs[QK4_0 / 2];
}; };
struct block_q4_1 struct __attribute__ ((packed)) block_q4_1
{ {
half d; half d;
half m; half m;
uint8_t qs[16]; uint8_t qs[QK4_1 / 2];
}; };
struct __attribute__ ((packed)) block_q5_0 struct __attribute__ ((packed)) block_q5_0
{ {
half d; half d;
uint32_t qh; uint32_t qh;
uint8_t qs[16]; uint8_t qs[QK5_0 / 2];
}; };
struct block_q5_1 struct __attribute__ ((packed)) block_q5_1
{ {
half d; half d;
half m; half m;
uint32_t qh; uint32_t qh;
uint8_t qs[16]; uint8_t qs[QK5_1 / 2];
}; };
struct block_q8_0 struct __attribute__ ((packed)) block_q8_0
{ {
half d; half d;
uint8_t qs[32]; int8_t qs[QK8_0];
}; };
@ -89,7 +89,7 @@ void dequantize_q4_1(__global const struct block_q4_1* x, const int ib, const in
*v1 = vi1*d + m; *v1 = vi1*d + m;
} }
void dequantize_q5_0(__global const struct block_q5_0* x, const int ib, const int iqs, float* v0, float* v1) { void dequantize_q5_0(__global const struct block_q5_0* x, const int ib, const int iqs, float* v0, float* v1) {
const float d = vload_half(0, (__global half*) &x[ib].d); const float d = vload_half(0, &x[ib].d);
uint32_t qh = x[ib].qh; uint32_t qh = x[ib].qh;
@ -103,8 +103,8 @@ void dequantize_q5_0(__global const struct block_q5_0* x, const int ib, const in
*v1 = x1*d; *v1 = x1*d;
} }
void dequantize_q5_1(__global const struct block_q5_1* x, const int ib, const int iqs, float* v0, float* v1) { void dequantize_q5_1(__global const struct block_q5_1* x, const int ib, const int iqs, float* v0, float* v1) {
const float d = vload_half(0, (__global half*) &x[ib].d); const float d = vload_half(0, &x[ib].d);
const float m = vload_half(0, (__global half*) &x[ib].m); const float m = vload_half(0, &x[ib].m);
uint32_t qh = x[ib].qh; uint32_t qh = x[ib].qh;
@ -126,13 +126,13 @@ void dequantize_q8_0(__global const struct block_q8_0* x, const int ib, const in
*v0 = vi0*d; *v0 = vi0*d;
*v1 = vi1*d; *v1 = vi1*d;
} }
static void convert_f16(__global half* x, const int ib, const int iqs, float* v0, float* v1){ void convert_f16(__global half* x, const int ib, const int iqs, float* v0, float* v1){
*v0 = vload_half(0, &x[ib + 0]); *v0 = vload_half(0, &x[ib + 0]);
*v1 = vload_half(0, &x[ib + 1]); *v1 = vload_half(0, &x[ib + 1]);
} }
); );
static std::string dequant_template = MULTILINE_QUOTE( std::string dequant_template = MULTILINE_QUOTE(
__kernel void KERNEL_NAME(__global X_TYPE* x, __global float* y) { __kernel void KERNEL_NAME(__global X_TYPE* x, __global float* y) {
const int i = get_group_id(0)*get_local_size(0) + get_local_id(0)*2; const int i = get_group_id(0)*get_local_size(0) + get_local_id(0)*2;
@ -156,7 +156,7 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __global float* y) {
} }
); );
static std::string dequant_mul_mat_vec_template = MULTILINE_QUOTE( std::string dequant_mul_mat_vec_template = MULTILINE_QUOTE(
__kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) { __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) {
const int block_size = get_local_size(0); const int block_size = get_local_size(0);
const int row = get_global_id(0) / block_size; const int row = get_global_id(0) / block_size;
@ -198,29 +198,51 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float
} }
); );
static std::array<std::string, 5> dequant_str_keys = { #define CL_CHECK(err) \
do { \
cl_int err_ = (err); \
if (err_ != CL_SUCCESS) { \
fprintf(stderr, "ggml_opencl: %s error %d at %s:%d\n", \
#err, err_, __FILE__, __LINE__); \
fprintf(stderr, "You may be out of VRAM. Please check if you have enough.\n");\
exit(1); \
} \
} while (0)
#define CLBLAST_CHECK(err) \
do { \
CLBlastStatusCode err_ = (err); \
if (err_ != CLBlastSuccess) { \
fprintf(stderr, "ggml_opencl: %s error %d at %s:%d\n", \
#err, err_, __FILE__, __LINE__); \
fprintf(stderr, "You may be out of VRAM. Please check if you have enough.\n");\
exit(1); \
} \
} while (0)
std::array<std::string, 5> dequant_str_keys = {
"KERNEL_NAME", "X_TYPE", "QUANT_K", "QUANT_R", "DEQUANT_FUNC" "KERNEL_NAME", "X_TYPE", "QUANT_K", "QUANT_R", "DEQUANT_FUNC"
}; };
static std::array<std::string, 30> dequant_str_values = { std::array<std::string, 30> dequant_str_values = {
"dequantize_row_q4_0", "struct block_q4_0", "32", "2", "dequantize_q4_0", "dequantize_row_q4_0", "struct block_q4_0", "QK4_0", "QR4_0", "dequantize_q4_0",
"dequantize_row_q4_1", "struct block_q4_1", "32", "2", "dequantize_q4_1", "dequantize_row_q4_1", "struct block_q4_1", "QK4_1", "QR4_1", "dequantize_q4_1",
"dequantize_row_q5_0", "struct block_q5_0", "32", "2", "dequantize_q5_0", "dequantize_row_q5_0", "struct block_q5_0", "QK5_0", "QR5_0", "dequantize_q5_0",
"dequantize_row_q5_1", "struct block_q5_1", "32", "2", "dequantize_q5_1", "dequantize_row_q5_1", "struct block_q5_1", "QK5_1", "QR5_1", "dequantize_q5_1",
"dequantize_row_q8_0", "struct block_q8_0", "32", "1", "dequantize_q8_0", "dequantize_row_q8_0", "struct block_q8_0", "QK8_0", "QR8_0", "dequantize_q8_0",
"convert_row_f16", "half", "1", "1", "convert_f16" "convert_row_f16", "half", "1", "1", "convert_f16"
}; };
static std::array<std::string, 30> dequant_mul_mat_vec_str_values = { std::array<std::string, 30> dequant_mul_mat_vec_str_values = {
"dequantize_mul_mat_vec_q4_0", "struct block_q4_0", "32", "2", "dequantize_q4_0", "dequantize_mul_mat_vec_q4_0", "struct block_q4_0", "QK4_0", "QR4_0", "dequantize_q4_0",
"dequantize_mul_mat_vec_q4_1", "struct block_q4_1", "32", "2", "dequantize_q4_1", "dequantize_mul_mat_vec_q4_1", "struct block_q4_1", "QK4_1", "QR4_1", "dequantize_q4_1",
"dequantize_mul_mat_vec_q5_0", "struct block_q5_0", "32", "2", "dequantize_q5_0", "dequantize_mul_mat_vec_q5_0", "struct block_q5_0", "QK5_0", "QR5_0", "dequantize_q5_0",
"dequantize_mul_mat_vec_q5_1", "struct block_q5_1", "32", "2", "dequantize_q5_1", "dequantize_mul_mat_vec_q5_1", "struct block_q5_1", "QK5_1", "QR5_1", "dequantize_q5_1",
"dequantize_mul_mat_vec_q8_0", "struct block_q8_0", "32", "1", "dequantize_q8_0", "dequantize_mul_mat_vec_q8_0", "struct block_q8_0", "QK8_0", "QR8_0", "dequantize_q8_0",
"convert_mul_mat_vec_f16", "half", "1", "1", "convert_f16" "convert_mul_mat_vec_f16", "half", "1", "1", "convert_f16"
}; };
static std::string& sreplace(std::string& s, const std::string& from, const std::string& to) { std::string& replace(std::string& s, const std::string& from, const std::string& to) {
size_t pos = 0; size_t pos = 0;
while ((pos = s.find(from, pos)) != std::string::npos) { while ((pos = s.find(from, pos)) != std::string::npos) {
s.replace(pos, from.length(), to); s.replace(pos, from.length(), to);
@ -229,15 +251,15 @@ static std::string& sreplace(std::string& s, const std::string& from, const std:
return s; return s;
} }
static std::string generate_kernels() { std::string generate_kernels() {
std::stringstream src; std::stringstream src;
src << program_source << '\n'; src << program_source << '\n';
for (size_t i = 0; i < dequant_str_values.size(); i += dequant_str_keys.size()) { for (size_t i = 0; i < dequant_str_values.size(); i += dequant_str_keys.size()) {
std::string dequant_kernel = dequant_template; std::string dequant_kernel = dequant_template;
std::string dmmv_kernel = dequant_mul_mat_vec_template; std::string dmmv_kernel = dequant_mul_mat_vec_template;
for (size_t j = 0; j < dequant_str_keys.size(); j++) { for (size_t j = 0; j < dequant_str_keys.size(); j++) {
sreplace(dequant_kernel, dequant_str_keys[j], dequant_str_values[i + j]); replace(dequant_kernel, dequant_str_keys[j], dequant_str_values[i + j]);
sreplace(dmmv_kernel, dequant_str_keys[j], dequant_mul_mat_vec_str_values[i + j]); replace(dmmv_kernel, dequant_str_keys[j], dequant_mul_mat_vec_str_values[i + j]);
} }
src << dequant_kernel << '\n'; src << dequant_kernel << '\n';
src << dmmv_kernel << '\n'; src << dmmv_kernel << '\n';
@ -245,16 +267,6 @@ static std::string generate_kernels() {
return src.str(); return src.str();
} }
#define CL_CHECK(err, name) \
do { \
cl_int err_ = (err); \
if (err_ != CL_SUCCESS) { \
fprintf(stderr, "OpenCL %s error %d at %s:%d\n", name, err_, __FILE__, __LINE__); \
fprintf(stderr, "You may be out of VRAM. Please check if you have enough.\n"); \
exit(1); \
} \
} while (0)
static cl_platform_id platform; static cl_platform_id platform;
static cl_device_id device; static cl_device_id device;
static cl_context context; static cl_context context;
@ -263,12 +275,13 @@ static cl_program program;
static cl_kernel convert_row_f16_cl; static cl_kernel convert_row_f16_cl;
static cl_kernel dequantize_row_q4_0_cl, dequantize_row_q4_1_cl, dequantize_row_q5_0_cl, dequantize_row_q5_1_cl, dequantize_row_q8_0_cl; static cl_kernel dequantize_row_q4_0_cl, dequantize_row_q4_1_cl, dequantize_row_q5_0_cl, dequantize_row_q5_1_cl, dequantize_row_q8_0_cl;
static cl_kernel dequantize_mul_mat_vec_q4_0_cl, dequantize_mul_mat_vec_q4_1_cl, dequantize_mul_mat_vec_q5_0_cl, dequantize_mul_mat_vec_q5_1_cl, dequantize_mul_mat_vec_q8_0_cl, convert_mul_mat_vec_f16_cl; static cl_kernel dequantize_mul_mat_vec_q4_0_cl, dequantize_mul_mat_vec_q4_1_cl, dequantize_mul_mat_vec_q5_0_cl, dequantize_mul_mat_vec_q5_1_cl, dequantize_mul_mat_vec_q8_0_cl, convert_mul_mat_vec_f16_cl;
static bool fp16_support = false; static bool fp16_support;
static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer) { static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer) {
cl_program p; cl_program p;
char *program_log; char *program_log;
size_t program_size, log_size; size_t program_size;
size_t log_size;
int err; int err;
program_size = strlen(program_buffer); program_size = strlen(program_buffer);
@ -279,7 +292,8 @@ static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, co
exit(1); exit(1);
} }
const char* compile_opts = "-cl-mad-enable -cl-unsafe-math-optimizations -cl-finite-math-only -cl-fast-relaxed-math"; const char* compile_opts = "-cl-mad-enable -cl-unsafe-math-optimizations -cl-finite-math-only -cl-fast-relaxed-math "
"-DQK4_0=32 -DQR4_0=2 -DQK4_1=32 -DQR4_1=2 -DQK5_0=32 -DQR5_0=2 -DQK5_1=32 -DQR5_1=2 -DQK8_0=32 -DQR8_0=1";
err = clBuildProgram(p, 0, NULL, compile_opts, NULL, NULL); err = clBuildProgram(p, 0, NULL, compile_opts, NULL, NULL);
if(err < 0) { if(err < 0) {
@ -297,27 +311,169 @@ static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, co
} }
void ggml_cl_init(void) { void ggml_cl_init(void) {
cl_int err = 0; cl_int err;
char * GGML_CLBLAST_PLATFORM = getenv("GGML_CLBLAST_PLATFORM");
char * GGML_CLBLAST_DEVICE = getenv("GGML_CLBLAST_DEVICE"); struct cl_device;
int plat_num = (GGML_CLBLAST_PLATFORM == NULL ? 0 : atoi(GGML_CLBLAST_PLATFORM)); struct cl_platform {
int dev_num = (GGML_CLBLAST_DEVICE == NULL ? 0 : atoi(GGML_CLBLAST_DEVICE)); cl_platform_id id;
printf("\nInitializing CLBlast (First Run)..."); unsigned number;
printf("\nAttempting to use: Platform=%d, Device=%d (If invalid, program will crash)\n",plat_num,dev_num); char name[128];
cl_uint num_platforms; char vendor[128];
clGetPlatformIDs(0, NULL, &num_platforms); struct cl_device * devices;
cl_platform_id* platforms = (cl_platform_id*)malloc(num_platforms*sizeof(cl_platform_id)); unsigned n_devices;
clGetPlatformIDs(num_platforms, platforms, NULL); struct cl_device * default_device;
platform = platforms[plat_num]; };
char platform_buffer[1024];
clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_buffer), &platform_buffer, NULL); struct cl_device {
cl_uint num_devices; struct cl_platform * platform;
clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices); cl_device_id id;
cl_device_id* devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id)); unsigned number;
clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL); cl_device_type type;
device = devices[dev_num]; char name[128];
char device_buffer[1024]; };
clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_buffer), &device_buffer, NULL);
enum { NPLAT = 16, NDEV = 16 };
struct cl_platform platforms[NPLAT];
unsigned n_platforms = 0;
struct cl_device devices[NDEV];
unsigned n_devices = 0;
struct cl_device * default_device = NULL;
platform = NULL;
device = NULL;
cl_platform_id platform_ids[NPLAT];
CL_CHECK(clGetPlatformIDs(NPLAT, platform_ids, &n_platforms));
for (unsigned i = 0; i < n_platforms; i++) {
struct cl_platform * p = &platforms[i];
p->number = i;
p->id = platform_ids[i];
CL_CHECK(clGetPlatformInfo(p->id, CL_PLATFORM_NAME, sizeof(p->name), &p->name, NULL));
CL_CHECK(clGetPlatformInfo(p->id, CL_PLATFORM_VENDOR, sizeof(p->vendor), &p->vendor, NULL));
cl_device_id device_ids[NDEV];
cl_int clGetDeviceIDsError = clGetDeviceIDs(p->id, CL_DEVICE_TYPE_ALL, NDEV, device_ids, &p->n_devices);
if (clGetDeviceIDsError == CL_DEVICE_NOT_FOUND) {
p->n_devices = 0;
} else {
CL_CHECK(clGetDeviceIDsError);
}
p->devices = p->n_devices > 0 ? &devices[n_devices] : NULL;
p->default_device = NULL;
for (unsigned j = 0; j < p->n_devices; j++) {
struct cl_device * d = &devices[n_devices];
d->number = n_devices++;
d->id = device_ids[j];
d->platform = p;
CL_CHECK(clGetDeviceInfo(d->id, CL_DEVICE_NAME, sizeof(d->name), &d->name, NULL));
CL_CHECK(clGetDeviceInfo(d->id, CL_DEVICE_TYPE, sizeof(d->type), &d->type, NULL));
printf("\nPlatform:%d Device:%d - %s with %s",i,j,p->name,d->name);
if (p->default_device == NULL && d->type == CL_DEVICE_TYPE_GPU) {
p->default_device = d;
}
}
if (default_device == NULL && p->default_device != NULL) {
default_device = p->default_device;
}
}
printf("\n\n");
if (n_devices == 0) {
fprintf(stderr, "ggml_opencl: could find any OpenCL devices.\n");
exit(1);
}
char * user_platform_string = getenv("GGML_OPENCL_PLATFORM");
char * user_device_string = getenv("GGML_OPENCL_DEVICE");
int user_platform_number = -1;
int user_device_number = -1;
unsigned n;
if (user_platform_string != NULL && sscanf(user_platform_string, "%u", &n) == 1 && n < n_platforms) {
user_platform_number = (int)n;
}
if (user_device_string != NULL && sscanf(user_device_string, "%u", &n) == 1 && n < n_devices) {
user_device_number = (int)n;
}
struct cl_device * selected_devices = devices;
unsigned n_selected_devices = n_devices;
if (user_platform_number == -1 && user_platform_string != NULL && user_platform_string[0] != 0) {
for (unsigned i = 0; i < n_platforms; i++) {
struct cl_platform * p = &platforms[i];
if (strstr(p->name, user_platform_string) != NULL ||
strstr(p->vendor, user_platform_string) != NULL) {
user_platform_number = (int)i;
break;
}
}
if (user_platform_number == -1) {
fprintf(stderr, "ggml_opencl: no platform matching '%s' was found.\n", user_platform_string);
exit(1);
}
}
if (user_platform_number != -1) {
struct cl_platform * p = &platforms[user_platform_number];
selected_devices = p->devices;
n_selected_devices = p->n_devices;
default_device = p->default_device;
if (n_selected_devices == 0) {
fprintf(stderr, "ggml_opencl: selected platform '%s' does not have any devices.\n", p->name);
exit(1);
}
}
if (user_device_number == -1 && user_device_string != NULL && user_device_string[0] != 0) {
for (unsigned i = 0; i < n_selected_devices; i++) {
struct cl_device * d = &selected_devices[i];
if (strstr(d->name, user_device_string) != NULL) {
user_device_number = d->number;
break;
}
}
if (user_device_number == -1) {
fprintf(stderr, "ggml_opencl: no device matching '%s' was found.\n", user_device_string);
exit(1);
}
}
if (user_device_number != -1) {
selected_devices = &devices[user_device_number];
n_selected_devices = 1;
default_device = &selected_devices[0];
}
GGML_ASSERT(n_selected_devices > 0);
if (default_device == NULL) {
default_device = &selected_devices[0];
}
//todo: fun hot fix
if (user_platform_number != -1 && user_device_number != -1)
{
cl_platform * myplat = &platforms[user_platform_number];
cl_device * mydev = &(myplat->devices[user_device_number]);
default_device = mydev;
default_device->platform = myplat;
}
fprintf(stderr, "ggml_opencl: selecting platform: '%s'\n", default_device->platform->name);
fprintf(stderr, "ggml_opencl: selecting device: '%s'\n", default_device->name);
if (default_device->type != CL_DEVICE_TYPE_GPU) {
fprintf(stderr, "ggml_opencl: warning, not a GPU: '%s'.\n", default_device->name);
}
platform = default_device->platform->id;
device = default_device->id;
size_t ext_str_size; size_t ext_str_size;
clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 0, NULL, &ext_str_size); clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 0, NULL, &ext_str_size);
char* ext_buffer = (char*) malloc(sizeof(char) * ext_str_size); char* ext_buffer = (char*) malloc(sizeof(char) * ext_str_size);
@ -330,50 +486,42 @@ void ggml_cl_init(void) {
} }
} }
free(ext_buffer); free(ext_buffer);
printf("Using Platform: %s Device: %s FP16: %d\n", platform_buffer, device_buffer, fp16_support); fprintf(stderr, "ggml_opencl: device FP16 support: %s\n", fp16_support ? "true" : "false");
fp16_support = false; fp16_support = false;
printf("CL FP16 temporarily disabled pending further optimization.\n"); printf("CL FP16 temporarily disabled pending further optimization.\n");
context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
CL_CHECK(err, "clCreateContext");
queue = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err);
CL_CHECK(err, "clCreateCommandQueue");
free(platforms); cl_context_properties properties[] = {
free(devices); (intptr_t)CL_CONTEXT_PLATFORM, (intptr_t)platform, 0
};
std::string kernel_src = generate_kernels(); CL_CHECK((context = clCreateContext(properties, 1, &device, NULL, NULL, &err), err));
CL_CHECK((queue = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err),
(err != CL_INVALID_PROPERTY && err != CL_INVALID_VALUE ? err :
(queue = clCreateCommandQueue(context, device, 0, &err), err)
)));
const std::string kernel_src = generate_kernels();
program = build_program_from_source(context, device, kernel_src.c_str()); program = build_program_from_source(context, device, kernel_src.c_str());
// FP16 to FP32 kernel // FP16 to FP32 kernel
convert_row_f16_cl = clCreateKernel(program, "convert_row_f16", &err); CL_CHECK((convert_row_f16_cl = clCreateKernel(program, "convert_row_f16", &err), err));
CL_CHECK(err, "clCreateKernel");
// Dequantize kernels // Dequantize kernels
dequantize_row_q4_0_cl = clCreateKernel(program, "dequantize_row_q4_0", &err); CL_CHECK((dequantize_row_q4_0_cl = clCreateKernel(program, "dequantize_row_q4_0", &err), err));
CL_CHECK(err, "clCreateKernel"); CL_CHECK((dequantize_row_q4_1_cl = clCreateKernel(program, "dequantize_row_q4_1", &err), err));
dequantize_row_q4_1_cl = clCreateKernel(program, "dequantize_row_q4_1", &err); CL_CHECK((dequantize_row_q5_0_cl = clCreateKernel(program, "dequantize_row_q5_0", &err), err));
CL_CHECK(err, "clCreateKernel"); CL_CHECK((dequantize_row_q5_1_cl = clCreateKernel(program, "dequantize_row_q5_1", &err), err));
dequantize_row_q5_0_cl = clCreateKernel(program, "dequantize_row_q5_0", &err); CL_CHECK((dequantize_row_q8_0_cl = clCreateKernel(program, "dequantize_row_q8_0", &err), err));
CL_CHECK(err, "clCreateKernel");
dequantize_row_q5_1_cl = clCreateKernel(program, "dequantize_row_q5_1", &err);
CL_CHECK(err, "clCreateKernel");
dequantize_row_q8_0_cl = clCreateKernel(program, "dequantize_row_q8_0", &err);
CL_CHECK(err, "clCreateKernel");
// dequant mul mat kernel // dequant mul mat kernel
dequantize_mul_mat_vec_q4_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q4_0", &err); CL_CHECK((dequantize_mul_mat_vec_q4_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q4_0", &err), err));
CL_CHECK(err, "clCreateKernel"); CL_CHECK((dequantize_mul_mat_vec_q4_1_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q4_1", &err), err));
dequantize_mul_mat_vec_q4_1_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q4_1", &err); CL_CHECK((dequantize_mul_mat_vec_q5_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q5_0", &err), err));
CL_CHECK(err, "clCreateKernel"); CL_CHECK((dequantize_mul_mat_vec_q5_1_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q5_1", &err), err));
dequantize_mul_mat_vec_q5_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q5_0", &err); CL_CHECK((dequantize_mul_mat_vec_q8_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q8_0", &err), err));
CL_CHECK(err, "clCreateKernel"); CL_CHECK((convert_mul_mat_vec_f16_cl = clCreateKernel(program, "convert_mul_mat_vec_f16", &err), err));
dequantize_mul_mat_vec_q5_1_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q5_1", &err);
CL_CHECK(err, "clCreateKernel");
dequantize_mul_mat_vec_q8_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q8_0", &err);
CL_CHECK(err, "clCreateKernel");
convert_mul_mat_vec_f16_cl = clCreateKernel(program, "convert_mul_mat_vec_f16", &err);
CL_CHECK(err, "clCreateKernel");
} }
static cl_kernel* ggml_get_to_fp32_cl(ggml_type type) { static cl_kernel* ggml_get_to_fp32_cl(ggml_type type) {
@ -452,8 +600,8 @@ static cl_mem ggml_cl_pool_malloc(size_t size, size_t * actual_size, cl_mem_flag
return mem; return mem;
} }
} }
cl_mem mem = clCreateBuffer(context, flags, size, NULL, &err); cl_mem mem;
CL_CHECK(err, "clCreateBuffer"); CL_CHECK((mem = clCreateBuffer(context, flags, size, NULL, &err), err));
*actual_size = size; *actual_size = size;
return mem; return mem;
} }
@ -528,21 +676,20 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
const int y_ne = ne11 * ne10; const int y_ne = ne11 * ne10;
const int d_ne = ne11 * ne01; const int d_ne = ne11 * ne01;
size_t x_size, y_size, d_size; size_t x_size;
size_t y_size;
size_t d_size;
cl_mem d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size, CL_MEM_READ_ONLY); cl_mem d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size, CL_MEM_READ_ONLY);
cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size, CL_MEM_READ_ONLY); cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size, CL_MEM_READ_ONLY);
cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size, CL_MEM_WRITE_ONLY); cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size, CL_MEM_WRITE_ONLY);
cl_int err;
for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) { for (int64_t i02 = 0; i02 < ne02; i02++) {
// copy data to device // copy data to device
err = ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL); CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
err |= ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL); CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));
CL_CHECK(err, "ggml_cl_h2d_tensor_2d");
CL_CHECK(clFinish(queue), "clFinish"); CL_CHECK(clFinish(queue));
// compute // compute
cl_event ev_sgemm; cl_event ev_sgemm;
@ -563,8 +710,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
// copy dst to host // copy dst to host
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
err = clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL); CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
CL_CHECK(err, "clEnqueueReadBuffer");
} }
} }
@ -598,21 +744,20 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
const int y_ne = ne11 * ne10; const int y_ne = ne11 * ne10;
const int d_ne = ne11 * ne01; const int d_ne = ne11 * ne01;
size_t x_size, y_size, d_size; size_t x_size;
size_t y_size;
size_t d_size;
cl_mem d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size, CL_MEM_READ_ONLY); cl_mem d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size, CL_MEM_READ_ONLY);
cl_mem d_Y = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * y_ne, &y_size, CL_MEM_READ_ONLY); cl_mem d_Y = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * y_ne, &y_size, CL_MEM_READ_ONLY);
cl_mem d_D = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * d_ne, &d_size, CL_MEM_WRITE_ONLY); cl_mem d_D = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * d_ne, &d_size, CL_MEM_WRITE_ONLY);
cl_int err;
bool src1_cont_rows = nb10 == sizeof(float); bool src1_cont_rows = nb10 == sizeof(float);
bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float); bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float);
for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) { for (int64_t i02 = 0; i02 < ne02; i02++) {
// copy src0 to device // copy src0 to device
err = ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL); CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
CL_CHECK(err, "ggml_cl_h2d_tensor_2d");
// convert src1 to fp16 // convert src1 to fp16
// TODO: use multiple threads // TODO: use multiple threads
@ -638,10 +783,9 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
} }
// copy src1 to device // copy src1 to device
err |= clEnqueueWriteBuffer(queue, d_Y, false, 0, sizeof(ggml_fp16_t) * y_ne, tmp, 0, NULL, NULL); CL_CHECK(clEnqueueWriteBuffer(queue, d_Y, false, 0, sizeof(ggml_fp16_t) * y_ne, tmp, 0, NULL, NULL));
CL_CHECK(err, "ggml_cl_h2d_tensor_2d");
CL_CHECK(clFinish(queue), "clFinish"); CL_CHECK(clFinish(queue));
// compute // compute
cl_event ev_sgemm; cl_event ev_sgemm;
@ -661,7 +805,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
} }
// copy dst to host, then convert to float // copy dst to host, then convert to float
err = clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL); CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
@ -695,7 +839,10 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
const int d_ne = ne11 * ne01; const int d_ne = ne11 * ne01;
const size_t q_sz = ggml_type_size(type) * x_ne / ggml_blck_size(type); const size_t q_sz = ggml_type_size(type) * x_ne / ggml_blck_size(type);
size_t x_size, y_size, d_size, q_size; size_t x_size;
size_t y_size;
size_t d_size;
size_t q_size;
cl_mem d_X; cl_mem d_X;
if (!mul_mat_vec) { if (!mul_mat_vec) {
d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size, CL_MEM_READ_WRITE); d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size, CL_MEM_READ_WRITE);
@ -717,7 +864,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
// copy src0 to device if necessary // copy src0 to device if necessary
if (src0->backend == GGML_BACKEND_CPU) { if (src0->backend == GGML_BACKEND_CPU) {
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, NULL), "ggml_cl_h2d_tensor_2d"); CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, NULL));
} else if (src0->backend == GGML_BACKEND_CL) { } else if (src0->backend == GGML_BACKEND_CL) {
d_Q = *(cl_mem*) src0->data; d_Q = *(cl_mem*) src0->data;
} else { } else {
@ -725,32 +872,32 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
} }
if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
// copy src1 to device // copy src1 to device
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL), "ggml_cl_h2d_tensor_2d"); CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));
// compute // compute
const size_t global = ne01 * CL_DMMV_BLOCK_SIZE; const size_t global = ne01 * CL_DMMV_BLOCK_SIZE;
const size_t local = CL_DMMV_BLOCK_SIZE; const size_t local = CL_DMMV_BLOCK_SIZE;
const cl_int ncols = ne00; const cl_int ncols = ne00;
CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q), "clSetKernelArg"); CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
CL_CHECK(clSetKernelArg(*dmmv, 1, sizeof(float) * local, NULL), "clSetKernelArg"); CL_CHECK(clSetKernelArg(*dmmv, 1, sizeof(float) * local, NULL));
CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y), "clSetKernelArg"); CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y));
CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D), "clSetKernelArg"); CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D));
CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols), "clSetKernelArg"); CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols));
CL_CHECK(clFinish(queue), "clFinish"); CL_CHECK(clFinish(queue));
CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, NULL, &global, &local, 0, NULL, &ev_sgemm), "clEnqueueNDRangeKernel"); CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, NULL, &global, &local, 0, NULL, &ev_sgemm));
} else { // general dequantization kernel + CLBlast matrix matrix multiplication } else { // general dequantization kernel + CLBlast matrix matrix multiplication
// convert src0 to fp32 on device // convert src0 to fp32 on device
const size_t global = x_ne; const size_t global = x_ne;
CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q), "clSetKernelArg"); CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X), "clSetKernelArg"); CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
CL_CHECK(clFinish(queue), "clFinish"); CL_CHECK(clFinish(queue));
CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, NULL, &global, NULL, 0, NULL, NULL), "clEnqueueNDRangeKernel"); CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, NULL, &global, NULL, 0, NULL, NULL));
// copy src1 to device // copy src1 to device
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL), "ggml_cl_h2d_tensor_2d"); CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));
// wait for conversion // wait for conversion
CL_CHECK(clFinish(queue), "clFinish"); CL_CHECK(clFinish(queue));
// compute // compute
clblast::StatusCode status = (clblast::StatusCode)CLBlastSgemm((CLBlastLayout)clblast::Layout::kColMajor, clblast::StatusCode status = (clblast::StatusCode)CLBlastSgemm((CLBlastLayout)clblast::Layout::kColMajor,
@ -771,7 +918,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
// copy dst to host // copy dst to host
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL), "clEnqueueReadBuffer"); CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
clReleaseEvent(ev_sgemm); clReleaseEvent(ev_sgemm);
} }
} }
@ -870,11 +1017,11 @@ void ggml_cl_transform_tensor(ggml_tensor * tensor) {
for (int64_t i3 = 0; i3 < ne3; i3++) { for (int64_t i3 = 0; i3 < ne3; i3++) {
for (int64_t i2 = 0; i2 < ne2; i2++) { for (int64_t i2 = 0; i2 < ne2; i2++) {
int i = i3*ne2 + i2; int i = i3*ne2 + i2;
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, *dst, i*ne0*ne1, tensor, i3, i2, NULL), "ggml_cl_h2d_tensor_2d"); CL_CHECK(ggml_cl_h2d_tensor_2d(queue, *dst, i*ne0*ne1, tensor, i3, i2, NULL));
} }
} }
CL_CHECK(clFinish(queue), "clFinish"); CL_CHECK(clFinish(queue));
tensor->data = dst; tensor->data = dst;
tensor->backend = GGML_BACKEND_CL; tensor->backend = GGML_BACKEND_CL;

ggml.c (338 changed lines)

@ -740,19 +740,19 @@ inline static float vaddvq_f32(float32x4_t v) {
return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3); return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
} }
float vminvq_f32(float32x4_t v) { inline static float vminvq_f32(float32x4_t v) {
return return
MIN(MIN(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)), MIN(MIN(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
MIN(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3))); MIN(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
} }
float vmaxvq_f32(float32x4_t v) { inline static float vmaxvq_f32(float32x4_t v) {
return return
MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)), MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3))); MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
} }
int32x4_t vcvtnq_s32_f32(float32x4_t v) { inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
int32x4_t res; int32x4_t res;
res[0] = roundf(vgetq_lane_f32(v, 0)); res[0] = roundf(vgetq_lane_f32(v, 0));
@ -766,7 +766,6 @@ int32x4_t vcvtnq_s32_f32(float32x4_t v) {
#endif #endif
#endif #endif
#define QK4_0 32 #define QK4_0 32
typedef struct { typedef struct {
ggml_fp16_t d; // delta ggml_fp16_t d; // delta
@ -1056,6 +1055,39 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int
y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3); y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3);
} }
} }
#elif defined(__wasm_simd128__)
for (int i = 0; i < nb; i++) {
v128_t srcv [8];
v128_t asrcv[8];
v128_t amaxv[8];
for (int j = 0; j < 8; j++) srcv[j] = wasm_v128_load(x + i*32 + 4*j);
for (int j = 0; j < 8; j++) asrcv[j] = wasm_f32x4_abs(srcv[j]);
for (int j = 0; j < 4; j++) amaxv[2*j] = wasm_f32x4_max(asrcv[2*j], asrcv[2*j+1]);
for (int j = 0; j < 2; j++) amaxv[4*j] = wasm_f32x4_max(amaxv[4*j], amaxv[4*j+2]);
for (int j = 0; j < 1; j++) amaxv[8*j] = wasm_f32x4_max(amaxv[8*j], amaxv[8*j+4]);
const float amax = MAX(MAX(wasm_f32x4_extract_lane(amaxv[0], 0),
wasm_f32x4_extract_lane(amaxv[0], 1)),
MAX(wasm_f32x4_extract_lane(amaxv[0], 2),
wasm_f32x4_extract_lane(amaxv[0], 3)));
const float d = amax / ((1 << 7) - 1);
const float id = d ? 1.0f/d : 0.0f;
y[i].d = GGML_FP32_TO_FP16(d);
for (int j = 0; j < 8; j++) {
const v128_t v = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id));
const v128_t vi = wasm_i32x4_trunc_sat_f32x4(v);
y[i].qs[4*j + 0] = wasm_i32x4_extract_lane(vi, 0);
y[i].qs[4*j + 1] = wasm_i32x4_extract_lane(vi, 1);
y[i].qs[4*j + 2] = wasm_i32x4_extract_lane(vi, 2);
y[i].qs[4*j + 3] = wasm_i32x4_extract_lane(vi, 3);
}
}
#elif defined(__AVX2__) || defined(__AVX__) #elif defined(__AVX2__) || defined(__AVX__)
for (int i = 0; i < nb; i++) { for (int i = 0; i < nb; i++) {
// Load elements into 4 AVX vectors // Load elements into 4 AVX vectors
@ -1224,6 +1256,48 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int
y[i].s = d * vaddvq_s32(accv); y[i].s = d * vaddvq_s32(accv);
} }
#elif defined(__wasm_simd128__)
for (int i = 0; i < nb; i++) {
v128_t srcv [8];
v128_t asrcv[8];
v128_t amaxv[8];
for (int j = 0; j < 8; j++) srcv[j] = wasm_v128_load(x + i*32 + 4*j);
for (int j = 0; j < 8; j++) asrcv[j] = wasm_f32x4_abs(srcv[j]);
for (int j = 0; j < 4; j++) amaxv[2*j] = wasm_f32x4_max(asrcv[2*j], asrcv[2*j+1]);
for (int j = 0; j < 2; j++) amaxv[4*j] = wasm_f32x4_max(amaxv[4*j], amaxv[4*j+2]);
for (int j = 0; j < 1; j++) amaxv[8*j] = wasm_f32x4_max(amaxv[8*j], amaxv[8*j+4]);
const float amax = MAX(MAX(wasm_f32x4_extract_lane(amaxv[0], 0),
wasm_f32x4_extract_lane(amaxv[0], 1)),
MAX(wasm_f32x4_extract_lane(amaxv[0], 2),
wasm_f32x4_extract_lane(amaxv[0], 3)));
const float d = amax / ((1 << 7) - 1);
const float id = d ? 1.0f/d : 0.0f;
y[i].d = d;
v128_t accv = wasm_i32x4_splat(0);
for (int j = 0; j < 8; j++) {
const v128_t v = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id));
const v128_t vi = wasm_i32x4_trunc_sat_f32x4(v);
y[i].qs[4*j + 0] = wasm_i32x4_extract_lane(vi, 0);
y[i].qs[4*j + 1] = wasm_i32x4_extract_lane(vi, 1);
y[i].qs[4*j + 2] = wasm_i32x4_extract_lane(vi, 2);
y[i].qs[4*j + 3] = wasm_i32x4_extract_lane(vi, 3);
accv = wasm_i32x4_add(accv, vi);
}
y[i].s = d * (wasm_i32x4_extract_lane(accv, 0) +
wasm_i32x4_extract_lane(accv, 1) +
wasm_i32x4_extract_lane(accv, 2) +
wasm_i32x4_extract_lane(accv, 3));
}
#elif defined(__AVX2__) || defined(__AVX__) #elif defined(__AVX2__) || defined(__AVX__)
for (int i = 0; i < nb; i++) { for (int i = 0; i < nb; i++) {
// Load elements into 4 AVX vectors // Load elements into 4 AVX vectors
@ -2598,7 +2672,6 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
const block_q8_0 * restrict y0 = &y[i]; const block_q8_0 * restrict y0 = &y[i];
const v128_t m4b = wasm_i8x16_splat(0x0F); const v128_t m4b = wasm_i8x16_splat(0x0F);
const v128_t s16b = wasm_i8x16_splat(0x10);
// extract the 5th bit // extract the 5th bit
memcpy(&qh, x0->qh, sizeof(qh)); memcpy(&qh, x0->qh, sizeof(qh));
@ -2636,15 +2709,14 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h); const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h); const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);
const float x0d = GGML_FP16_TO_FP32(x0->d);
// dot product // dot product
sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4( sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(
wasm_i32x4_add( wasm_i32x4_add(
wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll), wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
wasm_i32x4_dot_i16x8(v0lfh, v1lh)), wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl), wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), wasm_f32x4_splat(x0d*y0->d))); wasm_i32x4_dot_i16x8(v0hfh, v1hh)))),
wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d))));
} }
*s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) + *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
@ -2868,8 +2940,6 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
const v128_t v0l = wasm_v128_and (v0, m4b); const v128_t v0l = wasm_v128_and (v0, m4b);
const v128_t v0h = wasm_u8x16_shr(v0, 4); const v128_t v0h = wasm_u8x16_shr(v0, 4);
static bool x = true;
// add high bit // add high bit
const v128_t v0lf = wasm_v128_or(v0l, qhl); const v128_t v0lf = wasm_v128_or(v0l, qhl);
const v128_t v0hf = wasm_v128_or(v0h, qhh); const v128_t v0hf = wasm_v128_or(v0h, qhh);
@ -2896,7 +2966,7 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
wasm_i32x4_dot_i16x8(v0lfh, v1lh)), wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl), wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), wasm_i32x4_dot_i16x8(v0hfh, v1hh)))),
wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * y0->d)); wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * y0->d)));
} }
*s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) + *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
@ -3472,6 +3542,7 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
"ROPE", "ROPE",
"ROPE_BACK", "ROPE_BACK",
"ALIBI", "ALIBI",
"CLAMP",
"CONV_1D_1S", "CONV_1D_1S",
"CONV_1D_2S", "CONV_1D_2S",
@ -3482,7 +3553,8 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
"MAP_BINARY", "MAP_BINARY",
}; };
static_assert(GGML_OP_COUNT == 50, "GGML_OP_COUNT != 50"); static_assert(GGML_OP_COUNT == 51, "GGML_OP_COUNT != 51");
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"none", "none",
@ -3532,6 +3604,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"rope(x)", "rope(x)",
"rope_back(x)", "rope_back(x)",
"alibi(x)", "alibi(x)",
"clamp(x)",
"conv_1d_1s(x)", "conv_1d_1s(x)",
"conv_1d_2s(x)", "conv_1d_2s(x)",
@ -3542,7 +3615,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"f(x,y)", "f(x,y)",
}; };
static_assert(GGML_OP_COUNT == 50, "GGML_OP_COUNT != 50"); static_assert(GGML_OP_COUNT == 51, "GGML_OP_COUNT != 51");
static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN"); static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN"); static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
@ -3776,6 +3849,12 @@ static inline bool ggml_can_repeat(const struct ggml_tensor * t0, const struct g
(t1->ne[3]%t0->ne[3] == 0); (t1->ne[3]%t0->ne[3] == 0);
} }
static inline bool ggml_can_repeat_rows(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
return (t0->ne[0] == t1->ne[0]) && ggml_can_repeat(t0, t1);
}
static inline int ggml_up32(int n) { static inline int ggml_up32(int n) {
return (n + 31) & ~31; return (n + 31) & ~31;
} }
@ -4658,11 +4737,15 @@ struct ggml_tensor * ggml_mul_impl(
struct ggml_tensor * a, struct ggml_tensor * a,
struct ggml_tensor * b, struct ggml_tensor * b,
bool inplace) { bool inplace) {
GGML_ASSERT(ggml_are_same_shape(a, b)); // TODO: support less-strict constraint
// GGML_ASSERT(ggml_can_repeat(b, a));
GGML_ASSERT(ggml_can_repeat_rows(b, a));
bool is_node = false; bool is_node = false;
if (!inplace && (a->grad || b->grad)) { if (!inplace && (a->grad || b->grad)) {
// TODO: support backward pass for broadcasting
GGML_ASSERT(ggml_are_same_shape(a, b));
is_node = true; is_node = true;
} }
@ -6204,7 +6287,8 @@ struct ggml_tensor * ggml_alibi(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
int n_past, int n_past,
int n_head) { int n_head,
float bias_max) {
GGML_ASSERT(n_past >= 0); GGML_ASSERT(n_past >= 0);
bool is_node = false; bool is_node = false;
@ -6223,6 +6307,8 @@ struct ggml_tensor * ggml_alibi(
((int32_t *) b->data)[0] = n_past; ((int32_t *) b->data)[0] = n_past;
((int32_t *) b->data)[1] = n_head; ((int32_t *) b->data)[1] = n_head;
GGML_ASSERT(sizeof(float) == sizeof(int32_t));
(((float *) b->data)[2]) = bias_max;
ggml_scratch_load(ctx); ggml_scratch_load(ctx);
@ -6234,6 +6320,40 @@ struct ggml_tensor * ggml_alibi(
return result; return result;
} }
// ggml_clamp
struct ggml_tensor * ggml_clamp(
struct ggml_context * ctx,
struct ggml_tensor * a,
float min,
float max) {
bool is_node = false;
if (a->grad) {
GGML_ASSERT(false); // TODO: implement backward
is_node = true;
}
// TODO: when implement backward, fix this:
struct ggml_tensor * result = ggml_view_tensor(ctx, a);
ggml_scratch_save(ctx);
struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
((float *) b->data)[0] = min;
((float *) b->data)[1] = max;
ggml_scratch_load(ctx);
result->op = GGML_OP_CLAMP;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src0 = a;
result->src1 = b;
return result;
}
// ggml_conv_1d_1s // ggml_conv_1d_1s
struct ggml_tensor * ggml_conv_1d_1s( struct ggml_tensor * ggml_conv_1d_1s(
@ -7960,7 +8080,7 @@ static void ggml_compute_forward_mul_f32(
const struct ggml_tensor * src0, const struct ggml_tensor * src0,
const struct ggml_tensor * src1, const struct ggml_tensor * src1,
struct ggml_tensor * dst) { struct ggml_tensor * dst) {
assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); GGML_ASSERT(ggml_can_repeat_rows(src1, src0) && ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return; return;
@ -7968,10 +8088,25 @@ static void ggml_compute_forward_mul_f32(
const int ith = params->ith; const int ith = params->ith;
const int nth = params->nth; const int nth = params->nth;
const int nr = ggml_nrows(src0); #ifdef GGML_USE_CUBLAS
const int64_t ne0 = src0->ne[0]; if (src1->backend == GGML_BACKEND_CUDA) {
const int64_t ne1 = src0->ne[1]; if (ith == 0) {
const int64_t ne2 = src0->ne[2]; ggml_cuda_mul(src0, src1, dst);
}
return;
}
#endif
const int64_t nr = ggml_nrows(src0);
const int64_t ne00 = src0->ne[0];
const int64_t ne01 = src0->ne[1];
const int64_t ne02 = src0->ne[2];
const int64_t ne10 = src1->ne[0];
const int64_t ne11 = src1->ne[1];
const int64_t ne12 = src1->ne[2];
const int64_t ne13 = src1->ne[3];
const size_t nb00 = src0->nb[0]; const size_t nb00 = src0->nb[0];
const size_t nb01 = src0->nb[1]; const size_t nb01 = src0->nb[1];
@ -7990,44 +8125,51 @@ static void ggml_compute_forward_mul_f32(
GGML_ASSERT( nb0 == sizeof(float)); GGML_ASSERT( nb0 == sizeof(float));
GGML_ASSERT(nb00 == sizeof(float)); GGML_ASSERT(nb00 == sizeof(float));
GGML_ASSERT(ne00 == ne10);
if (nb10 == sizeof(float)) { if (nb10 == sizeof(float)) {
for (int ir = ith; ir < nr; ir += nth) { for (int64_t ir = ith; ir < nr; ir += nth) {
// src0, src1 and dst are same shape => same indices // src0 and dst are same shape => same indices
const int i3 = ir/(ne2*ne1); const int64_t i03 = ir/(ne02*ne01);
const int i2 = (ir - i3*ne2*ne1)/ne1; const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
const int i1 = (ir - i3*ne2*ne1 - i2*ne1); const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
const int64_t i13 = i03 % ne13;
const int64_t i12 = i02 % ne12;
const int64_t i11 = i01 % ne11;
float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
#ifdef GGML_USE_ACCELERATE #ifdef GGML_USE_ACCELERATE
UNUSED(ggml_vec_mul_f32); UNUSED(ggml_vec_mul_f32);
vDSP_vmul( vDSP_vmul( src0_ptr, 1, src1_ptr, 1, dst_ptr, 1, ne00);
(float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1,
(float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1,
(float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), 1,
ne0);
#else #else
ggml_vec_mul_f32(ne0, ggml_vec_mul_f32(ne00, dst_ptr, src0_ptr, src1_ptr);
(float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ),
(float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01),
(float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11));
#endif #endif
// } // }
// } // }
} }
} else { } else {
// src1 is not contiguous // src1 is not contiguous
for (int ir = ith; ir < nr; ir += nth) { for (int64_t ir = ith; ir < nr; ir += nth) {
// src0, src1 and dst are same shape => same indices // src0 and dst are same shape => same indices
const int i3 = ir/(ne2*ne1); // src1 is broadcastable across src0 and dst in i1, i2, i3
const int i2 = (ir - i3*ne2*ne1)/ne1; const int64_t i03 = ir/(ne02*ne01);
const int i1 = (ir - i3*ne2*ne1 - i2*ne1); const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
float * dst_ptr = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ); const int64_t i13 = i03 % ne13;
float * src0_ptr = (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); const int64_t i12 = i02 % ne12;
for (int i0 = 0; i0 < ne0; i0++) { const int64_t i11 = i01 % ne11;
float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11 + i0*nb10);
float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
for (int64_t i0 = 0; i0 < ne00; i0++) {
float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i0*nb10);
dst_ptr[i0] = src0_ptr[i0] * (*src1_ptr); dst_ptr[i0] = src0_ptr[i0] * (*src1_ptr);
} }
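Relaxing the shape check from ggml_are_same_shape to ggml_can_repeat_rows lets src1 be broadcast row-wise over src0 (the i13/i12/i11 indices wrap with the modulo), which is what later allows the norm weights to be multiplied in without an explicit ggml_repeat. A rough sketch of the new semantics, with illustrative shapes and an assumed ggml_context:

```cpp
// Sketch: with row broadcasting in ggml_mul, one row of scales is applied to
// every row of a matrix, replacing ggml_mul(ctx, ggml_repeat(ctx, scale, mat), mat).
struct ggml_tensor * scale_rows(struct ggml_context * ctx,
                                struct ggml_tensor * mat /* ne0 x ne1, F32 */) {
    // a single row of per-element scales, broadcast across all rows of `mat`
    struct ggml_tensor * scale = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, mat->ne[0]);
    ggml_set_f32(scale, 0.5f); // illustrative: halve every element

    return ggml_mul(ctx, mat, scale);
}
```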
@ -10503,6 +10645,7 @@ static void ggml_compute_forward_diag_mask_f32(
const int n_past = ((int32_t *) src1->data)[0]; const int n_past = ((int32_t *) src1->data)[0];
const bool inplace = (bool)((int32_t *) src1->data)[1]; const bool inplace = (bool)((int32_t *) src1->data)[1];
assert(n_past >= 0); assert(n_past >= 0);
if (!inplace && (params->type == GGML_TASK_INIT)) { if (!inplace && (params->type == GGML_TASK_INIT)) {
@ -10673,7 +10816,7 @@ static void ggml_compute_forward_alibi_f32(
struct ggml_tensor * dst) { struct ggml_tensor * dst) {
assert(params->ith == 0); assert(params->ith == 0);
assert(src1->type == GGML_TYPE_I32); assert(src1->type == GGML_TYPE_I32);
assert(ggml_nelements(src1) == 2); assert(ggml_nelements(src1) == 3);
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return; return;
@ -10681,6 +10824,7 @@ static void ggml_compute_forward_alibi_f32(
const int n_past = ((int32_t *) src1->data)[0]; const int n_past = ((int32_t *) src1->data)[0];
const int n_head = ((int32_t *) src1->data)[1]; const int n_head = ((int32_t *) src1->data)[1];
const float max_bias = ((float *) src1->data)[2];
assert(n_past >= 0); assert(n_past >= 0);
@ -10703,8 +10847,8 @@ static void ggml_compute_forward_alibi_f32(
// add alibi to src0 (KQ_scaled) // add alibi to src0 (KQ_scaled)
const int n_heads_log2_floor = 1 << (int) floor(log2(n_head)); const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
const float m0 = powf(2.0f, -8.0f / n_heads_log2_floor); const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
const float m1 = powf(2.0f, -4.0f / n_heads_log2_floor); const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
for (int i = 0; i < ne0; i++) { for (int i = 0; i < ne0; i++) {
for (int j = 0; j < ne1; j++) { for (int j = 0; j < ne1; j++) {
@ -10722,12 +10866,12 @@ static void ggml_compute_forward_alibi_f32(
m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1); m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
} }
pdst[0] = i * m_k + src[0]; pdst[0] = (i-ne0+1) * m_k + src[0];
}
}
}
}
}
}
}
}
static void ggml_compute_forward_alibi_f16( static void ggml_compute_forward_alibi_f16(
const struct ggml_compute_params * params, const struct ggml_compute_params * params,
@ -10736,7 +10880,7 @@ static void ggml_compute_forward_alibi_f16(
struct ggml_tensor * dst) { struct ggml_tensor * dst) {
assert(params->ith == 0); assert(params->ith == 0);
assert(src1->type == GGML_TYPE_I32); assert(src1->type == GGML_TYPE_I32);
assert(ggml_nelements(src1) == 2); assert(ggml_nelements(src1) == 3);
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return; return;
@ -10744,6 +10888,7 @@ static void ggml_compute_forward_alibi_f16(
const int n_past = ((int32_t *) src1->data)[0]; const int n_past = ((int32_t *) src1->data)[0];
const int n_head = ((int32_t *) src1->data)[1]; const int n_head = ((int32_t *) src1->data)[1];
const float max_bias = ((float *) src1->data)[2];
assert(n_past >= 0); assert(n_past >= 0);
@ -10766,8 +10911,8 @@ static void ggml_compute_forward_alibi_f16(
// add alibi to src0 (KQ_scaled) // add alibi to src0 (KQ_scaled)
const int n_heads_log2_floor = 1 << (int) floor(log2(n_head)); const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
const float m0 = powf(2.0f, -8.0f / n_heads_log2_floor); const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
const float m1 = powf(2.0f, -4.0f / n_heads_log2_floor); const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
for (int i = 0; i < ne0; i++) { for (int i = 0; i < ne0; i++) {
for (int j = 0; j < ne1; j++) { for (int j = 0; j < ne1; j++) {
@ -10786,7 +10931,7 @@ static void ggml_compute_forward_alibi_f16(
} }
// we return F32 // we return F32
pdst[0] = i * m_k + GGML_FP16_TO_FP32(src[0]); pdst[0] = (i-ne0+1) * m_k + GGML_FP16_TO_FP32(src[0]);
} }
} }
} }
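The new max_bias argument generalizes the previously hard-coded 8.0f and 4.0f exponents; 8.0f reproduces the original ALiBi schedule, and other model families can pass their own value. A standalone sketch of the per-head slopes this produces; the head count is illustrative:

```cpp
#include <cmath>
#include <cstdio>

// Sketch: the per-head ALiBi slopes generated by the m0/m1 schedule above.
int main() {
    const int   n_head   = 32;    // illustrative
    const float max_bias = 8.0f;  // the original ALiBi value, now configurable

    const int   n_heads_log2_floor = 1 << (int) floorf(log2f((float) n_head));
    const float m0 = powf(2.0f, -(max_bias)        / n_heads_log2_floor);
    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);

    for (int k = 0; k < n_head; ++k) {
        const float m_k = k < n_heads_log2_floor
            ? powf(m0, k + 1)
            : powf(m1, 2*(k - n_heads_log2_floor) + 1);
        printf("head %2d: slope %.6f\n", k, m_k);
    }
    return 0;
}
```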
@ -10822,6 +10967,77 @@ static void ggml_compute_forward_alibi(
} }
} }
// ggml_compute_forward_clamp
static void ggml_compute_forward_clamp_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
assert(params->ith == 0);
assert(src1->type == GGML_TYPE_I32);
assert(ggml_nelements(src1) == 3);
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return;
}
const float min = ((float *) src1->data)[0];
const float max = ((float *) src1->data)[1];
const int ith = params->ith;
const int nth = params->nth;
const int n = ggml_nrows(src0);
const int nc = src0->ne[0];
const size_t nb00 = src0->nb[0];
const size_t nb01 = src0->nb[1];
const size_t nb0 = dst->nb[0];
const size_t nb1 = dst->nb[1];
GGML_ASSERT( nb0 == sizeof(float));
GGML_ASSERT(nb00 == sizeof(float));
for (int j = ith; j < n; j += nth) {
float * dst_ptr = (float *) ((char *) dst->data + j*nb1);
float * src0_ptr = (float *) ((char *) src0->data + j*nb01);
for (int i = 0; i < nc; i++) {
dst_ptr[i] = MAX(MIN(src0_ptr[i], max), min);
}
}
}
static void ggml_compute_forward_clamp(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_clamp_f32(params, src0, src1, dst);
} break;
case GGML_TYPE_F16:
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
case GGML_TYPE_Q5_1:
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q8_1:
case GGML_TYPE_I8:
case GGML_TYPE_I16:
case GGML_TYPE_I32:
case GGML_TYPE_COUNT:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_rope // ggml_compute_forward_rope
static void ggml_compute_forward_rope_f32( static void ggml_compute_forward_rope_f32(
@ -12803,6 +13019,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{ {
ggml_compute_forward_alibi(params, tensor->src0, tensor->src1, tensor); ggml_compute_forward_alibi(params, tensor->src0, tensor->src1, tensor);
} break; } break;
case GGML_OP_CLAMP:
{
ggml_compute_forward_clamp(params, tensor->src0, tensor->src1, tensor);
} break;
case GGML_OP_CONV_1D_1S: case GGML_OP_CONV_1D_1S:
{ {
ggml_compute_forward_conv_1d_1s(params, tensor->src0, tensor->src1, tensor); ggml_compute_forward_conv_1d_1s(params, tensor->src0, tensor->src1, tensor);
@ -13110,6 +13330,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
{ {
GGML_ASSERT(false); // TODO: not implemented GGML_ASSERT(false); // TODO: not implemented
} break; } break;
case GGML_OP_CLAMP:
{
GGML_ASSERT(false); // TODO: not implemented
} break;
case GGML_OP_SILU: case GGML_OP_SILU:
{ {
// necessary for llama // necessary for llama
@ -13996,6 +14220,10 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
{ {
node->n_tasks = 1; //TODO node->n_tasks = 1; //TODO
} break; } break;
case GGML_OP_CLAMP:
{
node->n_tasks = 1; //TODO
} break;
case GGML_OP_CONV_1D_1S: case GGML_OP_CONV_1D_1S:
case GGML_OP_CONV_1D_2S: case GGML_OP_CONV_1D_2S:
{ {
ggml.h (14 changed lines)
@ -314,6 +314,7 @@ extern "C" {
GGML_OP_ROPE, GGML_OP_ROPE,
GGML_OP_ROPE_BACK, GGML_OP_ROPE_BACK,
GGML_OP_ALIBI, GGML_OP_ALIBI,
GGML_OP_CLAMP,
GGML_OP_CONV_1D_1S, GGML_OP_CONV_1D_1S,
GGML_OP_CONV_1D_2S, GGML_OP_CONV_1D_2S,
@ -850,7 +851,7 @@ extern "C" {
int n_past); int n_past);
// in-place, returns view(a) // in-place, returns view(a)
GGML_API struct ggml_tensor * gml_diag_mask_zero_inplace( GGML_API struct ggml_tensor * ggml_diag_mask_zero_inplace(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
int n_past); int n_past);
@ -898,7 +899,16 @@ extern "C" {
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
int n_past, int n_past,
int n_head); int n_head,
float bias_max);
// clamp
// in-place, returns view(a)
struct ggml_tensor * ggml_clamp(
struct ggml_context * ctx,
struct ggml_tensor * a,
float min,
float max);
// padding = 1 // padding = 1
// TODO: we don't support extra parameters for now // TODO: we don't support extra parameters for now
@ -172,7 +172,7 @@ struct llama_mmap {
#ifdef _POSIX_MAPPED_FILES #ifdef _POSIX_MAPPED_FILES
static constexpr bool SUPPORTED = true; static constexpr bool SUPPORTED = true;
llama_mmap(struct llama_file * file, bool prefetch = true) { llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */) {
size = file->size; size = file->size;
int fd = fileno(file->fp); int fd = fileno(file->fp);
int flags = MAP_SHARED; int flags = MAP_SHARED;
@ -184,9 +184,9 @@ struct llama_mmap {
throw std::runtime_error(format("mmap failed: %s", strerror(errno))); throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
} }
if (prefetch) { if (prefetch > 0) {
// Advise the kernel to preload the mapped memory // Advise the kernel to preload the mapped memory
if (madvise(addr, file->size, MADV_WILLNEED)) { if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) {
fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n", fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
strerror(errno)); strerror(errno));
} }
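Turning prefetch from a bool into a byte count means only the part of the file that stays on the CPU is advised into the page cache; (size_t) -1 keeps the old prefetch-everything behaviour and 0 disables prefetching. A minimal POSIX-only sketch of the same pattern outside llama.cpp; fd and size are assumed to come from an already opened file:

```cpp
#include <sys/mman.h>
#include <algorithm>
#include <cstddef>

// Sketch: map `size` bytes of file descriptor `fd` and ask the kernel to
// prefetch only the first `prefetch` bytes, mirroring the llama_mmap change.
void * map_with_prefetch(int fd, size_t size, size_t prefetch /* (size_t) -1 = all */) {
    void * addr = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, 0);
    if (addr == MAP_FAILED) {
        return NULL;
    }
    if (prefetch > 0) {
        // advise only the region we expect to touch soon
        madvise(addr, std::min(size, prefetch), MADV_WILLNEED);
    }
    return addr;
}
```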
llama.cpp (239 changed lines)
@ -1,6 +1,7 @@
// Defines fileno on msys: // Defines fileno on msys:
#ifndef _GNU_SOURCE #ifndef _GNU_SOURCE
#define _GNU_SOURCE #define _GNU_SOURCE
#include <cstddef>
#include <cstdint> #include <cstdint>
#include <cstdio> #include <cstdio>
#endif #endif
@ -428,27 +429,31 @@ struct llama_file_loader {
} }
void read_magic() { void read_magic() {
uint32_t magic = file.read_u32(); uint32_t magic = file.read_u32();
uint32_t version = 0;
if (magic != 'ggml') { if (magic == LLAMA_FILE_MAGIC_GGML) {
version = file.read_u32(); file_version = LLAMA_FILE_VERSION_GGML;
return;
}
uint32_t version = file.read_u32();
switch (magic) {
case LLAMA_FILE_MAGIC_GGMF:
switch (version) {
case 1: file_version = LLAMA_FILE_VERSION_GGMF_V1; return;
}
break;
case LLAMA_FILE_MAGIC_GGJT:
switch (version) {
case 1: file_version = LLAMA_FILE_VERSION_GGJT_V1; return;
case 2: file_version = LLAMA_FILE_VERSION_GGJT_V2; return;
case 3: file_version = LLAMA_FILE_VERSION_GGJT_V3; return;
}
} }
if (magic == 'ggml' && version == 0) {
file_version = LLAMA_FILE_VERSION_GGML;
} else if (magic == 'ggmf' && version == 1) {
file_version = LLAMA_FILE_VERSION_GGMF_V1;
} else if (magic == 'ggjt' && version == 1) {
file_version = LLAMA_FILE_VERSION_GGJT_V1;
} else if (magic == 'ggjt' && version == 2) {
file_version = LLAMA_FILE_VERSION_GGJT_V2;
} else if (magic == 'ggjt' && version == 3) {
file_version = LLAMA_FILE_VERSION_GGJT_V3;
} else {
throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?", throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
magic, version); magic, version);
} }
}
void read_hparams() { void read_hparams() {
hparams.n_vocab = file.read_u32(); hparams.n_vocab = file.read_u32();
hparams.n_embd = file.read_u32(); hparams.n_embd = file.read_u32();
@ -647,7 +652,7 @@ struct llama_model_loader {
} }
} }
struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne) { struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
auto it = tensors_map.name_to_idx.find(name); auto it = tensors_map.name_to_idx.find(name);
if (it == tensors_map.name_to_idx.end()) { if (it == tensors_map.name_to_idx.end()) {
throw format("llama.cpp: tensor '%s' is missing from model", name.c_str()); throw format("llama.cpp: tensor '%s' is missing from model", name.c_str());
@ -658,10 +663,10 @@ struct llama_model_loader {
name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str()); name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
} }
return get_tensor_for(lt); return get_tensor_for(lt, backend);
} }
struct ggml_tensor * get_tensor_for(llama_load_tensor & lt) { struct ggml_tensor * get_tensor_for(llama_load_tensor & lt, ggml_backend backend) {
struct ggml_tensor * tensor; struct ggml_tensor * tensor;
if (lt.ne.size() == 2) { if (lt.ne.size() == 2) {
tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1)); tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1));
@ -671,6 +676,7 @@ struct llama_model_loader {
} }
ggml_set_name(tensor, lt.name.c_str()); ggml_set_name(tensor, lt.name.c_str());
LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
tensor->backend = backend;
lt.ggml_tensor = tensor; lt.ggml_tensor = tensor;
num_ggml_tensors_created++; num_ggml_tensors_created++;
return tensor; return tensor;
@ -684,12 +690,16 @@ struct llama_model_loader {
void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) { void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
size_t data_size = 0; size_t data_size = 0;
size_t prefetch_size = 0;
for (const llama_load_tensor & lt : tensors_map.tensors) { for (const llama_load_tensor & lt : tensors_map.tensors) {
data_size += lt.size; data_size += lt.size;
if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
prefetch_size += lt.size;
}
} }
if (use_mmap) { if (use_mmap) {
mapping.reset(new llama_mmap(&file_loaders.at(0)->file)); mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size));
if (!lmlock) { if (!lmlock) {
// Don't call the callback since the actual loading will be lazy // Don't call the callback since the actual loading will be lazy
// and we can't measure it. // and we can't measure it.
@ -702,6 +712,9 @@ struct llama_model_loader {
size_t done_size = 0; size_t done_size = 0;
for (llama_load_tensor & lt : tensors_map.tensors) { for (llama_load_tensor & lt : tensors_map.tensors) {
if (lt.ggml_tensor->backend != GGML_BACKEND_CPU) {
continue;
}
if (progress_callback) { if (progress_callback) {
progress_callback((float) done_size / data_size, progress_callback_user_data); progress_callback((float) done_size / data_size, progress_callback_user_data);
} }
@ -714,9 +727,6 @@ struct llama_model_loader {
lmlock->grow_to(done_size); lmlock->grow_to(done_size);
} }
} }
if (progress_callback) {
progress_callback(1.0f, progress_callback_user_data);
}
} }
void load_data_for(llama_load_tensor & lt) { void load_data_for(llama_load_tensor & lt) {
@ -971,27 +981,7 @@ static void llama_model_load_internal(
size_t ctx_size; size_t ctx_size;
size_t mmapped_size; size_t mmapped_size;
ml->calc_sizes(&ctx_size, &mmapped_size); ml->calc_sizes(&ctx_size, &mmapped_size);
fprintf(stderr, "%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/1024.0/1024.0); fprintf(stderr, "%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0);
// print memory requirements
{
const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
// this is the total memory required to run the inference
const size_t mem_required =
ctx_size +
mmapped_size +
MEM_REQ_SCRATCH0().at(model.type) +
MEM_REQ_SCRATCH1().at(model.type) +
MEM_REQ_EVAL().at(model.type);
// this is the memory required by one llama_state
const size_t mem_required_state =
scale*MEM_REQ_KV_SELF().at(model.type);
fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
}
// create the ggml context // create the ggml context
{ {
@ -1013,7 +1003,14 @@ static void llama_model_load_internal(
} }
} }
#ifdef GGML_USE_CUBLAS
#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CUDA
#else
#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU
#endif
// prepare memory for the weights // prepare memory for the weights
size_t vram_total = 0;
{ {
const uint32_t n_embd = hparams.n_embd; const uint32_t n_embd = hparams.n_embd;
const uint32_t n_layer = hparams.n_layer; const uint32_t n_layer = hparams.n_layer;
@ -1021,33 +1018,87 @@ static void llama_model_load_internal(
ml->ggml_ctx = ctx; ml->ggml_ctx = ctx;
model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}); model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU);
model.norm = ml->get_tensor("norm.weight", {n_embd}); model.norm = ml->get_tensor("norm.weight", {n_embd}, GGML_BACKEND_CPU);
model.output = ml->get_tensor("output.weight", {n_embd, n_vocab});
// "output" tensor
{
ggml_backend backend_output;
if (n_gpu_layers > int(n_layer)) { // NOLINT
backend_output = LLAMA_BACKEND_OFFLOAD;
} else {
backend_output = GGML_BACKEND_CPU;
}
model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}, backend_output);
}
const int i_gpu_start = n_layer - n_gpu_layers;
model.layers.resize(n_layer); model.layers.resize(n_layer);
for (uint32_t i = 0; i < n_layer; ++i) { for (uint32_t i = 0; i < n_layer; ++i) {
const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
auto & layer = model.layers[i]; auto & layer = model.layers[i];
std::string layers_i = "layers." + std::to_string(i); std::string layers_i = "layers." + std::to_string(i);
layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}); layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend);
layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}); layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend);
layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}); layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}, backend);
layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}); layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}, backend);
layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}); layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend);
layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}); layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend);
layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}); layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend);
layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}); layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend);
layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}); layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend);
if (backend == GGML_BACKEND_CUDA) {
vram_total +=
ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
}
} }
} }
ml->done_getting_tensors(); ml->done_getting_tensors();
// print memory requirements
{
const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
// this is the total memory required to run the inference
const size_t mem_required =
ctx_size +
mmapped_size - vram_total + // weights in VRAM not in memory
MEM_REQ_SCRATCH0().at(model.type) +
MEM_REQ_SCRATCH1().at(model.type) +
MEM_REQ_EVAL().at(model.type);
// this is the memory required by one llama_state
const size_t mem_required_state =
scale*MEM_REQ_KV_SELF().at(model.type);
fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
#ifdef GGML_USE_CUBLAS
const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu);
if (n_gpu_layers > (int) hparams.n_layer) {
fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
}
fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
#elif !defined(GGML_USE_CLBLAST)
(void) n_gpu_layers;
#endif
}
// populate `tensors_by_name` // populate `tensors_by_name`
for (llama_load_tensor & lt : ml->tensors_map.tensors) { for (llama_load_tensor & lt : ml->tensors_map.tensors) {
model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor); model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
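With per-tensor backends, i_gpu_start = n_layer - n_gpu_layers splits the repeating layers between CPU and GPU, and the output matrix is only offloaded when n_gpu_layers exceeds n_layer. A small worked sketch with illustrative numbers:

```cpp
#include <cstdio>

// Sketch: how n_gpu_layers maps onto backends for a 32-layer model.
int main() {
    const int n_layer      = 32; // illustrative
    const int n_gpu_layers = 20; // illustrative request

    const int i_gpu_start = n_layer - n_gpu_layers; // = 12

    for (int i = 0; i < n_layer; ++i) {
        const bool on_gpu = i >= i_gpu_start;
        printf("layer %2d -> %s\n", i, on_gpu ? "LLAMA_BACKEND_OFFLOAD" : "GGML_BACKEND_CPU");
    }
    // the output matrix moves to the GPU only once all repeating layers are offloaded
    printf("output   -> %s\n", n_gpu_layers > n_layer ? "LLAMA_BACKEND_OFFLOAD" : "GGML_BACKEND_CPU");
    return 0;
}
```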
@ -1055,38 +1106,32 @@ static void llama_model_load_internal(
ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL); ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
model.mapping = std::move(ml->mapping); #ifdef GGML_USE_CUBLAS
#if defined(GGML_USE_CUBLAS)
{ {
const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer)); size_t done_size = 0;
size_t data_size = 0;
fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu); for (llama_load_tensor & lt : ml->tensors_map.tensors) {
data_size += lt.size;
size_t vram_total = 0; if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
done_size += lt.size;
for (int i = 0; i < n_gpu; ++i) {
const auto & layer = model.layers[i];
ggml_cuda_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq);
ggml_cuda_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk);
ggml_cuda_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv);
ggml_cuda_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo);
ggml_cuda_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1);
ggml_cuda_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2);
ggml_cuda_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
} }
if (n_gpu_layers > (int) hparams.n_layer) {
fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
ggml_cuda_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
} }
for (llama_load_tensor & lt : ml->tensors_map.tensors) {
fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024); if (lt.ggml_tensor->backend != GGML_BACKEND_CUDA) {
continue;
}
if (progress_callback) {
progress_callback((float) done_size / data_size, progress_callback_user_data);
}
ggml_cuda_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
done_size += lt.size;
}
} }
#elif defined(GGML_USE_CLBLAST) #elif defined(GGML_USE_CLBLAST)
{ {
const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer)); const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
fprintf(stderr, "%s: [opencl] offloading %d layers to GPU\n", __func__, n_gpu); fprintf(stderr, "ggml_opencl: offloading %d layers to GPU\n", n_gpu);
size_t vram_total = 0; size_t vram_total = 0;
@ -1102,16 +1147,22 @@ static void llama_model_load_internal(
ggml_cl_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3); ggml_cl_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
} }
if (n_gpu_layers > (int) hparams.n_layer) { if (n_gpu_layers > (int) hparams.n_layer) {
fprintf(stderr, "%s: [opencl] offloading output layer to GPU\n", __func__); fprintf(stderr, "ggml_opencl: offloading output layer to GPU\n");
ggml_cl_transform_tensor(model.output); vram_total += ggml_nbytes(model.output); ggml_cl_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
} }
fprintf(stderr, "%s: [opencl] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024); fprintf(stderr, "ggml_opencl: total VRAM used: %zu MB\n", vram_total / 1024 / 1024);
} }
#else #else
(void) n_gpu_layers; (void) n_gpu_layers;
#endif #endif
if (progress_callback) {
progress_callback(1.0f, progress_callback_user_data);
}
model.mapping = std::move(ml->mapping);
// loading time will be recalculate after the first eval, so // loading time will be recalculate after the first eval, so
// we take page faults deferred by mmap() into consideration // we take page faults deferred by mmap() into consideration
lctx.t_load_us = ggml_time_us() - lctx.t_start_us; lctx.t_load_us = ggml_time_us() - lctx.t_start_us;
@ -1209,10 +1260,8 @@ static bool llama_eval_internal(
{ {
cur = ggml_rms_norm(ctx0, inpL); cur = ggml_rms_norm(ctx0, inpL);
// cur = attention_norm*cur // cur = cur*attention_norm(broadcasted)
cur = ggml_mul(ctx0, cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm);
ggml_repeat(ctx0, model.layers[il].attention_norm, cur),
cur);
} }
// self-attention // self-attention
@ -1319,10 +1368,8 @@ static bool llama_eval_internal(
{ {
cur = ggml_rms_norm(ctx0, inpFF); cur = ggml_rms_norm(ctx0, inpFF);
// cur = ffn_norm*cur // cur = cur*ffn_norm(broadcasted)
cur = ggml_mul(ctx0, cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
ggml_repeat(ctx0, model.layers[il].ffn_norm, cur),
cur);
} }
struct ggml_tensor * tmp = ggml_mul_mat(ctx0, struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
@ -1359,10 +1406,8 @@ static bool llama_eval_internal(
inpL = ggml_rms_norm(ctx0, inpL); inpL = ggml_rms_norm(ctx0, inpL);
// inpL = norm*inpL // inpL = inpL*norm(broadcasted)
inpL = ggml_mul(ctx0, inpL = ggml_mul(ctx0, inpL, model.norm);
ggml_repeat(ctx0, model.norm, inpL),
inpL);
embeddings = inpL; embeddings = inpL;
} }
@ -2186,7 +2231,7 @@ struct llama_context * llama_init_from_file(
unsigned * cur_percentage_p = (unsigned *) ctx; unsigned * cur_percentage_p = (unsigned *) ctx;
unsigned percentage = (unsigned) (100 * progress); unsigned percentage = (unsigned) (100 * progress);
while (percentage > *cur_percentage_p) { while (percentage > *cur_percentage_p) {
++*cur_percentage_p; *cur_percentage_p = percentage;
fprintf(stderr, "."); fprintf(stderr, ".");
fflush(stderr); fflush(stderr);
if (percentage >= 100) { if (percentage >= 100) {
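Setting the stored percentage directly to the reported value, instead of incrementing it one dot at a time, keeps the default printer responsive now that whole groups of tensors can be skipped or loaded straight into VRAM. A hedged sketch of a custom callback with the same shape as llama_progress_callback; the output format is illustrative:

```cpp
#include <cstdio>

// Sketch: a replacement progress callback in the style of the default one above.
static void print_progress(float progress, void * ctx) {
    unsigned * last_pct = (unsigned *) ctx;
    const unsigned pct = (unsigned) (100 * progress);
    if (pct != *last_pct) {
        *last_pct = pct; // jump straight to the new value, as the fix above does
        fprintf(stderr, "\rloading: %3u%%", pct);
        fflush(stderr);
        if (pct >= 100) {
            fprintf(stderr, "\n");
        }
    }
}
```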
@ -2279,7 +2324,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
{ {
uint32_t magic; uint32_t magic;
fin.read((char *) &magic, sizeof(magic)); fin.read((char *) &magic, sizeof(magic));
if (magic != 'ggla') { if (magic != LLAMA_FILE_MAGIC_GGLA) {
fprintf(stderr, "%s: bad file magic\n", __func__); fprintf(stderr, "%s: bad file magic\n", __func__);
return 1; return 1;
} }
@ -2343,7 +2388,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
// maybe this should in llama_model_loader // maybe this should in llama_model_loader
if (model_loader->use_mmap) { if (model_loader->use_mmap) {
model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ false)); model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0));
} }
} }
@ -2436,7 +2481,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
} }
size_t idx = model_loader->tensors_map.name_to_idx[base_name]; size_t idx = model_loader->tensors_map.name_to_idx[base_name];
llama_load_tensor & lt = model_loader->tensors_map.tensors[idx]; llama_load_tensor & lt = model_loader->tensors_map.tensors[idx];
base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }); base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU);
lt.data = (uint8_t *) lt.ggml_tensor->data; lt.data = (uint8_t *) lt.ggml_tensor->data;
model_loader->load_data_for(lt); model_loader->load_data_for(lt);
lt.ggml_tensor->data = lt.data; lt.ggml_tensor->data = lt.data;
llama.h (12 changed lines)
@ -19,10 +19,16 @@
# define LLAMA_API # define LLAMA_API
#endif #endif
#define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt'
#define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
#define LLAMA_FILE_MAGIC_GGMF 0x67676d66u // 'ggmf'
#define LLAMA_FILE_MAGIC_GGML 0x67676d6cu // 'ggml'
#define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
#define LLAMA_FILE_VERSION 3 #define LLAMA_FILE_VERSION 3
#define LLAMA_FILE_MAGIC 'ggjt' #define LLAMA_FILE_MAGIC LLAMA_FILE_MAGIC_GGJT
#define LLAMA_FILE_MAGIC_UNVERSIONED 'ggml' #define LLAMA_FILE_MAGIC_UNVERSIONED LLAMA_FILE_MAGIC_GGML
#define LLAMA_SESSION_MAGIC 'ggsn' #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
#define LLAMA_SESSION_VERSION 1 #define LLAMA_SESSION_VERSION 1
#ifdef __cplusplus #ifdef __cplusplus
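Spelling the magics out as hex constants removes the reliance on multi-character character constants such as 'ggjt', whose values are implementation-defined. A quick compile-time check, not part of the patch, that the new constants encode the same four ASCII bytes:

```cpp
#include <cstdint>

// Each magic is the four ASCII characters packed big-endian into a uint32_t.
constexpr uint32_t make_magic(char a, char b, char c, char d) {
    return (uint32_t(uint8_t(a)) << 24) | (uint32_t(uint8_t(b)) << 16) |
           (uint32_t(uint8_t(c)) <<  8) |  uint32_t(uint8_t(d));
}

static_assert(make_magic('g','g','j','t') == 0x67676a74u, "LLAMA_FILE_MAGIC_GGJT");
static_assert(make_magic('g','g','l','a') == 0x67676c61u, "LLAMA_FILE_MAGIC_GGLA");
static_assert(make_magic('g','g','m','f') == 0x67676d66u, "LLAMA_FILE_MAGIC_GGMF");
static_assert(make_magic('g','g','m','l') == 0x67676d6cu, "LLAMA_FILE_MAGIC_GGML");
static_assert(make_magic('g','g','s','n') == 0x6767736eu, "LLAMA_FILE_MAGIC_GGSN");
```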
@ -223,8 +223,8 @@ static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, co
void ggml_v2_cl_init_legacy(void) { void ggml_v2_cl_init_legacy(void) {
cl_int err = 0; cl_int err = 0;
char * GGML_V2_CLBLAST_PLATFORM = getenv("GGML_CLBLAST_PLATFORM"); char * GGML_V2_CLBLAST_PLATFORM = getenv("GGML_OPENCL_PLATFORM");
char * GGML_V2_CLBLAST_DEVICE = getenv("GGML_CLBLAST_DEVICE"); char * GGML_V2_CLBLAST_DEVICE = getenv("GGML_OPENCL_DEVICE");
int plat_num = (GGML_V2_CLBLAST_PLATFORM == NULL ? 0 : atoi(GGML_V2_CLBLAST_PLATFORM)); int plat_num = (GGML_V2_CLBLAST_PLATFORM == NULL ? 0 : atoi(GGML_V2_CLBLAST_PLATFORM));
int dev_num = (GGML_V2_CLBLAST_DEVICE == NULL ? 0 : atoi(GGML_V2_CLBLAST_DEVICE)); int dev_num = (GGML_V2_CLBLAST_DEVICE == NULL ? 0 : atoi(GGML_V2_CLBLAST_DEVICE));
printf("\nInitializing LEGACY CLBlast (First Run)..."); printf("\nInitializing LEGACY CLBlast (First Run)...");
@ -298,8 +298,8 @@ static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, co
void ggml_v2_cl_init(void) { void ggml_v2_cl_init(void) {
cl_int err = 0; cl_int err = 0;
char * GGML_V2_CLBLAST_PLATFORM = getenv("GGML_CLBLAST_PLATFORM"); char * GGML_V2_CLBLAST_PLATFORM = getenv("GGML_OPENCL_PLATFORM");
char * GGML_V2_CLBLAST_DEVICE = getenv("GGML_CLBLAST_DEVICE"); char * GGML_V2_CLBLAST_DEVICE = getenv("GGML_OPENCL_DEVICE");
int plat_num = (GGML_V2_CLBLAST_PLATFORM == NULL ? 0 : atoi(GGML_V2_CLBLAST_PLATFORM)); int plat_num = (GGML_V2_CLBLAST_PLATFORM == NULL ? 0 : atoi(GGML_V2_CLBLAST_PLATFORM));
int dev_num = (GGML_V2_CLBLAST_DEVICE == NULL ? 0 : atoi(GGML_V2_CLBLAST_DEVICE)); int dev_num = (GGML_V2_CLBLAST_DEVICE == NULL ? 0 : atoi(GGML_V2_CLBLAST_DEVICE));
printf("\nInitializing LEGACY v2 CLBlast (First Run)..."); printf("\nInitializing LEGACY v2 CLBlast (First Run)...");
@ -76,7 +76,7 @@ std::string utils_gpt_random_prompt(std::mt19937 & rng) {
return "The"; return "The";
} }
void replace(std::string & str, const std::string & needle, const std::string & replacement) { void utreplace(std::string & str, const std::string & needle, const std::string & replacement) {
size_t pos = 0; size_t pos = 0;
while ((pos = str.find(needle, pos)) != std::string::npos) { while ((pos = str.find(needle, pos)) != std::string::npos) {
str.replace(pos, needle.length(), replacement); str.replace(pos, needle.length(), replacement);
@ -148,9 +148,9 @@ std::map<std::string, int32_t> json_parse(const std::string & fname) {
has_key = false; has_key = false;
} }
::replace(str_key, "\\u0120", " " ); // \u0120 -> space ::utreplace(str_key, "\\u0120", " " ); // \u0120 -> space
::replace(str_key, "\\u010a", "\n"); // \u010a -> new line ::utreplace(str_key, "\\u010a", "\n"); // \u010a -> new line
::replace(str_key, "\\\"", "\""); // \\\" -> " ::utreplace(str_key, "\\\"", "\""); // \\\" -> "
try { try {
result[str_key] = std::stoi(str_val); result[str_key] = std::stoi(str_val);
@ -26,7 +26,7 @@ struct gpt_vocab {
std::map<id, token> id_to_token; std::map<id, token> id_to_token;
}; };
void replace(std::string & str, const std::string & needle, const std::string & replacement); void utreplace(std::string & str, const std::string & needle, const std::string & replacement);
// poor-man's JSON parsing // poor-man's JSON parsing
std::map<std::string, int32_t> json_parse(const std::string & fname); std::map<std::string, int32_t> json_parse(const std::string & fname);