diff --git a/Makefile b/Makefile
index f952ff126..8f57e406d 100644
--- a/Makefile
+++ b/Makefile
@@ -2,6 +2,7 @@ default: koboldcpp koboldcpp_noavx2 koboldcpp_openblas koboldcpp_openblas_noavx2
 simple: koboldcpp koboldcpp_noavx2
 tools: quantize_gpt2 quantize_gptj quantize_llama quantize_neox
 dev: koboldcpp_openblas
+dev2: koboldcpp_clblast
 
 ifndef UNAME_S
@@ -225,8 +226,8 @@ ggml_openblas_noavx2.o: ggml.c ggml.h
 	$(CC) $(CFLAGS) $(OPENBLAS_FLAGS) -c $< -o $@
 ggml_clblast.o: ggml.c ggml.h
 	$(CC) $(CFLAGS) $(BONUSCFLAGS1) $(BONUSCFLAGS2) $(CLBLAST_FLAGS) -c $< -o $@
-ggml-opencl.o: ggml-opencl.c ggml-opencl.h
-	$(CC) $(CFLAGS) -c $< -o $@
+ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
+	$(CXX) $(CXXFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
 ggml-opencl-legacy.o: ggml-opencl-legacy.c ggml-opencl-legacy.h
 	$(CC) $(CFLAGS) -c $< -o $@
 
@@ -248,6 +249,9 @@ expose.o: expose.cpp expose.h
 gpttype_adapter.o: gpttype_adapter.cpp
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
+gpttype_adapter_clblast.o: gpttype_adapter.cpp
+	$(CXX) $(CXXFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
+
 clean:
 	rm -vf *.o main quantize_llama quantize_gpt2 quantize_gptj quantize_neox quantize-stats perplexity embedding benchmark-matmult save-load-state build-info.h main.exe quantize_llama.exe quantize_gptj.exe quantize_gpt2.exe quantize_neox.exe koboldcpp.dll koboldcpp_openblas.dll koboldcpp_noavx2.dll koboldcpp_openblas_noavx2.dll koboldcpp_clblast.dll koboldcpp.so koboldcpp_openblas.so koboldcpp_noavx2.so koboldcpp_openblas_noavx2.so koboldcpp_clblast.so gptj.exe gpt2.exe
 
@@ -273,7 +277,7 @@ koboldcpp_noavx2: ggml_noavx2.o ggml_v1_noavx2.o expose.o common.o gpttype_adapt
 koboldcpp_openblas_noavx2: ggml_openblas_noavx2.o ggml_v1_noavx2.o expose.o common.o gpttype_adapter.o
 	$(OPENBLAS_NOAVX2_BUILD)
 
-koboldcpp_clblast: ggml_clblast.o ggml_v1.o expose.o common.o gpttype_adapter.o ggml-opencl.o ggml-opencl-legacy.o
+koboldcpp_clblast: ggml_clblast.o ggml_v1.o expose.o common.o gpttype_adapter_clblast.o ggml-opencl.o ggml-opencl-legacy.o
 	$(CLBLAST_BUILD)
 
 quantize_llama: examples/quantize/quantize.cpp ggml.o llama.o
diff --git a/ggml-opencl-legacy.h b/ggml-opencl-legacy.h
index 588a5bab6..ff7f4f0c2 100644
--- a/ggml-opencl-legacy.h
+++ b/ggml-opencl-legacy.h
@@ -6,6 +6,17 @@ extern "C" {
 #endif
 
+enum ggml_blas_order {
+    GGML_BLAS_ORDER_ROW_MAJOR = 101,
+    GGML_BLAS_ORDER_COLUMN_MAJOR = 102,
+};
+
+enum ggml_blas_op {
+    GGML_BLAS_OP_N = 111,
+    GGML_BLAS_OP_T = 112,
+    GGML_BLAS_OP_C = 113,
+};
+
 void ggml_cl_init_legacy(void);
 void ggml_cl_sgemm_wrapper_legacy(const enum ggml_blas_order order, const enum ggml_blas_op trans_a, const enum ggml_blas_op trans_b, const int m, const int n, const int k, const float alpha, const void *host_a, const int lda, const float *host_b, const int ldb, const float beta, float *host_c, const int ldc, const int btype);
diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp
new file mode 100644
index 000000000..1367408b8
--- /dev/null
+++ b/ggml-opencl.cpp
@@ -0,0 +1,1092 @@
+#include "ggml-opencl.h"
+
+#include <atomic>
+
+#define CL_TARGET_OPENCL_VERSION 110
+#include <clblast.h>
+#include <clblast_c.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "ggml.h"
+
+#define CL_DMMV_BLOCK_SIZE 32
+
+#define MULTILINE_QUOTE(...)
#__VA_ARGS__ +const char * clblast_dequant = MULTILINE_QUOTE( + +typedef char int8_t; +typedef uchar uint8_t; +typedef int int32_t; +typedef uint uint32_t; + +constant uint GGML_TYPE_Q4_0 = 2; +constant uint GGML_TYPE_Q4_1 = 3; +constant uint GGML_TYPE_Q5_0 = 6; +constant uint GGML_TYPE_Q5_1 = 7; +constant uint GGML_TYPE_Q8_0 = 8; +constant uint GGML_TYPE_Q8_1 = 9; + +constant uint QK4_0 = 32; +constant uint QR4_0 = 2; +struct block_q4_0 +{ + float d; + uint8_t qs[QK4_0 / 2]; +}; + +constant uint QK4_1 = 32; +constant uint QR4_1 = 2; +struct block_q4_1 +{ + float d; + float m; + uint8_t qs[QK4_1 / 2]; +}; + +constant uint QK5_0 = 32; +constant uint QR5_0 = 2; +struct __attribute__ ((packed)) block_q5_0 +{ + half d; + uint32_t qh; + uint8_t qs[QK5_0 / 2]; +}; + +constant uint QK5_1 = 32; +constant uint QR5_1 = 2; +struct block_q5_1 +{ + half d; + half m; + uint32_t qh; + uint8_t qs[QK5_1 / 2]; +}; + +constant uint QK8_0 = 32; +constant uint QR8_0 = 1; +struct block_q8_0 +{ + float d; + uint8_t qs[QK8_0]; +}; + + +__kernel void convert_fp16_to_fp32(__global half* x, __global float* y) { + const uint i = get_global_id(0); + + y[i] = vload_half(0, &x[i]); +} + + +__kernel void dequantize_row_q4_0(__global struct block_q4_0* x, __global float* y) { + constant uint qk = QK4_0; + + const uint i = get_global_id(0) / qk; + const uint j = get_local_id(0); + + const float d = x[i].d; + + const int x0 = (x[i].qs[j] & 0xf) - 8; + const int x1 = (x[i].qs[j] >> 4) - 8; + + y[i*qk + j + 0 ] = x0*d; + y[i*qk + j + qk/2] = x1*d; +} + +__kernel void dequantize_row_q4_1(__global struct block_q4_1* x, __global float* y) { + constant uint qk = QK4_1; + + const uint i = get_global_id(0) / qk; + const uint j = get_local_id(0); + + const float d = x[i].d; + const float m = x[i].m; + + const int x0 = (x[i].qs[j] & 0xf); + const int x1 = (x[i].qs[j] >> 4); + + y[i*qk + j + 0 ] = x0*d + m; + y[i*qk + j + qk/2] = x1*d + m; +} + +__kernel void dequantize_row_q5_0(__global struct block_q5_0* x, __global float* y) { + constant uint qk = QK5_0; + + const uint i = get_global_id(0) / qk; + const uint j = get_local_id(0); + + const float d = vload_half(0, (__global half*) &x[i].d); + + uint32_t qh = x[i].qh; + + const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; + const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; + + const int32_t x0 = ((x[i].qs[j] & 0xf) | xh_0) - 16; + const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16; + + y[i*qk + j + 0 ] = x0*d; + y[i*qk + j + qk/2] = x1*d; +} + +__kernel void dequantize_row_q5_1(__global struct block_q5_1* x, __global float* y) { + constant uint qk = QK5_1; + + const uint i = get_global_id(0) / qk; + const uint j = get_local_id(0); + + const float d = vload_half(0, (__global half*) &x[i].d); + const float m = vload_half(0, (__global half*) &x[i].m); + + uint32_t qh = x[i].qh; + + const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; + const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; + + const int x0 = (x[i].qs[j] & 0xf) | xh_0; + const int x1 = (x[i].qs[j] >> 4) | xh_1; + + y[i*qk + j + 0 ] = x0*d + m; + y[i*qk + j + qk/2] = x1*d + m; +} + +__kernel void dequantize_row_q8_0(__global struct block_q8_0* x, __global float* y) { + constant uint qk = QK8_0; + const uint i = get_global_id(0) / qk; + const uint j = get_local_id(0); + + const float d = x[i].d; + y[i*qk + j] = x[i].qs[j]*d; +} + +void dequantize_q4_0(__global const struct block_q4_0* x, const int ib, const int iqs, float* v0, float* v1) { + const float d = x[ib].d; + + const uint8_t vui = x[ib].qs[iqs]; + + const int8_t vi0 = vui 
& 0xF; + const int8_t vi1 = vui >> 4; + + *v0 = (vi0 - 8)*d; + *v1 = (vi1 - 8)*d; +} + +__kernel void dequantize_mul_mat_vec_q4_0(__global struct block_q4_0* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) { + const int block_size = get_local_size(0); + const int row = get_global_id(0) / block_size; + const int tid = get_local_id(0); + + const uint qk = QK4_0; + const uint qr = QR4_0; + + const int y_offset = qr == 1 ? 1 : qk/2; + + tmp[tid] = 0; + + for (int i = 0; i < ncols/block_size; i += 2) { + const int col = i*block_size + 2*tid; + const int ib = (row*ncols + col)/qk; // block index + const int iqs = (col%qk)/qr; // quant index + const int iybs = col - col%qk; // y block start index + + // dequantize + float v0, v1; + dequantize_q4_0(x, ib, iqs, &v0, &v1); + + // matrix multiplication + tmp[tid] += v0 * y[iybs + iqs + 0]; + tmp[tid] += v1 * y[iybs + iqs + y_offset]; + } + + // sum up partial sums and write back result + barrier(CLK_LOCAL_MEM_FENCE); + for (int s=block_size/2; s>0; s>>=1) { + if (tid < s) { + tmp[tid] += tmp[tid + s]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if (tid == 0) { + dst[row] = tmp[0]; + } +} + +void dequantize_q4_1(__global const struct block_q4_1* x, const int ib, const int iqs, float* v0, float* v1) { + const float d = x[ib].d; + const float m = x[ib].m; + + const uint8_t vui = x[ib].qs[iqs]; + + const int8_t vi0 = vui & 0xF; + const int8_t vi1 = vui >> 4; + + *v0 = vi0*d + m; + *v1 = vi1*d + m; +} +__kernel void dequantize_mul_mat_vec_q4_1(__global struct block_q4_1* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) { + const int block_size = get_local_size(0); + const int row = get_global_id(0) / block_size; + const int tid = get_local_id(0); + + const uint qk = QK4_1; + const uint qr = QR4_1; + + const int y_offset = qr == 1 ? 1 : qk/2; + + tmp[tid] = 0; + + for (int i = 0; i < ncols/block_size; i += 2) { + const int col = i*block_size + 2*tid; + const int ib = (row*ncols + col)/qk; // block index + const int iqs = (col%qk)/qr; // quant index + const int iybs = col - col%qk; // y block start index + + // dequantize + float v0, v1; + dequantize_q4_1(x, ib, iqs, &v0, &v1); + + // matrix multiplication + tmp[tid] += v0 * y[iybs + iqs + 0]; + tmp[tid] += v1 * y[iybs + iqs + y_offset]; + } + + // sum up partial sums and write back result + barrier(CLK_LOCAL_MEM_FENCE); + for (int s=block_size/2; s>0; s>>=1) { + if (tid < s) { + tmp[tid] += tmp[tid + s]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if (tid == 0) { + dst[row] = tmp[0]; + } +} + +void dequantize_q5_0(__global const struct block_q5_0* x, const int ib, const int iqs, float* v0, float* v1) { + const float d = vload_half(0, (__global half*) &x[ib].d); + + uint32_t qh = x[ib].qh; + + const uint8_t xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10; + const uint8_t xh_1 = ((qh >> (iqs + 12)) ) & 0x10; + + const int32_t x0 = ((x[ib].qs[iqs] & 0xf) | xh_0) - 16; + const int32_t x1 = ((x[ib].qs[iqs] >> 4) | xh_1) - 16; + + *v0 = x0*d; + *v1 = x1*d; +} +__kernel void dequantize_mul_mat_vec_q5_0(__global struct block_q5_0* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) { + const int block_size = get_local_size(0); + const int row = get_global_id(0) / block_size; + const int tid = get_local_id(0); + + const uint qk = QK5_0; + const uint qr = QR5_0; + + const int y_offset = qr == 1 ? 
1 : qk/2; + + tmp[tid] = 0; + + for (int i = 0; i < ncols/block_size; i += 2) { + const int col = i*block_size + 2*tid; + const int ib = (row*ncols + col)/qk; // block index + const int iqs = (col%qk)/qr; // quant index + const int iybs = col - col%qk; // y block start index + + // dequantize + float v0, v1; + dequantize_q5_0(x, ib, iqs, &v0, &v1); + + // matrix multiplication + tmp[tid] += v0 * y[iybs + iqs + 0]; + tmp[tid] += v1 * y[iybs + iqs + y_offset]; + } + + // sum up partial sums and write back result + barrier(CLK_LOCAL_MEM_FENCE); + for (int s=block_size/2; s>0; s>>=1) { + if (tid < s) { + tmp[tid] += tmp[tid + s]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if (tid == 0) { + dst[row] = tmp[0]; + } +} + +void dequantize_q5_1(__global const struct block_q5_1* x, const int ib, const int iqs, float* v0, float* v1) { + const float d = vload_half(0, (__global half*) &x[ib].d); + const float m = vload_half(0, (__global half*) &x[ib].m); + + uint32_t qh = x[ib].qh; + + const uint8_t xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10; + const uint8_t xh_1 = ((qh >> (iqs + 12)) ) & 0x10; + + const int32_t x0 = ((x[ib].qs[iqs] & 0xf) | xh_0); + const int32_t x1 = ((x[ib].qs[iqs] >> 4) | xh_1); + + *v0 = x0*d + m; + *v1 = x1*d + m; +} +__kernel void dequantize_mul_mat_vec_q5_1(__global struct block_q5_1* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) { + const int block_size = get_local_size(0); + const int row = get_global_id(0) / block_size; + const int tid = get_local_id(0); + + const uint qk = QK5_1; + const uint qr = QR5_1; + + const int y_offset = qr == 1 ? 1 : qk/2; + + tmp[tid] = 0; + + for (int i = 0; i < ncols/block_size; i += 2) { + const int col = i*block_size + 2*tid; + const int ib = (row*ncols + col)/qk; // block index + const int iqs = (col%qk)/qr; // quant index + const int iybs = col - col%qk; // y block start index + + // dequantize + float v0, v1; + dequantize_q5_1(x, ib, iqs, &v0, &v1); + + // matrix multiplication + tmp[tid] += v0 * y[iybs + iqs + 0]; + tmp[tid] += v1 * y[iybs + iqs + y_offset]; + } + + // sum up partial sums and write back result + barrier(CLK_LOCAL_MEM_FENCE); + for (int s=block_size/2; s>0; s>>=1) { + if (tid < s) { + tmp[tid] += tmp[tid + s]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if (tid == 0) { + dst[row] = tmp[0]; + } +} + +void dequantize_q8_0(__global const struct block_q8_0* x, const int ib, const int iqs, float* v0, float* v1) { + const float d = x[ib].d; + + const int8_t vi0 = x[ib].qs[iqs + 0]; + const int8_t vi1 = x[ib].qs[iqs + 1]; + + *v0 = vi0*d; + *v1 = vi1*d; +} +__kernel void dequantize_mul_mat_vec_q8_0(__global struct block_q8_0* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) { + const int block_size = get_local_size(0); + const int row = get_global_id(0) / block_size; + const int tid = get_local_id(0); + + const uint qk = QK8_0; + const uint qr = QR8_0; + + const int y_offset = qr == 1 ? 
1 : qk/2; + + tmp[tid] = 0; + + for (int i = 0; i < ncols/block_size; i += 2) { + const int col = i*block_size + 2*tid; + const int ib = (row*ncols + col)/qk; // block index + const int iqs = (col%qk)/qr; // quant index + const int iybs = col - col%qk; // y block start index + + // dequantize + float v0, v1; + dequantize_q8_0(x, ib, iqs, &v0, &v1); + + // matrix multiplication + tmp[tid] += v0 * y[iybs + iqs + 0]; + tmp[tid] += v1 * y[iybs + iqs + y_offset]; + } + + // sum up partial sums and write back result + barrier(CLK_LOCAL_MEM_FENCE); + for (int s=block_size/2; s>0; s>>=1) { + if (tid < s) { + tmp[tid] += tmp[tid + s]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if (tid == 0) { + dst[row] = tmp[0]; + } +} + +void convert_f16(__global half* x, const int ib, const int iqs, float* v0, float* v1){ + *v0 = vload_half(0, (__global half*) &x[ib + 0]); + *v1 = vload_half(0, (__global half*) &x[ib + 1]); +} +__kernel void convert_mul_mat_vec_f16(__global half* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) { + const int block_size = get_local_size(0); + const int row = get_global_id(0) / block_size; + const int tid = get_local_id(0); + + const uint qk = 32; + const uint qr = 1; + + const int y_offset = qr == 1 ? 1 : qk/2; + + tmp[tid] = 0; + + for (int i = 0; i < ncols/block_size; i += 2) { + const int col = i*block_size + 2*tid; + const int ib = (row*ncols + col)/qk; // block index + const int iqs = (col%qk)/qr; // quant index + const int iybs = col - col%qk; // y block start index + + // convert + float v0, v1; + convert_f16(x, ib, iqs, &v0, &v1); + + // matrix multiplication + tmp[tid] += v0 * y[iybs + iqs + 0]; + tmp[tid] += v1 * y[iybs + iqs + y_offset]; + } + + // sum up partial sums and write back result + barrier(CLK_LOCAL_MEM_FENCE); + for (int s=block_size/2; s>0; s>>=1) { + if (tid < s) { + tmp[tid] += tmp[tid + s]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if (tid == 0) { + dst[row] = tmp[0]; + } +} +); + +#define CL_CHECK(err, name) \ + do { \ + cl_int err_ = (err); \ + if (err_ != CL_SUCCESS) { \ + fprintf(stderr, "OpenCL %s error %d at %s:%d\n", name, err_, __FILE__, __LINE__); \ + exit(1); \ + } \ + } while (0) + +static cl_platform_id platform; +static cl_device_id device; +static cl_context context; +static cl_command_queue queue; +static cl_program program; +static cl_kernel convert_fp16_to_fp32_cl; +static cl_kernel dequantize_row_q4_0_cl, dequantize_row_q4_1_cl, dequantize_row_q5_0_cl, dequantize_row_q5_1_cl, dequantize_row_q8_0_cl; +static cl_kernel dequantize_mul_mat_vec_q4_0_cl, dequantize_mul_mat_vec_q4_1_cl, dequantize_mul_mat_vec_q5_0_cl, dequantize_mul_mat_vec_q5_1_cl, dequantize_mul_mat_vec_q8_0_cl, convert_mul_mat_vec_f16_cl; +static bool fp16_support; + +static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer) { + cl_program p; + char *program_log; + size_t program_size, log_size; + int err; + + program_size = strlen(program_buffer); + + p = clCreateProgramWithSource(ctx, 1, (const char**)&program_buffer, &program_size, &err); + if(err < 0) { + fprintf(stderr, "OpenCL error creating program"); + exit(1); + } + + err = clBuildProgram(p, 0, NULL, NULL, NULL, NULL); + if(err < 0) { + + clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); + program_log = (char*) malloc(log_size + 1); + program_log[log_size] = '\0'; + clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, log_size + 1, program_log, NULL); + printf("%s\n", program_log); + free(program_log); + exit(1); + } 
+ + return p; +} + +void ggml_cl_init(void) { + cl_int err = 0; + char * GGML_CLBLAST_PLATFORM = getenv("GGML_CLBLAST_PLATFORM"); + char * GGML_CLBLAST_DEVICE = getenv("GGML_CLBLAST_DEVICE"); + int plat_num = (GGML_CLBLAST_PLATFORM == NULL ? 0 : atoi(GGML_CLBLAST_PLATFORM)); + int dev_num = (GGML_CLBLAST_DEVICE == NULL ? 0 : atoi(GGML_CLBLAST_DEVICE)); + printf("\nInitializing CLBlast (First Run)..."); + printf("\nAttempting to use: Platform=%d, Device=%d (If invalid, program will crash)\n",plat_num,dev_num); + cl_uint num_platforms; + clGetPlatformIDs(0, NULL, &num_platforms); + cl_platform_id* platforms = (cl_platform_id*)malloc(num_platforms*sizeof(cl_platform_id)); + clGetPlatformIDs(num_platforms, platforms, NULL); + platform = platforms[plat_num]; + char platform_buffer[1024]; + clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_buffer), &platform_buffer, NULL); + cl_uint num_devices; + clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices); + cl_device_id* devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id)); + clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL); + device = devices[dev_num]; + char device_buffer[1024]; + clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_buffer), &device_buffer, NULL); + size_t ext_str_size; + clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 0, NULL, &ext_str_size); + char* ext_buffer = (char*) malloc(sizeof(char) * ext_str_size); + clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, ext_str_size, ext_buffer, NULL); + // Check if ext_buffer contains cl_khr_fp16 + for (size_t i = 0; i < ext_str_size - 12; i++) { + if (memcmp(ext_buffer + i, "cl_khr_fp16", 11) == 0) { + fp16_support = true; + break; + } + } + free(ext_buffer); + printf("Using Platform: %s Device: %s FP16: %d\n", platform_buffer, device_buffer, fp16_support); + context = clCreateContext(NULL, 1, &device, NULL, NULL, &err); + CL_CHECK(err, "clCreateContext"); + queue = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err); + CL_CHECK(err, "clCreateCommandQueue"); + + free(platforms); + free(devices); + + program = build_program_from_source(context, device, clblast_dequant); + + // FP16 to FP32 kernel + convert_fp16_to_fp32_cl = clCreateKernel(program, "convert_fp16_to_fp32", &err); + CL_CHECK(err, "clCreateKernel"); + + // Dequantize kernels + dequantize_row_q4_0_cl = clCreateKernel(program, "dequantize_row_q4_0", &err); + CL_CHECK(err, "clCreateKernel"); + dequantize_row_q4_1_cl = clCreateKernel(program, "dequantize_row_q4_1", &err); + CL_CHECK(err, "clCreateKernel"); + dequantize_row_q5_0_cl = clCreateKernel(program, "dequantize_row_q5_0", &err); + CL_CHECK(err, "clCreateKernel"); + dequantize_row_q5_1_cl = clCreateKernel(program, "dequantize_row_q5_1", &err); + CL_CHECK(err, "clCreateKernel"); + dequantize_row_q8_0_cl = clCreateKernel(program, "dequantize_row_q8_0", &err); + CL_CHECK(err, "clCreateKernel"); + + // dequant mul mat kernel + dequantize_mul_mat_vec_q4_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q4_0", &err); + CL_CHECK(err, "clCreateKernel"); + dequantize_mul_mat_vec_q4_1_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q4_1", &err); + CL_CHECK(err, "clCreateKernel"); + dequantize_mul_mat_vec_q5_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q5_0", &err); + CL_CHECK(err, "clCreateKernel"); + dequantize_mul_mat_vec_q5_1_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q5_1", &err); + CL_CHECK(err, "clCreateKernel"); + dequantize_mul_mat_vec_q8_0_cl = 
clCreateKernel(program, "dequantize_mul_mat_vec_q8_0", &err); + CL_CHECK(err, "clCreateKernel"); + convert_mul_mat_vec_f16_cl = clCreateKernel(program, "convert_mul_mat_vec_f16", &err); + CL_CHECK(err, "clCreateKernel"); +} + +static cl_kernel* ggml_get_to_fp32_cl(ggml_type type) { + switch (type) { + case GGML_TYPE_Q4_0: + return &dequantize_row_q4_0_cl; + case GGML_TYPE_Q4_1: + return &dequantize_row_q4_1_cl; + case GGML_TYPE_Q5_0: + return &dequantize_row_q5_0_cl; + case GGML_TYPE_Q5_1: + return &dequantize_row_q5_1_cl; + case GGML_TYPE_Q8_0: + return &dequantize_row_q8_0_cl; + case GGML_TYPE_F16: + return &convert_fp16_to_fp32_cl; + default: + return nullptr; + } +} + +static cl_kernel* ggml_get_dequantize_mul_mat_vec_cl(ggml_type type) { + switch (type) { + case GGML_TYPE_Q4_0: + return &dequantize_mul_mat_vec_q4_0_cl; + case GGML_TYPE_Q4_1: + return &dequantize_mul_mat_vec_q4_1_cl; + case GGML_TYPE_Q5_0: + return &dequantize_mul_mat_vec_q5_0_cl; + case GGML_TYPE_Q5_1: + return &dequantize_mul_mat_vec_q5_1_cl; + case GGML_TYPE_Q8_0: + return &dequantize_mul_mat_vec_q8_0_cl; + case GGML_TYPE_F16: + return &convert_mul_mat_vec_f16_cl; + default: + return nullptr; + } +} + +// buffer pool for cl +#define MAX_CL_BUFFERS 256 + +struct scoped_spin_lock { + std::atomic_flag& lock; + scoped_spin_lock(std::atomic_flag& lock) : lock(lock) { + while (lock.test_and_set(std::memory_order_acquire)) { + ; // spin + } + } + ~scoped_spin_lock() { + lock.clear(std::memory_order_release); + } + scoped_spin_lock(const scoped_spin_lock&) = delete; + scoped_spin_lock& operator=(const scoped_spin_lock&) = delete; +}; + +struct cl_buffer { + cl_mem mem; + size_t size = 0; +}; + +static cl_buffer g_cl_buffer_pool[MAX_CL_BUFFERS]; +static std::atomic_flag g_cl_pool_lock = ATOMIC_FLAG_INIT; + +static cl_mem ggml_cl_pool_malloc(size_t size, size_t * actual_size, cl_mem_flags flags) { + scoped_spin_lock lock(g_cl_pool_lock); + cl_int err; + + for (int i = 0; i < MAX_CL_BUFFERS; ++i) { + cl_buffer& b = g_cl_buffer_pool[i]; + if (b.size > 0 && b.size >= size) { + cl_mem mem = b.mem; + *actual_size = b.size; + b.size = 0; + return mem; + } + } + cl_mem mem = clCreateBuffer(context, flags, size, NULL, &err); + CL_CHECK(err, "clCreateBuffer"); + *actual_size = size; + return mem; +} + +static void ggml_cl_pool_free(cl_mem mem, size_t size) { + scoped_spin_lock lock(g_cl_pool_lock); + + for (int i = 0; i < MAX_CL_BUFFERS; ++i) { + cl_buffer& b = g_cl_buffer_pool[i]; + if (b.size == 0) { + b.mem = mem; + b.size = size; + return; + } + } + fprintf(stderr, "WARNING: cl buffer pool full, increase MAX_CL_BUFFERS\n"); + clReleaseMemObject(mem); +} + +static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t offset, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, cl_event* ev) { + cl_int err; + const uint64_t ne0 = src->ne[0]; + const uint64_t ne1 = src->ne[1]; + const uint64_t nb0 = src->nb[0]; + const uint64_t nb1 = src->nb[1]; + const uint64_t nb2 = src->nb[2]; + const uint64_t nb3 = src->nb[3]; + const enum ggml_type type = src->type; + const size_t ts = ggml_type_size(type); + const size_t bs = ggml_blck_size(type); + + const void * x = (const void *) ((const char *) src->data + i2*nb2 + i3*nb3); + if (nb0 == ts && nb1 == ts*ne0/bs) { + err = clEnqueueWriteBuffer(queue, dst, CL_FALSE, offset, ne1*nb1, x, 0, NULL, ev); + return err; + } + if (nb0 == ts) { + const size_t buffer_origin[3] = { offset, 0, 0 }; + const size_t host_origin[3] = { 0, 0, 0 }; + const size_t region[3] = { ts*ne0/bs, 
ne1, 1 }; + err = clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, ts*ne0/bs, 0, nb1, 0, x, 0, NULL, ev); + return err; + } + for (uint64_t i1 = 0; i1 < ne1; i1++) { + // pretend the row is a matrix with cols=1 + const size_t buffer_origin[3] = { offset, i1, 0 }; + const size_t host_origin[3] = { 0, 0, 0 }; + const size_t region[3] = { ts/bs, ne0, 1 }; + err = clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, 0, 0, nb0, 0, ((const char *)x) + i1*nb0, 0, NULL, ev); + if (err != CL_SUCCESS) { + break; + } + } + return err; +} + +static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + const int64_t ne03 = src0->ne[3]; + + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + + const int nb2 = dst->nb[2]; + const int nb3 = dst->nb[3]; + + const float alpha = 1.0f; + const float beta = 0.0f; + const int x_ne = ne01 * ne00; + const int y_ne = ne11 * ne10; + const int d_ne = ne11 * ne01; + + size_t x_size, y_size, d_size; + cl_mem d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size, CL_MEM_READ_ONLY); + cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size, CL_MEM_READ_ONLY); + cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size, CL_MEM_WRITE_ONLY); + + cl_int err; + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + // copy data to device + err = ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL); + err |= ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL); + CL_CHECK(err, "ggml_cl_h2d_tensor_2d"); + + CL_CHECK(clFinish(queue), "clFinish"); + + // compute + cl_event ev_sgemm; + + clblast::StatusCode status = (clblast::StatusCode)CLBlastSgemm((CLBlastLayout)clblast::Layout::kColMajor, + (CLBlastTranspose)clblast::Transpose::kYes, (CLBlastTranspose)clblast::Transpose::kNo, + ne01, ne11, ne10, + alpha, + d_X, 0, ne00, + d_Y, 0, ne10, + beta, + d_D, 0, ne01, + &queue, &ev_sgemm); + + if (status != clblast::StatusCode::kSuccess) { + GGML_ASSERT(false); + } + + // copy dst to host + float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); + err = clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL); + CL_CHECK(err, "clEnqueueReadBuffer"); + } + } + + ggml_cl_pool_free(d_X, x_size); + ggml_cl_pool_free(d_Y, y_size); + ggml_cl_pool_free(d_D, d_size); +} + +static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t /* wsize */) { + GGML_ASSERT(fp16_support); + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + const int64_t ne03 = src0->ne[3]; + + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + + const int nb10 = src1->nb[0]; + const int nb11 = src1->nb[1]; + const int nb12 = src1->nb[2]; + const int nb13 = src1->nb[3]; + + const int nb2 = dst->nb[2]; + const int nb3 = dst->nb[3]; + + const ggml_fp16_t alpha = ggml_fp32_to_fp16(1.0f); + const ggml_fp16_t beta = ggml_fp32_to_fp16(0.0f); + const int x_ne = ne01 * ne00; + const int y_ne = ne11 * ne10; + const int d_ne = ne11 * ne01; + + size_t x_size, y_size, d_size; + cl_mem d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size, CL_MEM_READ_ONLY); + cl_mem d_Y = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * y_ne, &y_size, CL_MEM_READ_ONLY); + cl_mem 
d_D = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * d_ne, &d_size, CL_MEM_WRITE_ONLY); + + cl_int err; + + bool src1_cont_rows = nb10 == sizeof(float); + bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float); + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + // copy src0 to device + err = ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL); + CL_CHECK(err, "ggml_cl_h2d_tensor_2d"); + + // convert src1 to fp16 + // TODO: use multiple threads + ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i03 * ne02 + i02); + char * src1i = (char *) src1->data + i03*nb13 + i02*nb12; + if (src1_cont_rows) { + if (src1_cont_cols) { + ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11); + } + else { + for (int64_t i01 = 0; i01 < ne11; i01++) { + ggml_fp32_to_fp16_row((float *) (src1i + i01*nb11), tmp + i01*ne10, ne10); + } + } + } + else { + for (int64_t i01 = 0; i01 < ne11; i01++) { + for (int64_t i00 = 0; i00 < ne10; i00++) { + // very slow due to no inlining + tmp[i01*ne10 + i00] = ggml_fp32_to_fp16(*(float *) (src1i + i01*nb11 + i00*nb10)); + } + } + } + + // copy src1 to device + err |= clEnqueueWriteBuffer(queue, d_Y, false, 0, sizeof(ggml_fp16_t) * y_ne, tmp, 0, NULL, NULL); + CL_CHECK(err, "ggml_cl_h2d_tensor_2d"); + + CL_CHECK(clFinish(queue), "clFinish"); + + // compute + cl_event ev_sgemm; + clblast::StatusCode status = (clblast::StatusCode)CLBlastSgemm((CLBlastLayout)clblast::Layout::kColMajor, + (CLBlastTranspose)clblast::Transpose::kYes, (CLBlastTranspose)clblast::Transpose::kNo, + ne01, ne11, ne10, + alpha, + d_X, 0, ne00, + d_Y, 0, ne10, + beta, + d_D, 0, ne01, + &queue, &ev_sgemm); + + if (status != clblast::StatusCode::kSuccess) { + GGML_ASSERT(false); + } + + // copy dst to host, then convert to float + err = clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL); + + float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); + + ggml_fp16_to_fp32_row(tmp, d, d_ne); + } + } + + ggml_cl_pool_free(d_X, x_size); + ggml_cl_pool_free(d_Y, y_size); + ggml_cl_pool_free(d_D, d_size); +} + +static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + const int64_t ne03 = src0->ne[3]; + + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + + const int nb2 = dst->nb[2]; + const int nb3 = dst->nb[3]; + const ggml_type type = src0->type; + const bool mul_mat_vec = ne11 == 1; + + const float alpha = 1.0f; + const float beta = 0.0f; + const int x_ne = ne01 * ne00; + const int y_ne = ne11 * ne10; + const int d_ne = ne11 * ne01; + const size_t q_sz = ggml_type_size(type) * x_ne / ggml_blck_size(type); + + size_t x_size, y_size, d_size, q_size; + cl_mem d_X; + if (!mul_mat_vec) { + d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size, CL_MEM_READ_WRITE); + } + cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size, CL_MEM_READ_ONLY); + cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size, CL_MEM_WRITE_ONLY); + cl_mem d_Q; + if (src0->backend == GGML_BACKEND_CPU) { + d_Q = ggml_cl_pool_malloc(q_sz, &q_size, CL_MEM_READ_ONLY); + } + + cl_kernel* to_fp32_cl = ggml_get_to_fp32_cl(type); + cl_kernel* dmmv = ggml_get_dequantize_mul_mat_vec_cl(type); + GGML_ASSERT(to_fp32_cl != nullptr); + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + cl_event ev_sgemm; + + // 
copy src0 to device if necessary + if (src0->backend == GGML_BACKEND_CPU) { + CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, NULL), "ggml_cl_h2d_tensor_2d"); + } else if (src0->backend == GGML_BACKEND_CL) { + d_Q = *(cl_mem*) src0->data; + } else { + GGML_ASSERT(false); + } + if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel + // copy src1 to device + CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL), "ggml_cl_h2d_tensor_2d"); + + // compute + // dequantize_mul_mat_vec(__global void * vx, __local float* tmp, __global float * y, __global float * dst, __global int ncols, __global int vx_type) { + const size_t global = ne01 * CL_DMMV_BLOCK_SIZE; + const size_t local = CL_DMMV_BLOCK_SIZE; + const cl_int ncols = ne00; + CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q), "clSetKernelArg"); + CL_CHECK(clSetKernelArg(*dmmv, 1, sizeof(float) * local, NULL), "clSetKernelArg"); + CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y), "clSetKernelArg"); + CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D), "clSetKernelArg"); + CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols), "clSetKernelArg"); + CL_CHECK(clFinish(queue), "clFinish"); + CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, NULL, &global, &local, 0, NULL, &ev_sgemm), "clEnqueueNDRangeKernel"); + } else { // general dequantization kernel + CLBlast matrix matrix multiplication + // convert src0 to fp32 on device + const size_t global = x_ne; + const size_t local = ggml_blck_size(type) / 2; + CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q), "clSetKernelArg"); + CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X), "clSetKernelArg"); + CL_CHECK(clFinish(queue), "clFinish"); + CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, NULL, &global, &local, 0, NULL, NULL), "clEnqueueNDRangeKernel"); + + // copy src1 to device + CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL), "ggml_cl_h2d_tensor_2d"); + + // wait for conversion + CL_CHECK(clFinish(queue), "clFinish"); + + // compute + clblast::StatusCode status = (clblast::StatusCode)CLBlastSgemm((CLBlastLayout)clblast::Layout::kColMajor, + (CLBlastTranspose)clblast::Transpose::kYes, (CLBlastTranspose)clblast::Transpose::kNo, + ne01, ne11, ne10, + alpha, + d_X, 0, ne00, + d_Y, 0, ne10, + beta, + d_D, 0, ne01, + &queue, &ev_sgemm); + + if (status != clblast::StatusCode::kSuccess) { + GGML_ASSERT(false); + } + } + + // copy dst to host + float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); + CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL), "clEnqueueReadBuffer"); + clReleaseEvent(ev_sgemm); + } + } + + if (!mul_mat_vec) { + ggml_cl_pool_free(d_X, x_size); + } + ggml_cl_pool_free(d_Y, y_size); + ggml_cl_pool_free(d_D, d_size); + if (src0->backend == GGML_BACKEND_CPU) { + ggml_cl_pool_free(d_Q, q_size); + } +} + + +bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { + const int64_t ne10 = src1->ne[0]; + + const int64_t ne0 = dst->ne[0]; + const int64_t ne1 = dst->ne[1]; + + // TODO: find the optimal values for these + if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && + src1->type == GGML_TYPE_F32 && + dst->type == GGML_TYPE_F32 && + ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_CL)) { + return true; + } + + return false; +} + +bool ggml_cl_mul_mat_use_f16(const struct ggml_tensor * src0, const 
struct ggml_tensor * src1, struct ggml_tensor * /* dst */) { + // If device doesn't support FP16 + if (!fp16_support) { + return false; + } + + size_t src0_sz = ggml_nbytes(src0); + size_t src1_sz = ggml_nbytes(src1); + + // mul_mat_q: src0 is converted to fp32 on device + size_t mul_mat_q_transfer = src0_sz + src1_sz; + + // mul_mat_f16: src1 is converted to fp16 on cpu + size_t mul_mat_f16_transfer = src0_sz + sizeof(ggml_fp16_t) * ggml_nelements(src1); + + // choose the smaller one to transfer to the device + // TODO: this is not always the best choice due to the overhead of converting to fp16 + return mul_mat_f16_transfer < mul_mat_q_transfer; +} + +void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize) { + GGML_ASSERT(ggml_cl_can_mul_mat(src0, src1, dst)); + + if (src0->type == GGML_TYPE_F32) { + ggml_cl_mul_mat_f32(src0, src1, dst); + } + else if (src0->type == GGML_TYPE_F16) { + if (ggml_cl_mul_mat_use_f16(src0, src1, dst)) { + ggml_cl_mul_mat_f16(src0, src1, dst, wdata, wsize); + } + else { + ggml_cl_mul_mat_q_f32(src0, src1, dst); + } + } + else if (ggml_is_quantized(src0->type)) { + ggml_cl_mul_mat_q_f32(src0, src1, dst); + } + else { + GGML_ASSERT(false); + } +} + +size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { + if (ggml_cl_mul_mat_use_f16(src0, src1, dst)) { + return ggml_nelements(src1) * sizeof(ggml_fp16_t); + } + return 0; +} + +void ggml_cl_transform_tensor(ggml_tensor * tensor) { + const int64_t ne0 = tensor->ne[0]; + const int64_t ne1 = tensor->ne[1]; + const int64_t ne2 = tensor->ne[2]; + const int64_t ne3 = tensor->ne[3]; + + const ggml_type type = tensor->type; + const size_t q_sz = ggml_type_size(type) * ne0 * ne1 * ne2 * ne3 / ggml_blck_size(type); + + size_t q_size; + cl_mem* d_Q = (cl_mem*) malloc(sizeof(cl_mem)); + *d_Q = ggml_cl_pool_malloc(q_sz, &q_size, CL_MEM_READ_ONLY); + + // copy tensor to device + CL_CHECK(ggml_cl_h2d_tensor_2d(queue, *d_Q, 0, tensor, 0, 0, NULL), "ggml_cl_h2d_tensor_2d"); + CL_CHECK(clFinish(queue), "clFinish"); + + tensor->data = d_Q; + tensor->backend = GGML_BACKEND_CL; +} diff --git a/ggml-opencl.h b/ggml-opencl.h index 7bcc603ef..5a1a50093 100644 --- a/ggml-opencl.h +++ b/ggml-opencl.h @@ -1,23 +1,21 @@ #pragma once +#include "ggml.h" + #ifdef __cplusplus extern "C" { #endif void ggml_cl_init(void); -enum ggml_blas_order { - GGML_BLAS_ORDER_ROW_MAJOR = 101, - GGML_BLAS_ORDER_COLUMN_MAJOR = 102, -}; +bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); +size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); +void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize); -enum ggml_blas_op { - GGML_BLAS_OP_N = 111, - GGML_BLAS_OP_T = 112, - GGML_BLAS_OP_C = 113, -}; +void * ggml_cl_host_malloc(size_t size); +void ggml_cl_host_free(void * ptr); -void ggml_cl_sgemm_wrapper(const enum ggml_blas_order order, const enum ggml_blas_op trans_a, const enum ggml_blas_op trans_b, const int m, const int n, const int k, const float alpha, const void *host_a, const int lda, const float *host_b, const int ldb, const float beta, float *host_c, const int ldc, const int btype); +void ggml_cl_transform_tensor(struct ggml_tensor * tensor); #ifdef __cplusplus } diff --git a/ggml.c 
b/ggml.c index f99a250f1..fbf75d706 100644 --- a/ggml.c +++ b/ggml.c @@ -9327,7 +9327,7 @@ static void ggml_compute_forward_rms_norm_back( // ggml_compute_forward_mul_mat -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) // helper function to determine if it is better to use BLAS or not // for large matrices, BLAS is faster static bool ggml_compute_forward_mul_mat_use_blas( @@ -9368,7 +9368,7 @@ static void ggml_compute_forward_mul_mat_f32( const int64_t ne02 = src0->ne[2]; const int64_t ne03 = src0->ne[3]; -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) const int64_t ne10 = src1->ne[0]; #endif const int64_t ne11 = src1->ne[1]; @@ -9432,9 +9432,16 @@ static void ggml_compute_forward_mul_mat_f32( } return; } +#elif defined(GGML_USE_CLBLAST) + if (ggml_cl_can_mul_mat(src0, src1, dst)) { + if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) { + ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize); + } + return; + } #endif -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { if (params->ith != 0) { return; @@ -9454,33 +9461,11 @@ static void ggml_compute_forward_mul_mat_f32( const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); -#if defined(GGML_USE_CLBLAST) - // zT = y * xT - if(quants_unshuffled) - { - ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T, - ne11, ne01, ne10, - 1.0f, y, ne10, - x, ne10, - 0.0f, d, ne01, - GGML_TYPE_F32); - } - else - { - ggml_cl_sgemm_wrapper_legacy(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T, - ne11, ne01, ne10, - 1.0f, y, ne10, - x, ne10, - 0.0f, d, ne01, - GGML_TYPE_F32); - } -#else cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, ne11, ne01, ne10, 1.0f, y, ne10, x, ne00, 0.0f, d, ne01); -#endif } } //printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3); @@ -9619,9 +9604,16 @@ static void ggml_compute_forward_mul_mat_f16_f32( } return; } +#elif defined(GGML_USE_CLBLAST) + if (ggml_cl_can_mul_mat(src0, src1, dst)) { + if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) { + ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize); + } + return; + } #endif -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { GGML_ASSERT(nb10 == sizeof(float)); @@ -9651,32 +9643,6 @@ static void ggml_compute_forward_mul_mat_f16_f32( assert(id*sizeof(float) <= params->wsize); } -#if defined(GGML_USE_CLBLAST) - const float * x = wdata; - const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); - - float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); - - // zT = y * xT - if(quants_unshuffled) - { - ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T, - ne11, ne01, ne10, - 1.0f, y, ne10, - x, ne10, - 0.0f, d, ne01, - GGML_TYPE_F32); - } - else - { - ggml_cl_sgemm_wrapper_legacy(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T, - ne11, ne01, ne10, - 1.0f, y, ne10, - x, ne10, - 
0.0f, d, ne01, - GGML_TYPE_F32); - } -#else const float * x = wdata; const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); @@ -9688,7 +9654,6 @@ static void ggml_compute_forward_mul_mat_f16_f32( 1.0f, y, ne10, x, ne00, 0.0f, d, ne01); -#endif } } @@ -9851,9 +9816,16 @@ static void ggml_compute_forward_mul_mat_q_f32( } return; } +#elif defined(GGML_USE_CLBLAST) + if (ggml_cl_can_mul_mat(src0, src1, dst)) { + if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) { + ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize); + } + return; + } #endif -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { if (params->ith != 0) { return; @@ -9876,9 +9848,6 @@ static void ggml_compute_forward_mul_mat_q_f32( float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); -#if defined(GGML_USE_CLBLAST) - const void* x = (char *) src0->data + i03*nb03 + i02*nb02; -#else { size_t id = 0; for (int64_t i01 = 0; i01 < ne01; ++i01) { @@ -9890,35 +9859,12 @@ static void ggml_compute_forward_mul_mat_q_f32( } const float * x = wdata; -#endif -#if defined(GGML_USE_CLBLAST) - // zT = y * xT - if(quants_unshuffled) - { - ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T, - ne11, ne01, ne10, - 1.0f, y, ne10, - x, ne10, - 0.0f, d, ne01, - type); - } - else - { - ggml_cl_sgemm_wrapper_legacy(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T, - ne11, ne01, ne10, - 1.0f, y, ne10, - x, ne10, - 0.0f, d, ne01, - type); - } -#else cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, ne11, ne01, ne10, 1.0f, y, ne10, x, ne00, 0.0f, d, ne01); -#endif } } @@ -14024,9 +13970,16 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) cur = ggml_cuda_mul_mat_get_wsize(node->src0, node->src1, node); } else +#elif defined(GGML_USE_CLBLAST) + if (ggml_cl_can_mul_mat(node->src0, node->src1, node)) { + node->n_tasks = 1; // TODO: this actually is doing nothing + // the threads are still spinning + cur = ggml_cl_mul_mat_get_wsize(node->src0, node->src1, node); + } + else #endif if (node->src0->type == GGML_TYPE_F16 && node->src1->type == GGML_TYPE_F32) { -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { node->n_tasks = 1; // TODO: this actually is doing nothing // the threads are still spinning @@ -14040,13 +13993,13 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) #endif } else if (node->src0->type == GGML_TYPE_F32 && node->src1->type == GGML_TYPE_F32) { cur = 0; -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { node->n_tasks = 1; } #endif } else if (ggml_is_quantized(node->src0->type) && node->src1->type == GGML_TYPE_F32) { -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { node->n_tasks = 1; cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); diff --git a/ggml.h b/ggml.h index 
e6bbe803e..f8400e39c 100644
--- a/ggml.h
+++ b/ggml.h
@@ -250,6 +250,7 @@ extern "C" {
     enum ggml_backend {
         GGML_BACKEND_CPU = 0,
         GGML_BACKEND_CUDA = 1,
+        GGML_BACKEND_CL = 2,
     };
 
     // model file types
diff --git a/include/clblast.h b/include/clblast.h
new file mode 100644
index 000000000..99364ec61
--- /dev/null
+++ b/include/clblast.h
@@ -0,0 +1,792 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains the interface to the CLBlast BLAS routines. It also contains the definitions
+// of the returned status codes and the layout and transpose types. This is the only header users
+// of CLBlast should include and use.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_CLBLAST_H_
+#define CLBLAST_CLBLAST_H_
+
+#include <cstdlib> // For size_t
+#include <string> // For OverrideParameters function
+#include <vector> // For OverrideParameters function
+
+// Includes the normal OpenCL C header
+#if defined(__APPLE__) || defined(__MACOSX)
+  #include <OpenCL/opencl.h>
+#else
+  #include <CL/opencl.h>
+#endif
+
+// Exports library functions under Windows when building a DLL. See also:
+// https://msdn.microsoft.com/en-us/library/a90k134d.aspx
+#if defined(_WIN32) && defined(CLBLAST_DLL)
+  #if defined(COMPILING_DLL)
+    #define PUBLIC_API __declspec(dllexport)
+  #else
+    #define PUBLIC_API __declspec(dllimport)
+  #endif
+#else
+  #define PUBLIC_API
+#endif
+
+// Version numbering (v1.5.3)
+#define CLBLAST_VERSION_MAJOR 1
+#define CLBLAST_VERSION_MINOR 5
+#define CLBLAST_VERSION_PATCH 3
+
+namespace clblast {
+// =================================================================================================
+
+// Status codes. These codes can be returned by functions declared in this header file. The error
+// codes match either the standard OpenCL error codes or the clBLAS error codes.
+enum class StatusCode { + + // Status codes in common with the OpenCL standard + kSuccess = 0, // CL_SUCCESS + kOpenCLCompilerNotAvailable= -3, // CL_COMPILER_NOT_AVAILABLE + kTempBufferAllocFailure = -4, // CL_MEM_OBJECT_ALLOCATION_FAILURE + kOpenCLOutOfResources = -5, // CL_OUT_OF_RESOURCES + kOpenCLOutOfHostMemory = -6, // CL_OUT_OF_HOST_MEMORY + kOpenCLBuildProgramFailure = -11, // CL_BUILD_PROGRAM_FAILURE: OpenCL compilation error + kInvalidValue = -30, // CL_INVALID_VALUE + kInvalidCommandQueue = -36, // CL_INVALID_COMMAND_QUEUE + kInvalidMemObject = -38, // CL_INVALID_MEM_OBJECT + kInvalidBinary = -42, // CL_INVALID_BINARY + kInvalidBuildOptions = -43, // CL_INVALID_BUILD_OPTIONS + kInvalidProgram = -44, // CL_INVALID_PROGRAM + kInvalidProgramExecutable = -45, // CL_INVALID_PROGRAM_EXECUTABLE + kInvalidKernelName = -46, // CL_INVALID_KERNEL_NAME + kInvalidKernelDefinition = -47, // CL_INVALID_KERNEL_DEFINITION + kInvalidKernel = -48, // CL_INVALID_KERNEL + kInvalidArgIndex = -49, // CL_INVALID_ARG_INDEX + kInvalidArgValue = -50, // CL_INVALID_ARG_VALUE + kInvalidArgSize = -51, // CL_INVALID_ARG_SIZE + kInvalidKernelArgs = -52, // CL_INVALID_KERNEL_ARGS + kInvalidLocalNumDimensions = -53, // CL_INVALID_WORK_DIMENSION: Too many thread dimensions + kInvalidLocalThreadsTotal = -54, // CL_INVALID_WORK_GROUP_SIZE: Too many threads in total + kInvalidLocalThreadsDim = -55, // CL_INVALID_WORK_ITEM_SIZE: ... or for a specific dimension + kInvalidGlobalOffset = -56, // CL_INVALID_GLOBAL_OFFSET + kInvalidEventWaitList = -57, // CL_INVALID_EVENT_WAIT_LIST + kInvalidEvent = -58, // CL_INVALID_EVENT + kInvalidOperation = -59, // CL_INVALID_OPERATION + kInvalidBufferSize = -61, // CL_INVALID_BUFFER_SIZE + kInvalidGlobalWorkSize = -63, // CL_INVALID_GLOBAL_WORK_SIZE + + // Status codes in common with the clBLAS library + kNotImplemented = -1024, // Routine or functionality not implemented yet + kInvalidMatrixA = -1022, // Matrix A is not a valid OpenCL buffer + kInvalidMatrixB = -1021, // Matrix B is not a valid OpenCL buffer + kInvalidMatrixC = -1020, // Matrix C is not a valid OpenCL buffer + kInvalidVectorX = -1019, // Vector X is not a valid OpenCL buffer + kInvalidVectorY = -1018, // Vector Y is not a valid OpenCL buffer + kInvalidDimension = -1017, // Dimensions M, N, and K have to be larger than zero + kInvalidLeadDimA = -1016, // LD of A is smaller than the matrix's first dimension + kInvalidLeadDimB = -1015, // LD of B is smaller than the matrix's first dimension + kInvalidLeadDimC = -1014, // LD of C is smaller than the matrix's first dimension + kInvalidIncrementX = -1013, // Increment of vector X cannot be zero + kInvalidIncrementY = -1012, // Increment of vector Y cannot be zero + kInsufficientMemoryA = -1011, // Matrix A's OpenCL buffer is too small + kInsufficientMemoryB = -1010, // Matrix B's OpenCL buffer is too small + kInsufficientMemoryC = -1009, // Matrix C's OpenCL buffer is too small + kInsufficientMemoryX = -1008, // Vector X's OpenCL buffer is too small + kInsufficientMemoryY = -1007, // Vector Y's OpenCL buffer is too small + + // Custom additional status codes for CLBlast + kInsufficientMemoryTemp = -2050, // Temporary buffer provided to GEMM routine is too small + kInvalidBatchCount = -2049, // The batch count needs to be positive + kInvalidOverrideKernel = -2048, // Trying to override parameters for an invalid kernel + kMissingOverrideParameter = -2047, // Missing override parameter(s) for the target kernel + kInvalidLocalMemUsage = -2046, // Not enough local memory 
available on this device + kNoHalfPrecision = -2045, // Half precision (16-bits) not supported by the device + kNoDoublePrecision = -2044, // Double precision (64-bits) not supported by the device + kInvalidVectorScalar = -2043, // The unit-sized vector is not a valid OpenCL buffer + kInsufficientMemoryScalar = -2042, // The unit-sized vector's OpenCL buffer is too small + kDatabaseError = -2041, // Entry for the device was not found in the database + kUnknownError = -2040, // A catch-all error code representing an unspecified error + kUnexpectedError = -2039, // A catch-all error code representing an unexpected exception +}; + +// Matrix layout and transpose types +enum class Layout { kRowMajor = 101, kColMajor = 102 }; +enum class Transpose { kNo = 111, kYes = 112, kConjugate = 113 }; +enum class Triangle { kUpper = 121, kLower = 122 }; +enum class Diagonal { kNonUnit = 131, kUnit = 132 }; +enum class Side { kLeft = 141, kRight = 142 }; +enum class KernelMode { kCrossCorrelation = 151, kConvolution = 152 }; + +// Precision scoped enum (values in bits) +enum class Precision { kHalf = 16, kSingle = 32, kDouble = 64, + kComplexSingle = 3232, kComplexDouble = 6464, kAny = -1 }; + +// ================================================================================================= +// BLAS level-1 (vector-vector) routines +// ================================================================================================= + +// Generate givens plane rotation: SROTG/DROTG +template +StatusCode Rotg(cl_mem sa_buffer, const size_t sa_offset, + cl_mem sb_buffer, const size_t sb_offset, + cl_mem sc_buffer, const size_t sc_offset, + cl_mem ss_buffer, const size_t ss_offset, + cl_command_queue* queue, cl_event* event = nullptr); + +// Generate modified givens plane rotation: SROTMG/DROTMG +template +StatusCode Rotmg(cl_mem sd1_buffer, const size_t sd1_offset, + cl_mem sd2_buffer, const size_t sd2_offset, + cl_mem sx1_buffer, const size_t sx1_offset, + const cl_mem sy1_buffer, const size_t sy1_offset, + cl_mem sparam_buffer, const size_t sparam_offset, + cl_command_queue* queue, cl_event* event = nullptr); + +// Apply givens plane rotation: SROT/DROT +template +StatusCode Rot(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + const T cos, + const T sin, + cl_command_queue* queue, cl_event* event = nullptr); + +// Apply modified givens plane rotation: SROTM/DROTM +template +StatusCode Rotm(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem sparam_buffer, const size_t sparam_offset, + cl_command_queue* queue, cl_event* event = nullptr); + +// Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP/HSWAP +template +StatusCode Swap(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event = nullptr); + +// Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL/HSCAL +template +StatusCode Scal(const size_t n, + const T alpha, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event = nullptr); + +// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY/HCOPY +template +StatusCode Copy(const size_t n, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event = nullptr); + +// 
Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY/HAXPY +template +StatusCode Axpy(const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event = nullptr); + +// Dot product of two vectors: SDOT/DDOT/HDOT +template +StatusCode Dot(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event = nullptr); + +// Dot product of two complex vectors: CDOTU/ZDOTU +template +StatusCode Dotu(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event = nullptr); + +// Dot product of two complex vectors, one conjugated: CDOTC/ZDOTC +template +StatusCode Dotc(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event = nullptr); + +// Euclidian norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2/HNRM2 +template +StatusCode Nrm2(const size_t n, + cl_mem nrm2_buffer, const size_t nrm2_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event = nullptr); + +// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM/HASUM +template +StatusCode Asum(const size_t n, + cl_mem asum_buffer, const size_t asum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event = nullptr); + +// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM/HSUM +template +StatusCode Sum(const size_t n, + cl_mem sum_buffer, const size_t sum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event = nullptr); + +// Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX +template +StatusCode Amax(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event = nullptr); + +// Index of absolute minimum value in a vector (non-BLAS function): iSAMIN/iDAMIN/iCAMIN/iZAMIN/iHAMIN +template +StatusCode Amin(const size_t n, + cl_mem imin_buffer, const size_t imin_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event = nullptr); + +// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX/iHMAX +template +StatusCode Max(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event = nullptr); + +// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN/iHMIN +template +StatusCode Min(const size_t n, + cl_mem imin_buffer, const size_t imin_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event = nullptr); + +// ================================================================================================= +// BLAS level-2 (matrix-vector) 
routines +// ================================================================================================= + +// General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV/HGEMV +template +StatusCode Gemv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event = nullptr); + +// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV/HGBMV +template +StatusCode Gbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event = nullptr); + +// Hermitian matrix-vector multiplication: CHEMV/ZHEMV +template +StatusCode Hemv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event = nullptr); + +// Hermitian banded matrix-vector multiplication: CHBMV/ZHBMV +template +StatusCode Hbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event = nullptr); + +// Hermitian packed matrix-vector multiplication: CHPMV/ZHPMV +template +StatusCode Hpmv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event = nullptr); + +// Symmetric matrix-vector multiplication: SSYMV/DSYMV/HSYMV +template +StatusCode Symv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event = nullptr); + +// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV/HSBMV +template +StatusCode Sbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event = nullptr); + +// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV/HSPMV +template +StatusCode Spmv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem 
x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event = nullptr); + +// Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV/HTRMV +template +StatusCode Trmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event = nullptr); + +// Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV/HTBMV +template +StatusCode Tbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event = nullptr); + +// Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV/HTPMV +template +StatusCode Tpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event = nullptr); + +// Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV +template +StatusCode Trsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event = nullptr); + +// Solves a banded triangular system of equations: STBSV/DTBSV/CTBSV/ZTBSV +template +StatusCode Tbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event = nullptr); + +// Solves a packed triangular system of equations: STPSV/DTPSV/CTPSV/ZTPSV +template +StatusCode Tpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event = nullptr); + +// General rank-1 matrix update: SGER/DGER/HGER +template +StatusCode Ger(const Layout layout, + const size_t m, const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event = nullptr); + +// General rank-1 complex matrix update: CGERU/ZGERU +template +StatusCode Geru(const Layout layout, + const size_t m, const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event = nullptr); + +// General rank-1 complex conjugated matrix update: CGERC/ZGERC +template +StatusCode Gerc(const Layout layout, + 
const size_t m, const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event = nullptr); + +// Hermitian rank-1 matrix update: CHER/ZHER +template +StatusCode Her(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event = nullptr); + +// Hermitian packed rank-1 matrix update: CHPR/ZHPR +template +StatusCode Hpr(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event = nullptr); + +// Hermitian rank-2 matrix update: CHER2/ZHER2 +template +StatusCode Her2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event = nullptr); + +// Hermitian packed rank-2 matrix update: CHPR2/ZHPR2 +template +StatusCode Hpr2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event = nullptr); + +// Symmetric rank-1 matrix update: SSYR/DSYR/HSYR +template +StatusCode Syr(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event = nullptr); + +// Symmetric packed rank-1 matrix update: SSPR/DSPR/HSPR +template +StatusCode Spr(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event = nullptr); + +// Symmetric rank-2 matrix update: SSYR2/DSYR2/HSYR2 +template +StatusCode Syr2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event = nullptr); + +// Symmetric packed rank-2 matrix update: SSPR2/DSPR2/HSPR2 +template +StatusCode Spr2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event = nullptr); + +// ================================================================================================= +// BLAS level-3 (matrix-matrix) routines +// ================================================================================================= + +// General matrix-matrix 
multiplication: SGEMM/DGEMM/CGEMM/ZGEMM/HGEMM +template +StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, + const size_t m, const size_t n, const size_t k, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event = nullptr, + cl_mem temp_buffer = nullptr); + +// Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM/HSYMM +template +StatusCode Symm(const Layout layout, const Side side, const Triangle triangle, + const size_t m, const size_t n, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event = nullptr); + +// Hermitian matrix-matrix multiplication: CHEMM/ZHEMM +template +StatusCode Hemm(const Layout layout, const Side side, const Triangle triangle, + const size_t m, const size_t n, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event = nullptr); + +// Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK/HSYRK +template +StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const T beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event = nullptr); + +// Rank-K update of a hermitian matrix: CHERK/ZHERK +template +StatusCode Herk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const T beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event = nullptr); + +// Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K/HSYR2K +template +StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event = nullptr); + +// Rank-2K update of a hermitian matrix: CHER2K/ZHER2K +template +StatusCode Her2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const U beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event = nullptr); + +// Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM/HTRMM +template +StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const T alpha, + 
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event = nullptr); + +// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM +template +StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event = nullptr); + +// ================================================================================================= +// Extra non-BLAS routines (level-X) +// ================================================================================================= + +// Element-wise vector product (Hadamard): SHAD/DHAD/CHAD/ZHAD/HHAD +template +StatusCode Had(const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + const T beta, + cl_mem z_buffer, const size_t z_offset, const size_t z_inc, + cl_command_queue* queue, cl_event* event = nullptr); + +// Scaling and out-place transpose/copy (non-BLAS function): SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY/HOMATCOPY +template +StatusCode Omatcopy(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event = nullptr); + +// Im2col function (non-BLAS function): SIM2COL/DIM2COL/CIM2COL/ZIM2COL/HIM2COL +template +StatusCode Im2col(const KernelMode kernel_mode, + const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, + const cl_mem im_buffer, const size_t im_offset, + cl_mem col_buffer, const size_t col_offset, + cl_command_queue* queue, cl_event* event = nullptr); + +// Col2im function (non-BLAS function): SCOL2IM/DCOL2IM/CCOL2IM/ZCOL2IM/HCOL2IM +template +StatusCode Col2im(const KernelMode kernel_mode, + const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, + const cl_mem col_buffer, const size_t col_offset, + cl_mem im_buffer, const size_t im_offset, + cl_command_queue* queue, cl_event* event = nullptr); + +// Batched convolution as GEMM (non-BLAS function): SCONVGEMM/DCONVGEMM/HCONVGEMM +template +StatusCode Convgemm(const KernelMode kernel_mode, + const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count, + const cl_mem im_buffer, const size_t im_offset, + const cl_mem kernel_buffer, const size_t kernel_offset, + cl_mem result_buffer, const size_t result_offset, + cl_command_queue* queue, cl_event* event = nullptr); + +// Batched version of AXPY: SAXPYBATCHED/DAXPYBATCHED/CAXPYBATCHED/ZAXPYBATCHED/HAXPYBATCHED +template 
+StatusCode AxpyBatched(const size_t n, + const T *alphas, + const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, + cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, + const size_t batch_count, + cl_command_queue* queue, cl_event* event = nullptr); + +// Batched version of GEMM: SGEMMBATCHED/DGEMMBATCHED/CGEMMBATCHED/ZGEMMBATCHED/HGEMMBATCHED +template <typename T> +StatusCode GemmBatched(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, + const size_t m, const size_t n, const size_t k, + const T *alphas, + const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld, + const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld, + const T *betas, + cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld, + const size_t batch_count, + cl_command_queue* queue, cl_event* event = nullptr); + +// StridedBatched version of GEMM: SGEMMSTRIDEDBATCHED/DGEMMSTRIDEDBATCHED/CGEMMSTRIDEDBATCHED/ZGEMMSTRIDEDBATCHED/HGEMMSTRIDEDBATCHED +template <typename T> +StatusCode GemmStridedBatched(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, + const size_t m, const size_t n, const size_t k, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const size_t a_stride, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const size_t b_stride, + const T beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, const size_t c_stride, + const size_t batch_count, + cl_command_queue* queue, cl_event* event = nullptr); + +// ================================================================================================= + +// Retrieves the required size of the temporary buffer for the GEMM kernel (optional) +template <typename T> +StatusCode GemmTempBufferSize(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, + const size_t m, const size_t n, const size_t k, + const size_t a_offset, const size_t a_ld, + const size_t b_offset, const size_t b_ld, + const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, size_t& temp_buffer_size); + +// ================================================================================================= + +// CLBlast stores binaries of compiled kernels into a cache in case the same kernel is used later on +// for the same device. This cache can be cleared to free up system memory or in case of debugging. +StatusCode PUBLIC_API ClearCache(); + +// The cache can also be pre-initialized for a specific device with all possible CLBlast kernels. +// Further CLBlast routine calls will then run at maximum speed. +StatusCode PUBLIC_API FillCache(const cl_device_id device); + +// ================================================================================================= + +// Retrieves current tuning parameters for a specific device-precision-kernel combination +StatusCode PUBLIC_API RetrieveParameters(const cl_device_id device, const std::string &kernel_name, + const Precision precision, + std::unordered_map<std::string, size_t> &parameters); +
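To make the GEMM entry points above concrete, here is a minimal host-side sketch of a single-precision call through the Gemm() routine declared earlier in this header. It assumes an already-created OpenCL queue and three cl_mem buffers holding row-major float matrices; the function name, buffer names and sizes are illustrative only and are not part of the patch.

// Hypothetical host-side call into clblast::Gemm(). A is m*k, B is k*n, C is m*n (row-major).
#include <clblast.h>   // also pulls in the OpenCL C headers used below

clblast::StatusCode RunSgemm(cl_command_queue queue,
                             cl_mem a_mem, cl_mem b_mem, cl_mem c_mem,
                             size_t m, size_t n, size_t k) {
  cl_event event = nullptr;
  auto status = clblast::Gemm<float>(clblast::Layout::kRowMajor,
                                     clblast::Transpose::kNo, clblast::Transpose::kNo,
                                     m, n, k,
                                     1.0f,               // alpha
                                     a_mem, 0, k,        // A buffer, offset, leading dimension
                                     b_mem, 0, n,        // B
                                     0.0f,               // beta
                                     c_mem, 0, n,        // C
                                     &queue, &event);
  if (status == clblast::StatusCode::kSuccess) {
    clWaitForEvents(1, &event);                          // block until the kernel has finished
    clReleaseEvent(event);
  }
  return status;
}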
+// Overrides tuning parameters for a specific device-precision-kernel combination. The next time +// the target routine is called it will re-compile and use the new parameters from then on. +StatusCode PUBLIC_API OverrideParameters(const cl_device_id device, const std::string &kernel_name, + const Precision precision, + const std::unordered_map<std::string, size_t> &parameters); + +// ================================================================================================= + +// Tunes the "Xaxpy" kernel, used for many level-1 routines such as XAXPY, XCOPY, and XSWAP +template <typename T> +StatusCode TuneXaxpy(cl_command_queue* queue, const size_t n, + const double fraction, std::unordered_map<std::string, size_t> &parameters); + +// Tunes the "Xdot" kernel, used for level-1 reduction routines such as XDOT, XMAX, and XSUM +template <typename T> +StatusCode TuneXdot(cl_command_queue* queue, const size_t n, + const double fraction, std::unordered_map<std::string, size_t> &parameters); + +// Tunes the "Xgemv" kernel, used for matrix-vector level-2 routines such as XGEMV, XGBMV, and XHEMV +template <typename T> +StatusCode TuneXgemv(cl_command_queue* queue, const size_t m, const size_t n, + const double fraction, std::unordered_map<std::string, size_t> &parameters); + +// Tunes the "Xger" kernel, used for matrix update level-2 routines such as XGER, XHER, and XSYR2 +template <typename T> +StatusCode TuneXger(cl_command_queue* queue, const size_t m, const size_t n, + const double fraction, std::unordered_map<std::string, size_t> &parameters); + +// Tunes the "Xgemm" kernel, used for most level-3 routines such as XGEMM, XSYMM, and XHER2K +template <typename T> +StatusCode TuneXgemm(cl_command_queue* queue, const size_t m, const size_t n, const size_t k, + const double fraction, std::unordered_map<std::string, size_t> &parameters); + +// Tunes the "XgemmDirect" kernel, used for most level-3 routines such as XGEMM, XSYMM, and XHER2K +template <typename T> +StatusCode TuneXgemmDirect(cl_command_queue* queue, const size_t m, const size_t n, const size_t k, + const double fraction, std::unordered_map<std::string, size_t> &parameters); + +// Tunes the "Copy" kernel, used for most level-3 routines such as XGEMM, XSYMM, and XHER2K +template <typename T> +StatusCode TuneCopy(cl_command_queue* queue, const size_t m, const size_t n, + const double fraction, std::unordered_map<std::string, size_t> &parameters); + +// Tunes the "Pad" kernel, used for most level-3 routines such as XGEMM, XSYMM, and XHER2K +template <typename T> +StatusCode TunePad(cl_command_queue* queue, const size_t m, const size_t n, + const double fraction, std::unordered_map<std::string, size_t> &parameters); + +// Tunes the "Transpose" kernel, used for most level-3 routines such as XGEMM, XSYMM, and XHER2K +template <typename T> +StatusCode TuneTranspose(cl_command_queue* queue, const size_t m, const size_t n, + const double fraction, std::unordered_map<std::string, size_t> &parameters); + +// Tunes the "Padtranspose" kernel, used for most level-3 routines such as XGEMM, XSYMM, and XHER2K +template <typename T> +StatusCode TunePadtranspose(cl_command_queue* queue, const size_t m, const size_t n, + const double fraction, std::unordered_map<std::string, size_t> &parameters); + +// Tunes the "Xgemm" kernel, used for the level-3 routine XTRSM +template <typename T> +StatusCode TuneInvert(cl_command_queue* queue, const size_t m, const size_t n, const size_t k, + const double fraction, std::unordered_map<std::string, size_t> &parameters); + +// ================================================================================================= + +} // namespace clblast + +// CLBLAST_CLBLAST_H_ +#endif diff --git a/include/clblast_half.h b/include/clblast_half.h new file mode 100644 index 000000000..b8de8537a --- /dev/null +++ b/include/clblast_half.h @@ -0,0 +1,254 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. 
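The tuning hooks declared above combine into a small retune step: read the current parameters for a kernel with RetrieveParameters, adjust an entry, and write them back with OverrideParameters so the next routine call re-compiles with the new values. A minimal sketch, assuming a valid cl_device_id; the "Xgemm" kernel name and the MWG entry are used purely as illustrative values and their legal settings are device-specific.

// Illustrative only: query and override tuning parameters for the "Xgemm" kernel.
#include <clblast.h>
#include <string>
#include <unordered_map>

void RetuneXgemm(cl_device_id device) {
  std::unordered_map<std::string, size_t> parameters;
  auto status = clblast::RetrieveParameters(device, "Xgemm",
                                            clblast::Precision::kSingle, parameters);
  if (status != clblast::StatusCode::kSuccess) { return; }

  parameters["MWG"] = 64;   // hypothetical work-group tile size; must be valid for this device
  clblast::OverrideParameters(device, "Xgemm", clblast::Precision::kSingle, parameters);
  // Subsequent single-precision GEMM calls on this device re-compile with the new parameters.
}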
This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file provides simple conversion operations between fp16 (half) and fp32 (float). These +// conversion functions are based on ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf and +// are also part of the C++ half-precision header (http://half.sourceforge.net/). +// +// This file is pure C99. +// +// ================================================================================================= + +#ifndef CLBLAST_HALF_H_ +#define CLBLAST_HALF_H_ + +// MSVC 2013 doesn't fully support C99 +#ifdef _MSC_VER + #define inline __inline +#endif + +// ================================================================================================= + +// The host data-type for half-precision floating-point (16-bit) is based on the `cl_half` OpenCL +// type, which is a typedef for unsigned short. +typedef unsigned short half; + +// 32-bit union for conversions +typedef union ConversionBits_ { + unsigned int i32; + float f32; +} ConversionBits; + +// ================================================================================================= + +// Converts a IEEE-compliant single-precision value to half-precision floating-point. This function +// applies simple truncation (round toward zero, but with overflows set to infinity) as rounding +// mode. +inline half FloatToHalf(const float value) { + static const unsigned short base_table[512] = { + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080, 0x0100, + 0x0200, 0x0400, 0x0800, 0x0C00, 0x1000, 0x1400, 0x1800, 0x1C00, 0x2000, 0x2400, 0x2800, 0x2C00, 0x3000, 0x3400, 0x3800, 0x3C00, + 0x4000, 0x4400, 0x4800, 0x4C00, 0x5000, 0x5400, 0x5800, 0x5C00, 0x6000, 0x6400, 0x6800, 0x6C00, 0x7000, 0x7400, 0x7800, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 
0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8001, 0x8002, 0x8004, 0x8008, 0x8010, 0x8020, 0x8040, 0x8080, 0x8100, + 0x8200, 0x8400, 0x8800, 0x8C00, 0x9000, 0x9400, 0x9800, 0x9C00, 0xA000, 0xA400, 0xA800, 0xAC00, 0xB000, 0xB400, 0xB800, 0xBC00, + 0xC000, 0xC400, 0xC800, 0xCC00, 0xD000, 0xD400, 0xD800, 0xDC00, 0xE000, 0xE400, 0xE800, 0xEC00, 0xF000, 0xF400, 0xF800, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00 + }; + static const unsigned char shift_table[512] = { + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 13, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 13 + }; + ConversionBits bits; + bits.f32 = value; + const unsigned short halfbits = base_table[bits.i32 >> 23] + + (unsigned short)((bits.i32 & 0x7FFFFF) >> shift_table[bits.i32 >> 23]); + return halfbits; +} + +// Converts a half-precision value to IEEE-compliant single-precision floating-point +inline float HalfToFloat(const half value) { + static const unsigned int mantissa_table[2048] = { + 0x00000000, 0x33800000, 0x34000000, 0x34400000, 0x34800000, 0x34A00000, 0x34C00000, 0x34E00000, 0x35000000, 0x35100000, 0x35200000, 0x35300000, 0x35400000, 0x35500000, 0x35600000, 0x35700000, + 0x35800000, 0x35880000, 0x35900000, 0x35980000, 0x35A00000, 0x35A80000, 0x35B00000, 0x35B80000, 0x35C00000, 0x35C80000, 0x35D00000, 0x35D80000, 0x35E00000, 0x35E80000, 0x35F00000, 0x35F80000, + 0x36000000, 0x36040000, 0x36080000, 0x360C0000, 0x36100000, 0x36140000, 0x36180000, 0x361C0000, 0x36200000, 0x36240000, 0x36280000, 0x362C0000, 0x36300000, 0x36340000, 0x36380000, 0x363C0000, + 0x36400000, 0x36440000, 0x36480000, 0x364C0000, 0x36500000, 0x36540000, 0x36580000, 0x365C0000, 0x36600000, 0x36640000, 0x36680000, 0x366C0000, 0x36700000, 0x36740000, 0x36780000, 0x367C0000, + 0x36800000, 0x36820000, 0x36840000, 0x36860000, 0x36880000, 0x368A0000, 0x368C0000, 0x368E0000, 0x36900000, 0x36920000, 0x36940000, 0x36960000, 0x36980000, 0x369A0000, 0x369C0000, 0x369E0000, + 0x36A00000, 0x36A20000, 0x36A40000, 0x36A60000, 0x36A80000, 0x36AA0000, 0x36AC0000, 0x36AE0000, 0x36B00000, 0x36B20000, 0x36B40000, 0x36B60000, 0x36B80000, 0x36BA0000, 0x36BC0000, 0x36BE0000, + 0x36C00000, 0x36C20000, 0x36C40000, 0x36C60000, 0x36C80000, 0x36CA0000, 0x36CC0000, 0x36CE0000, 0x36D00000, 0x36D20000, 0x36D40000, 0x36D60000, 0x36D80000, 0x36DA0000, 0x36DC0000, 0x36DE0000, + 0x36E00000, 0x36E20000, 0x36E40000, 0x36E60000, 0x36E80000, 0x36EA0000, 0x36EC0000, 0x36EE0000, 0x36F00000, 0x36F20000, 0x36F40000, 0x36F60000, 0x36F80000, 0x36FA0000, 0x36FC0000, 0x36FE0000, + 0x37000000, 0x37010000, 0x37020000, 0x37030000, 0x37040000, 0x37050000, 0x37060000, 0x37070000, 0x37080000, 0x37090000, 0x370A0000, 0x370B0000, 0x370C0000, 0x370D0000, 0x370E0000, 0x370F0000, + 0x37100000, 0x37110000, 0x37120000, 0x37130000, 0x37140000, 0x37150000, 0x37160000, 0x37170000, 0x37180000, 0x37190000, 0x371A0000, 0x371B0000, 0x371C0000, 0x371D0000, 0x371E0000, 0x371F0000, + 0x37200000, 0x37210000, 0x37220000, 0x37230000, 0x37240000, 0x37250000, 0x37260000, 0x37270000, 0x37280000, 0x37290000, 0x372A0000, 0x372B0000, 0x372C0000, 0x372D0000, 0x372E0000, 0x372F0000, + 0x37300000, 0x37310000, 0x37320000, 0x37330000, 0x37340000, 0x37350000, 0x37360000, 0x37370000, 0x37380000, 0x37390000, 0x373A0000, 0x373B0000, 0x373C0000, 0x373D0000, 0x373E0000, 0x373F0000, + 0x37400000, 0x37410000, 0x37420000, 0x37430000, 0x37440000, 0x37450000, 
0x37460000, 0x37470000, 0x37480000, 0x37490000, 0x374A0000, 0x374B0000, 0x374C0000, 0x374D0000, 0x374E0000, 0x374F0000, + 0x37500000, 0x37510000, 0x37520000, 0x37530000, 0x37540000, 0x37550000, 0x37560000, 0x37570000, 0x37580000, 0x37590000, 0x375A0000, 0x375B0000, 0x375C0000, 0x375D0000, 0x375E0000, 0x375F0000, + 0x37600000, 0x37610000, 0x37620000, 0x37630000, 0x37640000, 0x37650000, 0x37660000, 0x37670000, 0x37680000, 0x37690000, 0x376A0000, 0x376B0000, 0x376C0000, 0x376D0000, 0x376E0000, 0x376F0000, + 0x37700000, 0x37710000, 0x37720000, 0x37730000, 0x37740000, 0x37750000, 0x37760000, 0x37770000, 0x37780000, 0x37790000, 0x377A0000, 0x377B0000, 0x377C0000, 0x377D0000, 0x377E0000, 0x377F0000, + 0x37800000, 0x37808000, 0x37810000, 0x37818000, 0x37820000, 0x37828000, 0x37830000, 0x37838000, 0x37840000, 0x37848000, 0x37850000, 0x37858000, 0x37860000, 0x37868000, 0x37870000, 0x37878000, + 0x37880000, 0x37888000, 0x37890000, 0x37898000, 0x378A0000, 0x378A8000, 0x378B0000, 0x378B8000, 0x378C0000, 0x378C8000, 0x378D0000, 0x378D8000, 0x378E0000, 0x378E8000, 0x378F0000, 0x378F8000, + 0x37900000, 0x37908000, 0x37910000, 0x37918000, 0x37920000, 0x37928000, 0x37930000, 0x37938000, 0x37940000, 0x37948000, 0x37950000, 0x37958000, 0x37960000, 0x37968000, 0x37970000, 0x37978000, + 0x37980000, 0x37988000, 0x37990000, 0x37998000, 0x379A0000, 0x379A8000, 0x379B0000, 0x379B8000, 0x379C0000, 0x379C8000, 0x379D0000, 0x379D8000, 0x379E0000, 0x379E8000, 0x379F0000, 0x379F8000, + 0x37A00000, 0x37A08000, 0x37A10000, 0x37A18000, 0x37A20000, 0x37A28000, 0x37A30000, 0x37A38000, 0x37A40000, 0x37A48000, 0x37A50000, 0x37A58000, 0x37A60000, 0x37A68000, 0x37A70000, 0x37A78000, + 0x37A80000, 0x37A88000, 0x37A90000, 0x37A98000, 0x37AA0000, 0x37AA8000, 0x37AB0000, 0x37AB8000, 0x37AC0000, 0x37AC8000, 0x37AD0000, 0x37AD8000, 0x37AE0000, 0x37AE8000, 0x37AF0000, 0x37AF8000, + 0x37B00000, 0x37B08000, 0x37B10000, 0x37B18000, 0x37B20000, 0x37B28000, 0x37B30000, 0x37B38000, 0x37B40000, 0x37B48000, 0x37B50000, 0x37B58000, 0x37B60000, 0x37B68000, 0x37B70000, 0x37B78000, + 0x37B80000, 0x37B88000, 0x37B90000, 0x37B98000, 0x37BA0000, 0x37BA8000, 0x37BB0000, 0x37BB8000, 0x37BC0000, 0x37BC8000, 0x37BD0000, 0x37BD8000, 0x37BE0000, 0x37BE8000, 0x37BF0000, 0x37BF8000, + 0x37C00000, 0x37C08000, 0x37C10000, 0x37C18000, 0x37C20000, 0x37C28000, 0x37C30000, 0x37C38000, 0x37C40000, 0x37C48000, 0x37C50000, 0x37C58000, 0x37C60000, 0x37C68000, 0x37C70000, 0x37C78000, + 0x37C80000, 0x37C88000, 0x37C90000, 0x37C98000, 0x37CA0000, 0x37CA8000, 0x37CB0000, 0x37CB8000, 0x37CC0000, 0x37CC8000, 0x37CD0000, 0x37CD8000, 0x37CE0000, 0x37CE8000, 0x37CF0000, 0x37CF8000, + 0x37D00000, 0x37D08000, 0x37D10000, 0x37D18000, 0x37D20000, 0x37D28000, 0x37D30000, 0x37D38000, 0x37D40000, 0x37D48000, 0x37D50000, 0x37D58000, 0x37D60000, 0x37D68000, 0x37D70000, 0x37D78000, + 0x37D80000, 0x37D88000, 0x37D90000, 0x37D98000, 0x37DA0000, 0x37DA8000, 0x37DB0000, 0x37DB8000, 0x37DC0000, 0x37DC8000, 0x37DD0000, 0x37DD8000, 0x37DE0000, 0x37DE8000, 0x37DF0000, 0x37DF8000, + 0x37E00000, 0x37E08000, 0x37E10000, 0x37E18000, 0x37E20000, 0x37E28000, 0x37E30000, 0x37E38000, 0x37E40000, 0x37E48000, 0x37E50000, 0x37E58000, 0x37E60000, 0x37E68000, 0x37E70000, 0x37E78000, + 0x37E80000, 0x37E88000, 0x37E90000, 0x37E98000, 0x37EA0000, 0x37EA8000, 0x37EB0000, 0x37EB8000, 0x37EC0000, 0x37EC8000, 0x37ED0000, 0x37ED8000, 0x37EE0000, 0x37EE8000, 0x37EF0000, 0x37EF8000, + 0x37F00000, 0x37F08000, 0x37F10000, 0x37F18000, 0x37F20000, 0x37F28000, 0x37F30000, 0x37F38000, 0x37F40000, 0x37F48000, 0x37F50000, 
0x37F58000, 0x37F60000, 0x37F68000, 0x37F70000, 0x37F78000, + 0x37F80000, 0x37F88000, 0x37F90000, 0x37F98000, 0x37FA0000, 0x37FA8000, 0x37FB0000, 0x37FB8000, 0x37FC0000, 0x37FC8000, 0x37FD0000, 0x37FD8000, 0x37FE0000, 0x37FE8000, 0x37FF0000, 0x37FF8000, + 0x38000000, 0x38004000, 0x38008000, 0x3800C000, 0x38010000, 0x38014000, 0x38018000, 0x3801C000, 0x38020000, 0x38024000, 0x38028000, 0x3802C000, 0x38030000, 0x38034000, 0x38038000, 0x3803C000, + 0x38040000, 0x38044000, 0x38048000, 0x3804C000, 0x38050000, 0x38054000, 0x38058000, 0x3805C000, 0x38060000, 0x38064000, 0x38068000, 0x3806C000, 0x38070000, 0x38074000, 0x38078000, 0x3807C000, + 0x38080000, 0x38084000, 0x38088000, 0x3808C000, 0x38090000, 0x38094000, 0x38098000, 0x3809C000, 0x380A0000, 0x380A4000, 0x380A8000, 0x380AC000, 0x380B0000, 0x380B4000, 0x380B8000, 0x380BC000, + 0x380C0000, 0x380C4000, 0x380C8000, 0x380CC000, 0x380D0000, 0x380D4000, 0x380D8000, 0x380DC000, 0x380E0000, 0x380E4000, 0x380E8000, 0x380EC000, 0x380F0000, 0x380F4000, 0x380F8000, 0x380FC000, + 0x38100000, 0x38104000, 0x38108000, 0x3810C000, 0x38110000, 0x38114000, 0x38118000, 0x3811C000, 0x38120000, 0x38124000, 0x38128000, 0x3812C000, 0x38130000, 0x38134000, 0x38138000, 0x3813C000, + 0x38140000, 0x38144000, 0x38148000, 0x3814C000, 0x38150000, 0x38154000, 0x38158000, 0x3815C000, 0x38160000, 0x38164000, 0x38168000, 0x3816C000, 0x38170000, 0x38174000, 0x38178000, 0x3817C000, + 0x38180000, 0x38184000, 0x38188000, 0x3818C000, 0x38190000, 0x38194000, 0x38198000, 0x3819C000, 0x381A0000, 0x381A4000, 0x381A8000, 0x381AC000, 0x381B0000, 0x381B4000, 0x381B8000, 0x381BC000, + 0x381C0000, 0x381C4000, 0x381C8000, 0x381CC000, 0x381D0000, 0x381D4000, 0x381D8000, 0x381DC000, 0x381E0000, 0x381E4000, 0x381E8000, 0x381EC000, 0x381F0000, 0x381F4000, 0x381F8000, 0x381FC000, + 0x38200000, 0x38204000, 0x38208000, 0x3820C000, 0x38210000, 0x38214000, 0x38218000, 0x3821C000, 0x38220000, 0x38224000, 0x38228000, 0x3822C000, 0x38230000, 0x38234000, 0x38238000, 0x3823C000, + 0x38240000, 0x38244000, 0x38248000, 0x3824C000, 0x38250000, 0x38254000, 0x38258000, 0x3825C000, 0x38260000, 0x38264000, 0x38268000, 0x3826C000, 0x38270000, 0x38274000, 0x38278000, 0x3827C000, + 0x38280000, 0x38284000, 0x38288000, 0x3828C000, 0x38290000, 0x38294000, 0x38298000, 0x3829C000, 0x382A0000, 0x382A4000, 0x382A8000, 0x382AC000, 0x382B0000, 0x382B4000, 0x382B8000, 0x382BC000, + 0x382C0000, 0x382C4000, 0x382C8000, 0x382CC000, 0x382D0000, 0x382D4000, 0x382D8000, 0x382DC000, 0x382E0000, 0x382E4000, 0x382E8000, 0x382EC000, 0x382F0000, 0x382F4000, 0x382F8000, 0x382FC000, + 0x38300000, 0x38304000, 0x38308000, 0x3830C000, 0x38310000, 0x38314000, 0x38318000, 0x3831C000, 0x38320000, 0x38324000, 0x38328000, 0x3832C000, 0x38330000, 0x38334000, 0x38338000, 0x3833C000, + 0x38340000, 0x38344000, 0x38348000, 0x3834C000, 0x38350000, 0x38354000, 0x38358000, 0x3835C000, 0x38360000, 0x38364000, 0x38368000, 0x3836C000, 0x38370000, 0x38374000, 0x38378000, 0x3837C000, + 0x38380000, 0x38384000, 0x38388000, 0x3838C000, 0x38390000, 0x38394000, 0x38398000, 0x3839C000, 0x383A0000, 0x383A4000, 0x383A8000, 0x383AC000, 0x383B0000, 0x383B4000, 0x383B8000, 0x383BC000, + 0x383C0000, 0x383C4000, 0x383C8000, 0x383CC000, 0x383D0000, 0x383D4000, 0x383D8000, 0x383DC000, 0x383E0000, 0x383E4000, 0x383E8000, 0x383EC000, 0x383F0000, 0x383F4000, 0x383F8000, 0x383FC000, + 0x38400000, 0x38404000, 0x38408000, 0x3840C000, 0x38410000, 0x38414000, 0x38418000, 0x3841C000, 0x38420000, 0x38424000, 0x38428000, 0x3842C000, 0x38430000, 0x38434000, 0x38438000, 0x3843C000, + 
0x38440000, 0x38444000, 0x38448000, 0x3844C000, 0x38450000, 0x38454000, 0x38458000, 0x3845C000, 0x38460000, 0x38464000, 0x38468000, 0x3846C000, 0x38470000, 0x38474000, 0x38478000, 0x3847C000, + 0x38480000, 0x38484000, 0x38488000, 0x3848C000, 0x38490000, 0x38494000, 0x38498000, 0x3849C000, 0x384A0000, 0x384A4000, 0x384A8000, 0x384AC000, 0x384B0000, 0x384B4000, 0x384B8000, 0x384BC000, + 0x384C0000, 0x384C4000, 0x384C8000, 0x384CC000, 0x384D0000, 0x384D4000, 0x384D8000, 0x384DC000, 0x384E0000, 0x384E4000, 0x384E8000, 0x384EC000, 0x384F0000, 0x384F4000, 0x384F8000, 0x384FC000, + 0x38500000, 0x38504000, 0x38508000, 0x3850C000, 0x38510000, 0x38514000, 0x38518000, 0x3851C000, 0x38520000, 0x38524000, 0x38528000, 0x3852C000, 0x38530000, 0x38534000, 0x38538000, 0x3853C000, + 0x38540000, 0x38544000, 0x38548000, 0x3854C000, 0x38550000, 0x38554000, 0x38558000, 0x3855C000, 0x38560000, 0x38564000, 0x38568000, 0x3856C000, 0x38570000, 0x38574000, 0x38578000, 0x3857C000, + 0x38580000, 0x38584000, 0x38588000, 0x3858C000, 0x38590000, 0x38594000, 0x38598000, 0x3859C000, 0x385A0000, 0x385A4000, 0x385A8000, 0x385AC000, 0x385B0000, 0x385B4000, 0x385B8000, 0x385BC000, + 0x385C0000, 0x385C4000, 0x385C8000, 0x385CC000, 0x385D0000, 0x385D4000, 0x385D8000, 0x385DC000, 0x385E0000, 0x385E4000, 0x385E8000, 0x385EC000, 0x385F0000, 0x385F4000, 0x385F8000, 0x385FC000, + 0x38600000, 0x38604000, 0x38608000, 0x3860C000, 0x38610000, 0x38614000, 0x38618000, 0x3861C000, 0x38620000, 0x38624000, 0x38628000, 0x3862C000, 0x38630000, 0x38634000, 0x38638000, 0x3863C000, + 0x38640000, 0x38644000, 0x38648000, 0x3864C000, 0x38650000, 0x38654000, 0x38658000, 0x3865C000, 0x38660000, 0x38664000, 0x38668000, 0x3866C000, 0x38670000, 0x38674000, 0x38678000, 0x3867C000, + 0x38680000, 0x38684000, 0x38688000, 0x3868C000, 0x38690000, 0x38694000, 0x38698000, 0x3869C000, 0x386A0000, 0x386A4000, 0x386A8000, 0x386AC000, 0x386B0000, 0x386B4000, 0x386B8000, 0x386BC000, + 0x386C0000, 0x386C4000, 0x386C8000, 0x386CC000, 0x386D0000, 0x386D4000, 0x386D8000, 0x386DC000, 0x386E0000, 0x386E4000, 0x386E8000, 0x386EC000, 0x386F0000, 0x386F4000, 0x386F8000, 0x386FC000, + 0x38700000, 0x38704000, 0x38708000, 0x3870C000, 0x38710000, 0x38714000, 0x38718000, 0x3871C000, 0x38720000, 0x38724000, 0x38728000, 0x3872C000, 0x38730000, 0x38734000, 0x38738000, 0x3873C000, + 0x38740000, 0x38744000, 0x38748000, 0x3874C000, 0x38750000, 0x38754000, 0x38758000, 0x3875C000, 0x38760000, 0x38764000, 0x38768000, 0x3876C000, 0x38770000, 0x38774000, 0x38778000, 0x3877C000, + 0x38780000, 0x38784000, 0x38788000, 0x3878C000, 0x38790000, 0x38794000, 0x38798000, 0x3879C000, 0x387A0000, 0x387A4000, 0x387A8000, 0x387AC000, 0x387B0000, 0x387B4000, 0x387B8000, 0x387BC000, + 0x387C0000, 0x387C4000, 0x387C8000, 0x387CC000, 0x387D0000, 0x387D4000, 0x387D8000, 0x387DC000, 0x387E0000, 0x387E4000, 0x387E8000, 0x387EC000, 0x387F0000, 0x387F4000, 0x387F8000, 0x387FC000, + 0x38000000, 0x38002000, 0x38004000, 0x38006000, 0x38008000, 0x3800A000, 0x3800C000, 0x3800E000, 0x38010000, 0x38012000, 0x38014000, 0x38016000, 0x38018000, 0x3801A000, 0x3801C000, 0x3801E000, + 0x38020000, 0x38022000, 0x38024000, 0x38026000, 0x38028000, 0x3802A000, 0x3802C000, 0x3802E000, 0x38030000, 0x38032000, 0x38034000, 0x38036000, 0x38038000, 0x3803A000, 0x3803C000, 0x3803E000, + 0x38040000, 0x38042000, 0x38044000, 0x38046000, 0x38048000, 0x3804A000, 0x3804C000, 0x3804E000, 0x38050000, 0x38052000, 0x38054000, 0x38056000, 0x38058000, 0x3805A000, 0x3805C000, 0x3805E000, + 0x38060000, 0x38062000, 0x38064000, 0x38066000, 0x38068000, 
0x3806A000, 0x3806C000, 0x3806E000, 0x38070000, 0x38072000, 0x38074000, 0x38076000, 0x38078000, 0x3807A000, 0x3807C000, 0x3807E000, + 0x38080000, 0x38082000, 0x38084000, 0x38086000, 0x38088000, 0x3808A000, 0x3808C000, 0x3808E000, 0x38090000, 0x38092000, 0x38094000, 0x38096000, 0x38098000, 0x3809A000, 0x3809C000, 0x3809E000, + 0x380A0000, 0x380A2000, 0x380A4000, 0x380A6000, 0x380A8000, 0x380AA000, 0x380AC000, 0x380AE000, 0x380B0000, 0x380B2000, 0x380B4000, 0x380B6000, 0x380B8000, 0x380BA000, 0x380BC000, 0x380BE000, + 0x380C0000, 0x380C2000, 0x380C4000, 0x380C6000, 0x380C8000, 0x380CA000, 0x380CC000, 0x380CE000, 0x380D0000, 0x380D2000, 0x380D4000, 0x380D6000, 0x380D8000, 0x380DA000, 0x380DC000, 0x380DE000, + 0x380E0000, 0x380E2000, 0x380E4000, 0x380E6000, 0x380E8000, 0x380EA000, 0x380EC000, 0x380EE000, 0x380F0000, 0x380F2000, 0x380F4000, 0x380F6000, 0x380F8000, 0x380FA000, 0x380FC000, 0x380FE000, + 0x38100000, 0x38102000, 0x38104000, 0x38106000, 0x38108000, 0x3810A000, 0x3810C000, 0x3810E000, 0x38110000, 0x38112000, 0x38114000, 0x38116000, 0x38118000, 0x3811A000, 0x3811C000, 0x3811E000, + 0x38120000, 0x38122000, 0x38124000, 0x38126000, 0x38128000, 0x3812A000, 0x3812C000, 0x3812E000, 0x38130000, 0x38132000, 0x38134000, 0x38136000, 0x38138000, 0x3813A000, 0x3813C000, 0x3813E000, + 0x38140000, 0x38142000, 0x38144000, 0x38146000, 0x38148000, 0x3814A000, 0x3814C000, 0x3814E000, 0x38150000, 0x38152000, 0x38154000, 0x38156000, 0x38158000, 0x3815A000, 0x3815C000, 0x3815E000, + 0x38160000, 0x38162000, 0x38164000, 0x38166000, 0x38168000, 0x3816A000, 0x3816C000, 0x3816E000, 0x38170000, 0x38172000, 0x38174000, 0x38176000, 0x38178000, 0x3817A000, 0x3817C000, 0x3817E000, + 0x38180000, 0x38182000, 0x38184000, 0x38186000, 0x38188000, 0x3818A000, 0x3818C000, 0x3818E000, 0x38190000, 0x38192000, 0x38194000, 0x38196000, 0x38198000, 0x3819A000, 0x3819C000, 0x3819E000, + 0x381A0000, 0x381A2000, 0x381A4000, 0x381A6000, 0x381A8000, 0x381AA000, 0x381AC000, 0x381AE000, 0x381B0000, 0x381B2000, 0x381B4000, 0x381B6000, 0x381B8000, 0x381BA000, 0x381BC000, 0x381BE000, + 0x381C0000, 0x381C2000, 0x381C4000, 0x381C6000, 0x381C8000, 0x381CA000, 0x381CC000, 0x381CE000, 0x381D0000, 0x381D2000, 0x381D4000, 0x381D6000, 0x381D8000, 0x381DA000, 0x381DC000, 0x381DE000, + 0x381E0000, 0x381E2000, 0x381E4000, 0x381E6000, 0x381E8000, 0x381EA000, 0x381EC000, 0x381EE000, 0x381F0000, 0x381F2000, 0x381F4000, 0x381F6000, 0x381F8000, 0x381FA000, 0x381FC000, 0x381FE000, + 0x38200000, 0x38202000, 0x38204000, 0x38206000, 0x38208000, 0x3820A000, 0x3820C000, 0x3820E000, 0x38210000, 0x38212000, 0x38214000, 0x38216000, 0x38218000, 0x3821A000, 0x3821C000, 0x3821E000, + 0x38220000, 0x38222000, 0x38224000, 0x38226000, 0x38228000, 0x3822A000, 0x3822C000, 0x3822E000, 0x38230000, 0x38232000, 0x38234000, 0x38236000, 0x38238000, 0x3823A000, 0x3823C000, 0x3823E000, + 0x38240000, 0x38242000, 0x38244000, 0x38246000, 0x38248000, 0x3824A000, 0x3824C000, 0x3824E000, 0x38250000, 0x38252000, 0x38254000, 0x38256000, 0x38258000, 0x3825A000, 0x3825C000, 0x3825E000, + 0x38260000, 0x38262000, 0x38264000, 0x38266000, 0x38268000, 0x3826A000, 0x3826C000, 0x3826E000, 0x38270000, 0x38272000, 0x38274000, 0x38276000, 0x38278000, 0x3827A000, 0x3827C000, 0x3827E000, + 0x38280000, 0x38282000, 0x38284000, 0x38286000, 0x38288000, 0x3828A000, 0x3828C000, 0x3828E000, 0x38290000, 0x38292000, 0x38294000, 0x38296000, 0x38298000, 0x3829A000, 0x3829C000, 0x3829E000, + 0x382A0000, 0x382A2000, 0x382A4000, 0x382A6000, 0x382A8000, 0x382AA000, 0x382AC000, 0x382AE000, 0x382B0000, 0x382B2000, 
0x382B4000, 0x382B6000, 0x382B8000, 0x382BA000, 0x382BC000, 0x382BE000, + 0x382C0000, 0x382C2000, 0x382C4000, 0x382C6000, 0x382C8000, 0x382CA000, 0x382CC000, 0x382CE000, 0x382D0000, 0x382D2000, 0x382D4000, 0x382D6000, 0x382D8000, 0x382DA000, 0x382DC000, 0x382DE000, + 0x382E0000, 0x382E2000, 0x382E4000, 0x382E6000, 0x382E8000, 0x382EA000, 0x382EC000, 0x382EE000, 0x382F0000, 0x382F2000, 0x382F4000, 0x382F6000, 0x382F8000, 0x382FA000, 0x382FC000, 0x382FE000, + 0x38300000, 0x38302000, 0x38304000, 0x38306000, 0x38308000, 0x3830A000, 0x3830C000, 0x3830E000, 0x38310000, 0x38312000, 0x38314000, 0x38316000, 0x38318000, 0x3831A000, 0x3831C000, 0x3831E000, + 0x38320000, 0x38322000, 0x38324000, 0x38326000, 0x38328000, 0x3832A000, 0x3832C000, 0x3832E000, 0x38330000, 0x38332000, 0x38334000, 0x38336000, 0x38338000, 0x3833A000, 0x3833C000, 0x3833E000, + 0x38340000, 0x38342000, 0x38344000, 0x38346000, 0x38348000, 0x3834A000, 0x3834C000, 0x3834E000, 0x38350000, 0x38352000, 0x38354000, 0x38356000, 0x38358000, 0x3835A000, 0x3835C000, 0x3835E000, + 0x38360000, 0x38362000, 0x38364000, 0x38366000, 0x38368000, 0x3836A000, 0x3836C000, 0x3836E000, 0x38370000, 0x38372000, 0x38374000, 0x38376000, 0x38378000, 0x3837A000, 0x3837C000, 0x3837E000, + 0x38380000, 0x38382000, 0x38384000, 0x38386000, 0x38388000, 0x3838A000, 0x3838C000, 0x3838E000, 0x38390000, 0x38392000, 0x38394000, 0x38396000, 0x38398000, 0x3839A000, 0x3839C000, 0x3839E000, + 0x383A0000, 0x383A2000, 0x383A4000, 0x383A6000, 0x383A8000, 0x383AA000, 0x383AC000, 0x383AE000, 0x383B0000, 0x383B2000, 0x383B4000, 0x383B6000, 0x383B8000, 0x383BA000, 0x383BC000, 0x383BE000, + 0x383C0000, 0x383C2000, 0x383C4000, 0x383C6000, 0x383C8000, 0x383CA000, 0x383CC000, 0x383CE000, 0x383D0000, 0x383D2000, 0x383D4000, 0x383D6000, 0x383D8000, 0x383DA000, 0x383DC000, 0x383DE000, + 0x383E0000, 0x383E2000, 0x383E4000, 0x383E6000, 0x383E8000, 0x383EA000, 0x383EC000, 0x383EE000, 0x383F0000, 0x383F2000, 0x383F4000, 0x383F6000, 0x383F8000, 0x383FA000, 0x383FC000, 0x383FE000, + 0x38400000, 0x38402000, 0x38404000, 0x38406000, 0x38408000, 0x3840A000, 0x3840C000, 0x3840E000, 0x38410000, 0x38412000, 0x38414000, 0x38416000, 0x38418000, 0x3841A000, 0x3841C000, 0x3841E000, + 0x38420000, 0x38422000, 0x38424000, 0x38426000, 0x38428000, 0x3842A000, 0x3842C000, 0x3842E000, 0x38430000, 0x38432000, 0x38434000, 0x38436000, 0x38438000, 0x3843A000, 0x3843C000, 0x3843E000, + 0x38440000, 0x38442000, 0x38444000, 0x38446000, 0x38448000, 0x3844A000, 0x3844C000, 0x3844E000, 0x38450000, 0x38452000, 0x38454000, 0x38456000, 0x38458000, 0x3845A000, 0x3845C000, 0x3845E000, + 0x38460000, 0x38462000, 0x38464000, 0x38466000, 0x38468000, 0x3846A000, 0x3846C000, 0x3846E000, 0x38470000, 0x38472000, 0x38474000, 0x38476000, 0x38478000, 0x3847A000, 0x3847C000, 0x3847E000, + 0x38480000, 0x38482000, 0x38484000, 0x38486000, 0x38488000, 0x3848A000, 0x3848C000, 0x3848E000, 0x38490000, 0x38492000, 0x38494000, 0x38496000, 0x38498000, 0x3849A000, 0x3849C000, 0x3849E000, + 0x384A0000, 0x384A2000, 0x384A4000, 0x384A6000, 0x384A8000, 0x384AA000, 0x384AC000, 0x384AE000, 0x384B0000, 0x384B2000, 0x384B4000, 0x384B6000, 0x384B8000, 0x384BA000, 0x384BC000, 0x384BE000, + 0x384C0000, 0x384C2000, 0x384C4000, 0x384C6000, 0x384C8000, 0x384CA000, 0x384CC000, 0x384CE000, 0x384D0000, 0x384D2000, 0x384D4000, 0x384D6000, 0x384D8000, 0x384DA000, 0x384DC000, 0x384DE000, + 0x384E0000, 0x384E2000, 0x384E4000, 0x384E6000, 0x384E8000, 0x384EA000, 0x384EC000, 0x384EE000, 0x384F0000, 0x384F2000, 0x384F4000, 0x384F6000, 0x384F8000, 0x384FA000, 0x384FC000, 
0x384FE000, + 0x38500000, 0x38502000, 0x38504000, 0x38506000, 0x38508000, 0x3850A000, 0x3850C000, 0x3850E000, 0x38510000, 0x38512000, 0x38514000, 0x38516000, 0x38518000, 0x3851A000, 0x3851C000, 0x3851E000, + 0x38520000, 0x38522000, 0x38524000, 0x38526000, 0x38528000, 0x3852A000, 0x3852C000, 0x3852E000, 0x38530000, 0x38532000, 0x38534000, 0x38536000, 0x38538000, 0x3853A000, 0x3853C000, 0x3853E000, + 0x38540000, 0x38542000, 0x38544000, 0x38546000, 0x38548000, 0x3854A000, 0x3854C000, 0x3854E000, 0x38550000, 0x38552000, 0x38554000, 0x38556000, 0x38558000, 0x3855A000, 0x3855C000, 0x3855E000, + 0x38560000, 0x38562000, 0x38564000, 0x38566000, 0x38568000, 0x3856A000, 0x3856C000, 0x3856E000, 0x38570000, 0x38572000, 0x38574000, 0x38576000, 0x38578000, 0x3857A000, 0x3857C000, 0x3857E000, + 0x38580000, 0x38582000, 0x38584000, 0x38586000, 0x38588000, 0x3858A000, 0x3858C000, 0x3858E000, 0x38590000, 0x38592000, 0x38594000, 0x38596000, 0x38598000, 0x3859A000, 0x3859C000, 0x3859E000, + 0x385A0000, 0x385A2000, 0x385A4000, 0x385A6000, 0x385A8000, 0x385AA000, 0x385AC000, 0x385AE000, 0x385B0000, 0x385B2000, 0x385B4000, 0x385B6000, 0x385B8000, 0x385BA000, 0x385BC000, 0x385BE000, + 0x385C0000, 0x385C2000, 0x385C4000, 0x385C6000, 0x385C8000, 0x385CA000, 0x385CC000, 0x385CE000, 0x385D0000, 0x385D2000, 0x385D4000, 0x385D6000, 0x385D8000, 0x385DA000, 0x385DC000, 0x385DE000, + 0x385E0000, 0x385E2000, 0x385E4000, 0x385E6000, 0x385E8000, 0x385EA000, 0x385EC000, 0x385EE000, 0x385F0000, 0x385F2000, 0x385F4000, 0x385F6000, 0x385F8000, 0x385FA000, 0x385FC000, 0x385FE000, + 0x38600000, 0x38602000, 0x38604000, 0x38606000, 0x38608000, 0x3860A000, 0x3860C000, 0x3860E000, 0x38610000, 0x38612000, 0x38614000, 0x38616000, 0x38618000, 0x3861A000, 0x3861C000, 0x3861E000, + 0x38620000, 0x38622000, 0x38624000, 0x38626000, 0x38628000, 0x3862A000, 0x3862C000, 0x3862E000, 0x38630000, 0x38632000, 0x38634000, 0x38636000, 0x38638000, 0x3863A000, 0x3863C000, 0x3863E000, + 0x38640000, 0x38642000, 0x38644000, 0x38646000, 0x38648000, 0x3864A000, 0x3864C000, 0x3864E000, 0x38650000, 0x38652000, 0x38654000, 0x38656000, 0x38658000, 0x3865A000, 0x3865C000, 0x3865E000, + 0x38660000, 0x38662000, 0x38664000, 0x38666000, 0x38668000, 0x3866A000, 0x3866C000, 0x3866E000, 0x38670000, 0x38672000, 0x38674000, 0x38676000, 0x38678000, 0x3867A000, 0x3867C000, 0x3867E000, + 0x38680000, 0x38682000, 0x38684000, 0x38686000, 0x38688000, 0x3868A000, 0x3868C000, 0x3868E000, 0x38690000, 0x38692000, 0x38694000, 0x38696000, 0x38698000, 0x3869A000, 0x3869C000, 0x3869E000, + 0x386A0000, 0x386A2000, 0x386A4000, 0x386A6000, 0x386A8000, 0x386AA000, 0x386AC000, 0x386AE000, 0x386B0000, 0x386B2000, 0x386B4000, 0x386B6000, 0x386B8000, 0x386BA000, 0x386BC000, 0x386BE000, + 0x386C0000, 0x386C2000, 0x386C4000, 0x386C6000, 0x386C8000, 0x386CA000, 0x386CC000, 0x386CE000, 0x386D0000, 0x386D2000, 0x386D4000, 0x386D6000, 0x386D8000, 0x386DA000, 0x386DC000, 0x386DE000, + 0x386E0000, 0x386E2000, 0x386E4000, 0x386E6000, 0x386E8000, 0x386EA000, 0x386EC000, 0x386EE000, 0x386F0000, 0x386F2000, 0x386F4000, 0x386F6000, 0x386F8000, 0x386FA000, 0x386FC000, 0x386FE000, + 0x38700000, 0x38702000, 0x38704000, 0x38706000, 0x38708000, 0x3870A000, 0x3870C000, 0x3870E000, 0x38710000, 0x38712000, 0x38714000, 0x38716000, 0x38718000, 0x3871A000, 0x3871C000, 0x3871E000, + 0x38720000, 0x38722000, 0x38724000, 0x38726000, 0x38728000, 0x3872A000, 0x3872C000, 0x3872E000, 0x38730000, 0x38732000, 0x38734000, 0x38736000, 0x38738000, 0x3873A000, 0x3873C000, 0x3873E000, + 0x38740000, 0x38742000, 0x38744000, 0x38746000, 
0x38748000, 0x3874A000, 0x3874C000, 0x3874E000, 0x38750000, 0x38752000, 0x38754000, 0x38756000, 0x38758000, 0x3875A000, 0x3875C000, 0x3875E000, + 0x38760000, 0x38762000, 0x38764000, 0x38766000, 0x38768000, 0x3876A000, 0x3876C000, 0x3876E000, 0x38770000, 0x38772000, 0x38774000, 0x38776000, 0x38778000, 0x3877A000, 0x3877C000, 0x3877E000, + 0x38780000, 0x38782000, 0x38784000, 0x38786000, 0x38788000, 0x3878A000, 0x3878C000, 0x3878E000, 0x38790000, 0x38792000, 0x38794000, 0x38796000, 0x38798000, 0x3879A000, 0x3879C000, 0x3879E000, + 0x387A0000, 0x387A2000, 0x387A4000, 0x387A6000, 0x387A8000, 0x387AA000, 0x387AC000, 0x387AE000, 0x387B0000, 0x387B2000, 0x387B4000, 0x387B6000, 0x387B8000, 0x387BA000, 0x387BC000, 0x387BE000, + 0x387C0000, 0x387C2000, 0x387C4000, 0x387C6000, 0x387C8000, 0x387CA000, 0x387CC000, 0x387CE000, 0x387D0000, 0x387D2000, 0x387D4000, 0x387D6000, 0x387D8000, 0x387DA000, 0x387DC000, 0x387DE000, + 0x387E0000, 0x387E2000, 0x387E4000, 0x387E6000, 0x387E8000, 0x387EA000, 0x387EC000, 0x387EE000, 0x387F0000, 0x387F2000, 0x387F4000, 0x387F6000, 0x387F8000, 0x387FA000, 0x387FC000, 0x387FE000 + }; + static const unsigned int exponent_table[64] = { + 0x00000000, 0x00800000, 0x01000000, 0x01800000, 0x02000000, 0x02800000, 0x03000000, 0x03800000, 0x04000000, 0x04800000, 0x05000000, 0x05800000, 0x06000000, 0x06800000, 0x07000000, 0x07800000, + 0x08000000, 0x08800000, 0x09000000, 0x09800000, 0x0A000000, 0x0A800000, 0x0B000000, 0x0B800000, 0x0C000000, 0x0C800000, 0x0D000000, 0x0D800000, 0x0E000000, 0x0E800000, 0x0F000000, 0x47800000, + 0x80000000, 0x80800000, 0x81000000, 0x81800000, 0x82000000, 0x82800000, 0x83000000, 0x83800000, 0x84000000, 0x84800000, 0x85000000, 0x85800000, 0x86000000, 0x86800000, 0x87000000, 0x87800000, + 0x88000000, 0x88800000, 0x89000000, 0x89800000, 0x8A000000, 0x8A800000, 0x8B000000, 0x8B800000, 0x8C000000, 0x8C800000, 0x8D000000, 0x8D800000, 0x8E000000, 0x8E800000, 0x8F000000, 0xC7800000 + }; + static const unsigned short offset_table[64] = { + 0, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, + 0, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024 + }; + ConversionBits bits; + bits.i32 = mantissa_table[offset_table[value >> 10] + (value & 0x3FF)] + + exponent_table[value >> 10]; + return bits.f32; +} + +// ================================================================================================= + +// CLBLAST_HALF_H_ +#endif diff --git a/include/clblast_netlib_c.h b/include/clblast_netlib_c.h new file mode 100644 index 000000000..4c54fb188 --- /dev/null +++ b/include/clblast_netlib_c.h @@ -0,0 +1,993 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file contains the Netlib CBLAS interface to the CLBlast BLAS routines, performing all buffer +// copies automatically and running on the default OpenCL platform and device. For full control over +// performance, it is advised to use the regular clblast.h or clblast_c.h headers instead. 
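The FloatToHalf/HalfToFloat helpers defined in clblast_half.h above can be exercised with a simple round trip; because the conversion truncates toward zero, the restored value may differ slightly from the original. A minimal sketch, assuming the header is on the include path:

// Round-trip a float through the 16-bit half representation (truncating conversion).
#include <clblast_half.h>
#include <cstdio>

int main() {
  const float original = 0.1f;
  const half h = FloatToHalf(original);    // 16-bit storage value (typedef'd unsigned short)
  const float restored = HalfToFloat(h);   // back to 32-bit float; some precision is lost
  std::printf("%.8f -> 0x%04X -> %.8f\n", original, (unsigned)h, restored);
  return 0;
}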
+// +// ================================================================================================= + +#ifndef CLBLAST_CLBLAST_NETLIB_C_H_ +#define CLBLAST_CLBLAST_NETLIB_C_H_ + +// Exports library functions under Windows when building a DLL. See also: +// https://msdn.microsoft.com/en-us/library/a90k134d.aspx +#if defined(_WIN32) && defined(CLBLAST_DLL) + #if defined(COMPILING_DLL) + #define PUBLIC_API __declspec(dllexport) + #else + #define PUBLIC_API __declspec(dllimport) + #endif +#else + #define PUBLIC_API +#endif + +// The C interface +#ifdef __cplusplus +extern "C" { +#endif + +// ================================================================================================= + +// Matrix layout and transpose types +typedef enum CLBlastLayout_ { CLBlastLayoutRowMajor = 101, + CLBlastLayoutColMajor = 102 } CLBlastLayout; +typedef enum CLBlastTranspose_ { CLBlastTransposeNo = 111, CLBlastTransposeYes = 112, + CLBlastTransposeConjugate = 113 } CLBlastTranspose; +typedef enum CLBlastTriangle_ { CLBlastTriangleUpper = 121, + CLBlastTriangleLower = 122 } CLBlastTriangle; +typedef enum CLBlastDiagonal_ { CLBlastDiagonalNonUnit = 131, + CLBlastDiagonalUnit = 132 } CLBlastDiagonal; +typedef enum CLBlastSide_ { CLBlastSideLeft = 141, CLBlastSideRight = 142 } CLBlastSide; +typedef enum CLBlastKernelMode_ { CLBlastKernelModeCrossCorrelation = 151, CLBlastKernelModeConvolution = 152 } CLBlastKernelMode; + +// For full compatibility with CBLAS +typedef CLBlastLayout CBLAS_ORDER; +typedef CLBlastTranspose CBLAS_TRANSPOSE; +typedef CLBlastTriangle CBLAS_UPLO; +typedef CLBlastDiagonal CBLAS_DIAG; +typedef CLBlastSide CBLAS_SIDE; +#define CblasRowMajor CLBlastLayoutRowMajor +#define CblasColMajor CLBlastLayoutColMajor +#define CblasNoTrans CLBlastTransposeNo +#define CblasTrans CLBlastTransposeYes +#define CblasConjTrans CLBlastTransposeConjugate +#define CblasUpper CLBlastTriangleUpper +#define CblasLower CLBlastTriangleLower +#define CblasNonUnit CLBlastDiagonalNonUnit +#define CblasUnit CLBlastDiagonalUnit +#define CblasLeft CLBlastSideLeft +#define CblasRight CLBlastSideRight + +// ================================================================================================= +// BLAS level-1 (vector-vector) routines +// ================================================================================================= + +// Generate givens plane rotation: SROTG/DROTG +void PUBLIC_API cblas_srotg(float* sa, + float* sb, + float* sc, + float* ss); +void PUBLIC_API cblas_drotg(double* sa, + double* sb, + double* sc, + double* ss); + +// Generate modified givens plane rotation: SROTMG/DROTMG +void PUBLIC_API cblas_srotmg(float* sd1, + float* sd2, + float* sx1, + const float sy1, + float* sparam); +void PUBLIC_API cblas_drotmg(double* sd1, + double* sd2, + double* sx1, + const double sy1, + double* sparam); + +// Apply givens plane rotation: SROT/DROT +void PUBLIC_API cblas_srot(const int n, + float* x, const int x_inc, + float* y, const int y_inc, + const float cos, + const float sin); +void PUBLIC_API cblas_drot(const int n, + double* x, const int x_inc, + double* y, const int y_inc, + const double cos, + const double sin); + +// Apply modified givens plane rotation: SROTM/DROTM +void PUBLIC_API cblas_srotm(const int n, + float* x, const int x_inc, + float* y, const int y_inc, + float* sparam); +void PUBLIC_API cblas_drotm(const int n, + double* x, const int x_inc, + double* y, const int y_inc, + double* sparam); + +// Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP/HSWAP +void PUBLIC_API
cblas_sswap(const int n, + float* x, const int x_inc, + float* y, const int y_inc); +void PUBLIC_API cblas_dswap(const int n, + double* x, const int x_inc, + double* y, const int y_inc); +void PUBLIC_API cblas_cswap(const int n, + void* x, const int x_inc, + void* y, const int y_inc); +void PUBLIC_API cblas_zswap(const int n, + void* x, const int x_inc, + void* y, const int y_inc); + +// Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL/HSCAL +void PUBLIC_API cblas_sscal(const int n, + const float alpha, + float* x, const int x_inc); +void PUBLIC_API cblas_dscal(const int n, + const double alpha, + double* x, const int x_inc); +void PUBLIC_API cblas_cscal(const int n, + const void* alpha, + void* x, const int x_inc); +void PUBLIC_API cblas_zscal(const int n, + const void* alpha, + void* x, const int x_inc); + +// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY/HCOPY +void PUBLIC_API cblas_scopy(const int n, + const float* x, const int x_inc, + float* y, const int y_inc); +void PUBLIC_API cblas_dcopy(const int n, + const double* x, const int x_inc, + double* y, const int y_inc); +void PUBLIC_API cblas_ccopy(const int n, + const void* x, const int x_inc, + void* y, const int y_inc); +void PUBLIC_API cblas_zcopy(const int n, + const void* x, const int x_inc, + void* y, const int y_inc); + +// Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY/HAXPY +void PUBLIC_API cblas_saxpy(const int n, + const float alpha, + const float* x, const int x_inc, + float* y, const int y_inc); +void PUBLIC_API cblas_daxpy(const int n, + const double alpha, + const double* x, const int x_inc, + double* y, const int y_inc); +void PUBLIC_API cblas_caxpy(const int n, + const void* alpha, + const void* x, const int x_inc, + void* y, const int y_inc); +void PUBLIC_API cblas_zaxpy(const int n, + const void* alpha, + const void* x, const int x_inc, + void* y, const int y_inc); + +// Dot product of two vectors: SDOT/DDOT/HDOT +float PUBLIC_API cblas_sdot(const int n, + const float* x, const int x_inc, + const float* y, const int y_inc); +double PUBLIC_API cblas_ddot(const int n, + const double* x, const int x_inc, + const double* y, const int y_inc); + +// Dot product of two complex vectors: CDOTU/ZDOTU +void PUBLIC_API cblas_cdotu_sub(const int n, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* dot); +void PUBLIC_API cblas_zdotu_sub(const int n, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* dot); + +// Dot product of two complex vectors, one conjugated: CDOTC/ZDOTC +void PUBLIC_API cblas_cdotc_sub(const int n, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* dot); +void PUBLIC_API cblas_zdotc_sub(const int n, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* dot); + +// Euclidian norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2/HNRM2 +float PUBLIC_API cblas_snrm2(const int n, + const float* x, const int x_inc); +double PUBLIC_API cblas_dnrm2(const int n, + const double* x, const int x_inc); +float PUBLIC_API cblas_scnrm2(const int n, + const void* x, const int x_inc); +double PUBLIC_API cblas_dznrm2(const int n, + const void* x, const int x_inc); + +// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM/HASUM +float PUBLIC_API cblas_sasum(const int n, + const float* x, const int x_inc); +double PUBLIC_API cblas_dasum(const int n, + const double* x, const int x_inc); +float PUBLIC_API cblas_scasum(const int n, + const void* x, const int x_inc); +double PUBLIC_API cblas_dzasum(const int n, + const void* x, const int 
x_inc); + +// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM/HSUM +float PUBLIC_API cblas_ssum(const int n, + const float* x, const int x_inc); +double PUBLIC_API cblas_dsum(const int n, + const double* x, const int x_inc); +float PUBLIC_API cblas_scsum(const int n, + const void* x, const int x_inc); +double PUBLIC_API cblas_dzsum(const int n, + const void* x, const int x_inc); + +// Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX +int PUBLIC_API cblas_isamax(const int n, + const float* x, const int x_inc); +int PUBLIC_API cblas_idamax(const int n, + const double* x, const int x_inc); +int PUBLIC_API cblas_icamax(const int n, + const void* x, const int x_inc); +int PUBLIC_API cblas_izamax(const int n, + const void* x, const int x_inc); + +// Index of absolute minimum value in a vector (non-BLAS function): iSAMIN/iDAMIN/iCAMIN/iZAMIN/iHAMIN +int PUBLIC_API cblas_isamin(const int n, + const float* x, const int x_inc); +int PUBLIC_API cblas_idamin(const int n, + const double* x, const int x_inc); +int PUBLIC_API cblas_icamin(const int n, + const void* x, const int x_inc); +int PUBLIC_API cblas_izamin(const int n, + const void* x, const int x_inc); + +// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX/iHMAX +int PUBLIC_API cblas_ismax(const int n, + const float* x, const int x_inc); +int PUBLIC_API cblas_idmax(const int n, + const double* x, const int x_inc); +int PUBLIC_API cblas_icmax(const int n, + const void* x, const int x_inc); +int PUBLIC_API cblas_izmax(const int n, + const void* x, const int x_inc); + +// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN/iHMIN +int PUBLIC_API cblas_ismin(const int n, + const float* x, const int x_inc); +int PUBLIC_API cblas_idmin(const int n, + const double* x, const int x_inc); +int PUBLIC_API cblas_icmin(const int n, + const void* x, const int x_inc); +int PUBLIC_API cblas_izmin(const int n, + const void* x, const int x_inc); + +// ================================================================================================= +// BLAS level-2 (matrix-vector) routines +// ================================================================================================= + +// General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV/HGEMV +void PUBLIC_API cblas_sgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, + const int m, const int n, + const float alpha, + const float* a, const int a_ld, + const float* x, const int x_inc, + const float beta, + float* y, const int y_inc); +void PUBLIC_API cblas_dgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, + const int m, const int n, + const double alpha, + const double* a, const int a_ld, + const double* x, const int x_inc, + const double beta, + double* y, const int y_inc); +void PUBLIC_API cblas_cgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, + const int m, const int n, + const void* alpha, + const void* a, const int a_ld, + const void* x, const int x_inc, + const void* beta, + void* y, const int y_inc); +void PUBLIC_API cblas_zgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, + const int m, const int n, + const void* alpha, + const void* a, const int a_ld, + const void* x, const int x_inc, + const void* beta, + void* y, const int y_inc); + +// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV/HGBMV +void PUBLIC_API cblas_sgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, 
+ const int m, const int n, const int kl, const int ku, + const float alpha, + const float* a, const int a_ld, + const float* x, const int x_inc, + const float beta, + float* y, const int y_inc); +void PUBLIC_API cblas_dgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, + const int m, const int n, const int kl, const int ku, + const double alpha, + const double* a, const int a_ld, + const double* x, const int x_inc, + const double beta, + double* y, const int y_inc); +void PUBLIC_API cblas_cgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, + const int m, const int n, const int kl, const int ku, + const void* alpha, + const void* a, const int a_ld, + const void* x, const int x_inc, + const void* beta, + void* y, const int y_inc); +void PUBLIC_API cblas_zgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, + const int m, const int n, const int kl, const int ku, + const void* alpha, + const void* a, const int a_ld, + const void* x, const int x_inc, + const void* beta, + void* y, const int y_inc); + +// Hermitian matrix-vector multiplication: CHEMV/ZHEMV +void PUBLIC_API cblas_chemv(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const void* alpha, + const void* a, const int a_ld, + const void* x, const int x_inc, + const void* beta, + void* y, const int y_inc); +void PUBLIC_API cblas_zhemv(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const void* alpha, + const void* a, const int a_ld, + const void* x, const int x_inc, + const void* beta, + void* y, const int y_inc); + +// Hermitian banded matrix-vector multiplication: CHBMV/ZHBMV +void PUBLIC_API cblas_chbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, const int k, + const void* alpha, + const void* a, const int a_ld, + const void* x, const int x_inc, + const void* beta, + void* y, const int y_inc); +void PUBLIC_API cblas_zhbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, const int k, + const void* alpha, + const void* a, const int a_ld, + const void* x, const int x_inc, + const void* beta, + void* y, const int y_inc); + +// Hermitian packed matrix-vector multiplication: CHPMV/ZHPMV +void PUBLIC_API cblas_chpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const void* alpha, + const void* ap, + const void* x, const int x_inc, + const void* beta, + void* y, const int y_inc); +void PUBLIC_API cblas_zhpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const void* alpha, + const void* ap, + const void* x, const int x_inc, + const void* beta, + void* y, const int y_inc); + +// Symmetric matrix-vector multiplication: SSYMV/DSYMV/HSYMV +void PUBLIC_API cblas_ssymv(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const float alpha, + const float* a, const int a_ld, + const float* x, const int x_inc, + const float beta, + float* y, const int y_inc); +void PUBLIC_API cblas_dsymv(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const double alpha, + const double* a, const int a_ld, + const double* x, const int x_inc, + const double beta, + double* y, const int y_inc); + +// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV/HSBMV +void PUBLIC_API cblas_ssbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, const int k, + const float alpha, + const float* a, const int a_ld, + const float* x, const int x_inc, + const float beta, + 
float* y, const int y_inc); +void PUBLIC_API cblas_dsbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, const int k, + const double alpha, + const double* a, const int a_ld, + const double* x, const int x_inc, + const double beta, + double* y, const int y_inc); + +// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV/HSPMV +void PUBLIC_API cblas_sspmv(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const float alpha, + const float* ap, + const float* x, const int x_inc, + const float beta, + float* y, const int y_inc); +void PUBLIC_API cblas_dspmv(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const double alpha, + const double* ap, + const double* x, const int x_inc, + const double beta, + double* y, const int y_inc); + +// Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV/HTRMV +void PUBLIC_API cblas_strmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, + const float* a, const int a_ld, + float* x, const int x_inc); +void PUBLIC_API cblas_dtrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, + const double* a, const int a_ld, + double* x, const int x_inc); +void PUBLIC_API cblas_ctrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, + const void* a, const int a_ld, + void* x, const int x_inc); +void PUBLIC_API cblas_ztrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, + const void* a, const int a_ld, + void* x, const int x_inc); + +// Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV/HTBMV +void PUBLIC_API cblas_stbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, const int k, + const float* a, const int a_ld, + float* x, const int x_inc); +void PUBLIC_API cblas_dtbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, const int k, + const double* a, const int a_ld, + double* x, const int x_inc); +void PUBLIC_API cblas_ctbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, const int k, + const void* a, const int a_ld, + void* x, const int x_inc); +void PUBLIC_API cblas_ztbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, const int k, + const void* a, const int a_ld, + void* x, const int x_inc); + +// Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV/HTPMV +void PUBLIC_API cblas_stpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, + const float* ap, + float* x, const int x_inc); +void PUBLIC_API cblas_dtpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, + const double* ap, + double* x, const int x_inc); +void PUBLIC_API cblas_ctpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const 
CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, + const void* ap, + void* x, const int x_inc); +void PUBLIC_API cblas_ztpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, + const void* ap, + void* x, const int x_inc); + +// Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV +void PUBLIC_API cblas_strsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, + const float* a, const int a_ld, + float* x, const int x_inc); +void PUBLIC_API cblas_dtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, + const double* a, const int a_ld, + double* x, const int x_inc); +void PUBLIC_API cblas_ctrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, + const void* a, const int a_ld, + void* x, const int x_inc); +void PUBLIC_API cblas_ztrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, + const void* a, const int a_ld, + void* x, const int x_inc); + +// Solves a banded triangular system of equations: STBSV/DTBSV/CTBSV/ZTBSV +void PUBLIC_API cblas_stbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, const int k, + const float* a, const int a_ld, + float* x, const int x_inc); +void PUBLIC_API cblas_dtbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, const int k, + const double* a, const int a_ld, + double* x, const int x_inc); +void PUBLIC_API cblas_ctbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, const int k, + const void* a, const int a_ld, + void* x, const int x_inc); +void PUBLIC_API cblas_ztbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, const int k, + const void* a, const int a_ld, + void* x, const int x_inc); + +// Solves a packed triangular system of equations: STPSV/DTPSV/CTPSV/ZTPSV +void PUBLIC_API cblas_stpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, + const float* ap, + float* x, const int x_inc); +void PUBLIC_API cblas_dtpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, + const double* ap, + double* x, const int x_inc); +void PUBLIC_API cblas_ctpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, + const void* ap, + void* x, const int x_inc); +void PUBLIC_API cblas_ztpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, + const void* ap, + void* x, const int x_inc); + +// General rank-1 matrix update: SGER/DGER/HGER +void PUBLIC_API cblas_sger(const CLBlastLayout layout, + const int m, const int n, + const float alpha, + const 
float* x, const int x_inc, + const float* y, const int y_inc, + float* a, const int a_ld); +void PUBLIC_API cblas_dger(const CLBlastLayout layout, + const int m, const int n, + const double alpha, + const double* x, const int x_inc, + const double* y, const int y_inc, + double* a, const int a_ld); + +// General rank-1 complex matrix update: CGERU/ZGERU +void PUBLIC_API cblas_cgeru(const CLBlastLayout layout, + const int m, const int n, + const void* alpha, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* a, const int a_ld); +void PUBLIC_API cblas_zgeru(const CLBlastLayout layout, + const int m, const int n, + const void* alpha, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* a, const int a_ld); + +// General rank-1 complex conjugated matrix update: CGERC/ZGERC +void PUBLIC_API cblas_cgerc(const CLBlastLayout layout, + const int m, const int n, + const void* alpha, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* a, const int a_ld); +void PUBLIC_API cblas_zgerc(const CLBlastLayout layout, + const int m, const int n, + const void* alpha, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* a, const int a_ld); + +// Hermitian rank-1 matrix update: CHER/ZHER +void PUBLIC_API cblas_cher(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const float alpha, + const void* x, const int x_inc, + void* a, const int a_ld); +void PUBLIC_API cblas_zher(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const double alpha, + const void* x, const int x_inc, + void* a, const int a_ld); + +// Hermitian packed rank-1 matrix update: CHPR/ZHPR +void PUBLIC_API cblas_chpr(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const float alpha, + const void* x, const int x_inc, + void* ap); +void PUBLIC_API cblas_zhpr(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const double alpha, + const void* x, const int x_inc, + void* ap); + +// Hermitian rank-2 matrix update: CHER2/ZHER2 +void PUBLIC_API cblas_cher2(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const void* alpha, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* a, const int a_ld); +void PUBLIC_API cblas_zher2(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const void* alpha, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* a, const int a_ld); + +// Hermitian packed rank-2 matrix update: CHPR2/ZHPR2 +void PUBLIC_API cblas_chpr2(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const void* alpha, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* ap); +void PUBLIC_API cblas_zhpr2(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const void* alpha, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* ap); + +// Symmetric rank-1 matrix update: SSYR/DSYR/HSYR +void PUBLIC_API cblas_ssyr(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const float alpha, + const float* x, const int x_inc, + float* a, const int a_ld); +void PUBLIC_API cblas_dsyr(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const double alpha, + const double* x, const int x_inc, + double* a, const int a_ld); + +// Symmetric packed rank-1 matrix update: SSPR/DSPR/HSPR +void PUBLIC_API cblas_sspr(const CLBlastLayout 
layout, const CLBlastTriangle triangle, + const int n, + const float alpha, + const float* x, const int x_inc, + float* ap); +void PUBLIC_API cblas_dspr(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const double alpha, + const double* x, const int x_inc, + double* ap); + +// Symmetric rank-2 matrix update: SSYR2/DSYR2/HSYR2 +void PUBLIC_API cblas_ssyr2(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const float alpha, + const float* x, const int x_inc, + const float* y, const int y_inc, + float* a, const int a_ld); +void PUBLIC_API cblas_dsyr2(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const double alpha, + const double* x, const int x_inc, + const double* y, const int y_inc, + double* a, const int a_ld); + +// Symmetric packed rank-2 matrix update: SSPR2/DSPR2/HSPR2 +void PUBLIC_API cblas_sspr2(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const float alpha, + const float* x, const int x_inc, + const float* y, const int y_inc, + float* ap); +void PUBLIC_API cblas_dspr2(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const double alpha, + const double* x, const int x_inc, + const double* y, const int y_inc, + double* ap); + +// ================================================================================================= +// BLAS level-3 (matrix-matrix) routines +// ================================================================================================= + +// General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM/HGEMM +void PUBLIC_API cblas_sgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, + const int m, const int n, const int k, + const float alpha, + const float* a, const int a_ld, + const float* b, const int b_ld, + const float beta, + float* c, const int c_ld); +void PUBLIC_API cblas_dgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, + const int m, const int n, const int k, + const double alpha, + const double* a, const int a_ld, + const double* b, const int b_ld, + const double beta, + double* c, const int c_ld); +void PUBLIC_API cblas_cgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, + const int m, const int n, const int k, + const void* alpha, + const void* a, const int a_ld, + const void* b, const int b_ld, + const void* beta, + void* c, const int c_ld); +void PUBLIC_API cblas_zgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, + const int m, const int n, const int k, + const void* alpha, + const void* a, const int a_ld, + const void* b, const int b_ld, + const void* beta, + void* c, const int c_ld); + +// Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM/HSYMM +void PUBLIC_API cblas_ssymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, + const int m, const int n, + const float alpha, + const float* a, const int a_ld, + const float* b, const int b_ld, + const float beta, + float* c, const int c_ld); +void PUBLIC_API cblas_dsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, + const int m, const int n, + const double alpha, + const double* a, const int a_ld, + const double* b, const int b_ld, + const double beta, + double* c, const int c_ld); +void PUBLIC_API cblas_csymm(const CLBlastLayout layout, const 
CLBlastSide side, const CLBlastTriangle triangle, + const int m, const int n, + const void* alpha, + const void* a, const int a_ld, + const void* b, const int b_ld, + const void* beta, + void* c, const int c_ld); +void PUBLIC_API cblas_zsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, + const int m, const int n, + const void* alpha, + const void* a, const int a_ld, + const void* b, const int b_ld, + const void* beta, + void* c, const int c_ld); + +// Hermitian matrix-matrix multiplication: CHEMM/ZHEMM +void PUBLIC_API cblas_chemm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, + const int m, const int n, + const void* alpha, + const void* a, const int a_ld, + const void* b, const int b_ld, + const void* beta, + void* c, const int c_ld); +void PUBLIC_API cblas_zhemm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, + const int m, const int n, + const void* alpha, + const void* a, const int a_ld, + const void* b, const int b_ld, + const void* beta, + void* c, const int c_ld); + +// Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK/HSYRK +void PUBLIC_API cblas_ssyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, + const int n, const int k, + const float alpha, + const float* a, const int a_ld, + const float beta, + float* c, const int c_ld); +void PUBLIC_API cblas_dsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, + const int n, const int k, + const double alpha, + const double* a, const int a_ld, + const double beta, + double* c, const int c_ld); +void PUBLIC_API cblas_csyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, + const int n, const int k, + const void* alpha, + const void* a, const int a_ld, + const void* beta, + void* c, const int c_ld); +void PUBLIC_API cblas_zsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, + const int n, const int k, + const void* alpha, + const void* a, const int a_ld, + const void* beta, + void* c, const int c_ld); + +// Rank-K update of a hermitian matrix: CHERK/ZHERK +void PUBLIC_API cblas_cherk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, + const int n, const int k, + const float alpha, + const void* a, const int a_ld, + const float beta, + void* c, const int c_ld); +void PUBLIC_API cblas_zherk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, + const int n, const int k, + const double alpha, + const void* a, const int a_ld, + const double beta, + void* c, const int c_ld); + +// Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K/HSYR2K +void PUBLIC_API cblas_ssyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, + const int n, const int k, + const float alpha, + const float* a, const int a_ld, + const float* b, const int b_ld, + const float beta, + float* c, const int c_ld); +void PUBLIC_API cblas_dsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, + const int n, const int k, + const double alpha, + const double* a, const int a_ld, + const double* b, const int b_ld, + const double beta, + double* c, const int c_ld); +void PUBLIC_API cblas_csyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose 
ab_transpose, + const int n, const int k, + const void* alpha, + const void* a, const int a_ld, + const void* b, const int b_ld, + const void* beta, + void* c, const int c_ld); +void PUBLIC_API cblas_zsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, + const int n, const int k, + const void* alpha, + const void* a, const int a_ld, + const void* b, const int b_ld, + const void* beta, + void* c, const int c_ld); + +// Rank-2K update of a hermitian matrix: CHER2K/ZHER2K +void PUBLIC_API cblas_cher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, + const int n, const int k, + const void* alpha, + const void* a, const int a_ld, + const void* b, const int b_ld, + const float beta, + void* c, const int c_ld); +void PUBLIC_API cblas_zher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, + const int n, const int k, + const void* alpha, + const void* a, const int a_ld, + const void* b, const int b_ld, + const double beta, + void* c, const int c_ld); + +// Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM/HTRMM +void PUBLIC_API cblas_strmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int m, const int n, + const float alpha, + const float* a, const int a_ld, + float* b, const int b_ld); +void PUBLIC_API cblas_dtrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int m, const int n, + const double alpha, + const double* a, const int a_ld, + double* b, const int b_ld); +void PUBLIC_API cblas_ctrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int m, const int n, + const void* alpha, + const void* a, const int a_ld, + void* b, const int b_ld); +void PUBLIC_API cblas_ztrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int m, const int n, + const void* alpha, + const void* a, const int a_ld, + void* b, const int b_ld); + +// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM +void PUBLIC_API cblas_strsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int m, const int n, + const float alpha, + const float* a, const int a_ld, + float* b, const int b_ld); +void PUBLIC_API cblas_dtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int m, const int n, + const double alpha, + const double* a, const int a_ld, + double* b, const int b_ld); +void PUBLIC_API cblas_ctrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int m, const int n, + const void* alpha, + const void* a, const int a_ld, + void* b, const int b_ld); +void PUBLIC_API cblas_ztrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int m, const int n, + const void* alpha, + const void* a, 
const int a_ld, + void* b, const int b_ld); + +// ================================================================================================= +// Extra non-BLAS routines (level-X) +// ================================================================================================= + +// Element-wise vector product (Hadamard): SHAD/DHAD/CHAD/ZHAD/HHAD +void PUBLIC_API cblas_shad(const int n, + const float alpha, + const float* x, const int x_inc, + const float* y, const int y_inc, + const float beta, + float* z, const int z_inc); +void PUBLIC_API cblas_dhad(const int n, + const double alpha, + const double* x, const int x_inc, + const double* y, const int y_inc, + const double beta, + double* z, const int z_inc); +void PUBLIC_API cblas_chad(const int n, + const void* alpha, + const void* x, const int x_inc, + const void* y, const int y_inc, + const void* beta, + void* z, const int z_inc); +void PUBLIC_API cblas_zhad(const int n, + const void* alpha, + const void* x, const int x_inc, + const void* y, const int y_inc, + const void* beta, + void* z, const int z_inc); + +// Scaling and out-place transpose/copy (non-BLAS function): SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY/HOMATCOPY +void PUBLIC_API cblas_somatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, + const int m, const int n, + const float alpha, + const float* a, const int a_ld, + float* b, const int b_ld); +void PUBLIC_API cblas_domatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, + const int m, const int n, + const double alpha, + const double* a, const int a_ld, + double* b, const int b_ld); +void PUBLIC_API cblas_comatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, + const int m, const int n, + const void* alpha, + const void* a, const int a_ld, + void* b, const int b_ld); +void PUBLIC_API cblas_zomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, + const int m, const int n, + const void* alpha, + const void* a, const int a_ld, + void* b, const int b_ld); + +// Im2col function (non-BLAS function): SIM2COL/DIM2COL/CIM2COL/ZIM2COL/HIM2COL +void PUBLIC_API cblas_sim2col(const CLBlastKernelMode kernel_mode, + const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, + const float* im, + float* col); +void PUBLIC_API cblas_dim2col(const CLBlastKernelMode kernel_mode, + const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, + const double* im, + double* col); +void PUBLIC_API cblas_cim2col(const CLBlastKernelMode kernel_mode, + const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, + const void* im, + void* col); +void PUBLIC_API cblas_zim2col(const CLBlastKernelMode kernel_mode, + const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, + const void* im, + void* col); + +// Col2im function (non-BLAS function): SCOL2IM/DCOL2IM/CCOL2IM/ZCOL2IM/HCOL2IM +void PUBLIC_API cblas_scol2im(const CLBlastKernelMode kernel_mode, + const int 
channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, + const float* col, + float* im); +void PUBLIC_API cblas_dcol2im(const CLBlastKernelMode kernel_mode, + const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, + const double* col, + double* im); +void PUBLIC_API cblas_ccol2im(const CLBlastKernelMode kernel_mode, + const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, + const void* col, + void* im); +void PUBLIC_API cblas_zcol2im(const CLBlastKernelMode kernel_mode, + const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, + const void* col, + void* im); + +// ================================================================================================= + +#ifdef __cplusplus +} // extern "C" +#endif + +// CLBLAST_CLBLAST_NETLIB_C_H_ +#endif diff --git a/koboldcpp.py b/koboldcpp.py index 329cb5b08..4f1144b9e 100644 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -200,7 +200,7 @@ maxctx = 2048 maxlen = 128 modelbusy = False defaultport = 5001 -KcppVersion = "1.22" +KcppVersion = "1.23" class ServerRequestHandler(http.server.SimpleHTTPRequestHandler): sys_version = "" diff --git a/llama.cpp b/llama.cpp index b4c8a8baa..4dd86ff2b 100644 --- a/llama.cpp +++ b/llama.cpp @@ -11,6 +11,8 @@ #include "ggml.h" #ifdef GGML_USE_CUBLAS #include "ggml-cuda.h" +#elif defined(GGML_USE_CLBLAST) +#include "ggml-opencl.h" #endif #include @@ -1041,7 +1043,7 @@ static void llama_model_load_internal( ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? 
&lctx.model.mlock_mmap : NULL); model.mapping = std::move(ml->mapping); -#ifdef GGML_USE_CUBLAS +#if defined(GGML_USE_CUBLAS) { const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer)); @@ -1067,6 +1069,32 @@ static void llama_model_load_internal( fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024); } +#elif defined(GGML_USE_CLBLAST) + { + const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer)); + + fprintf(stderr, "%s: [opencl] offloading %d layers to GPU\n", __func__, n_gpu); + + size_t vram_total = 0; + + for (int i = 0; i < n_gpu; ++i) { + const auto & layer = model.layers[i]; + + ggml_cl_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq); + ggml_cl_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk); + ggml_cl_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv); + ggml_cl_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo); + ggml_cl_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1); + ggml_cl_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2); + ggml_cl_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3); + } + if (n_gpu_layers > (int) hparams.n_layer) { + fprintf(stderr, "%s: [opencl] offloading output layer to GPU\n", __func__); + ggml_cl_transform_tensor(model.output); vram_total += ggml_nbytes(model.output); + } + + fprintf(stderr, "%s: [opencl] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024); + } #else (void) n_gpu_layers; #endif
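
The table-driven conversion vendored in include/clblast_half.h earlier in this patch can be sanity-checked by hand. The sketch below is an illustration only, not part of the patch: it walks one lookup for the binary16 value 0x3C00 (which encodes 1.0f) and assumes the conventional table layout this method uses, in particular that mantissa_table[1024] is 0x38000000; the offset_table and exponent_table entries it reads are the ones visible in the tables above.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Walks HalfToFloat(v) = mantissa_table[offset_table[v >> 10] + (v & 0x3FF)]
 *                      + exponent_table[v >> 10]
 * for v = 0x3C00 (binary16 for 1.0), with the relevant table entries inlined. */
int main(void) {
    const uint16_t value    = 0x3C00;
    const uint32_t index    = 1024 + (value & 0x3FF); /* offset_table[15] = 1024, so index = 1024 */
    const uint32_t mantissa = 0x38000000u;            /* assumed: mantissa_table[1024] */
    const uint32_t exponent = 0x07800000u;            /* exponent_table[15], from the table above */
    const uint32_t bits     = mantissa + exponent;    /* 0x3F800000, the bit pattern of 1.0f */
    float result;
    memcpy(&result, &bits, sizeof result);            /* reinterpret the bits as an IEEE 754 float */
    printf("0x%04X -> index %u -> 0x%08X -> %f\n",
           (unsigned)value, (unsigned)index, (unsigned)bits, result);
    return 0;
}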
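
Since the new include/clblast_netlib_c.h header exposes CLBlast through plain Netlib CBLAS signatures, with buffer copies and device selection handled internally per its header comment, it can be exercised without touching the OpenCL API directly. The sketch below is an illustration only, not part of the patch: it assumes the program is linked against a CLBlast build that has the optional Netlib interface compiled in, and uses the cblas_sgemm signature declared above.

#include <stdio.h>
#include "clblast_netlib_c.h"

/* C = alpha*A*B + beta*C for two row-major 2x2 matrices, using the
 * CBLAS-style entry point declared in clblast_netlib_c.h. The wrapper
 * copies the host buffers to the default OpenCL device and back. */
int main(void) {
    const int m = 2, n = 2, k = 2;
    float a[4] = {1.0f, 2.0f, 3.0f, 4.0f};
    float b[4] = {5.0f, 6.0f, 7.0f, 8.0f};
    float c[4] = {0.0f, 0.0f, 0.0f, 0.0f};
    cblas_sgemm(CLBlastLayoutRowMajor, CLBlastTransposeNo, CLBlastTransposeNo,
                m, n, k,
                1.0f,   /* alpha */
                a, k,   /* A, leading dimension k */
                b, n,   /* B, leading dimension n */
                0.0f,   /* beta  */
                c, n);  /* C, leading dimension n */
    /* Expected result: [19 22; 43 50] */
    printf("%.0f %.0f\n%.0f %.0f\n", c[0], c[1], c[2], c[3]);
    return 0;
}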