Merge 'origin/master' into hipblas

This commit is contained in:
Henri Vasserman 2023-07-02 15:39:57 +03:00
commit 15db19ae7b
No known key found for this signature in database
GPG key ID: 2995FC0F58B1A986
21 changed files with 589 additions and 260 deletions

View file

@ -419,11 +419,6 @@ if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES
if (MSVC)
# TODO: arm msvc?
else()
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")
# Apple M1, M2, etc.
# Raspberry Pi 3, 4, Zero 2 (64-bit)
add_compile_options(-mcpu=native)
endif()
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6")
# Raspberry Pi 1, Zero
add_compile_options(-mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access)

View file

@ -85,6 +85,7 @@ as the main playground for developing new features for the [ggml](https://github
- [X] [OpenBuddy 🐶 (Multilingual)](https://github.com/OpenBuddy/OpenBuddy)
- [X] [Pygmalion 7B / Metharme 7B](#using-pygmalion-7b--metharme-7b)
- [X] [WizardLM](https://github.com/nlpxucan/WizardLM)
- [X] [Baichuan-7B](https://huggingface.co/baichuan-inc/baichuan-7B)
**Bindings:**

View file

@ -136,7 +136,7 @@ def find_n_mult(n_ff: int, n_embd: int) -> int:
calc_ff = (((8*n_embd) // 3 + n_mult - 1) // n_mult)*n_mult
if calc_ff == n_ff:
return n_mult
return 1
raise Exception(f"failed to find n_mult for (n_ff={n_ff}, n_embd={n_embd}).")
@dataclass
class Params:
@ -321,6 +321,10 @@ class Tensor(metaclass=ABCMeta):
@abstractmethod
def permute(self, n_head: int) -> 'Tensor': ...
@abstractmethod
def permute_part(self, n_part: int, n_head: int) -> 'UnquantizedTensor': ...
@abstractmethod
def part(self, n_part: int) -> 'UnquantizedTensor': ...
@abstractmethod
def to_ggml(self) -> 'GGMLCompatibleTensor': ...
@ -345,6 +349,14 @@ class UnquantizedTensor(Tensor):
def to_ggml(self) -> 'UnquantizedTensor':
return self
def permute_part(self, n_part: int, n_head: int) -> 'UnquantizedTensor':
r = self.ndarray.shape[0] // 3
return UnquantizedTensor(permute(self.ndarray[r * n_part : r * n_part + r, ...], n_head))
def part(self, n_part: int) -> 'UnquantizedTensor':
r = self.ndarray.shape[0] // 3
return UnquantizedTensor(self.ndarray[r * n_part : r * n_part + r, ...])
def permute(self, n_head: int) -> 'UnquantizedTensor':
return UnquantizedTensor(permute(self.ndarray, n_head))
@ -642,6 +654,19 @@ def permute_lazy(lazy_tensor: LazyTensor, n_head: int) -> LazyTensor:
return lazy_tensor.load().permute(n_head)
return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}) ' + lazy_tensor.description)
def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int) -> LazyTensor:
def load() -> Tensor:
return lazy_tensor.load().permute_part(n_part, n_head)
s = lazy_tensor.shape.copy()
s[0] = s[0] // 3
return LazyTensor(load, s, lazy_tensor.data_type, f'permute({n_head}) ' + lazy_tensor.description)
def part_lazy(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor:
def load() -> Tensor:
return lazy_tensor.load().part(n_part)
s = lazy_tensor.shape.copy()
s[0] = s[0] // 3
return LazyTensor(load, s, lazy_tensor.data_type, 'part ' + lazy_tensor.description)
def convert_transformers_to_orig(model: LazyModel, params: Params) -> LazyModel:
out: LazyModel = {}
@ -650,11 +675,17 @@ def convert_transformers_to_orig(model: LazyModel, params: Params) -> LazyModel:
out["output.weight"] = model["lm_head.weight"]
for i in itertools.count():
if f"model.layers.{i}.self_attn.q_proj.weight" not in model:
break
if f"model.layers.{i}.self_attn.q_proj.weight" in model:
out[f"layers.{i}.attention.wq.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head)
out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head)
out[f"layers.{i}.attention.wv.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
elif f"model.layers.{i}.self_attn.W_pack.weight" in model:
out[f"layers.{i}.attention.wq.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head)
out[f"layers.{i}.attention.wk.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 1, params.n_head)
out[f"layers.{i}.attention.wv.weight"] = part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 2)
else:
break
out[f"layers.{i}.attention.wo.weight"] = model[f"model.layers.{i}.self_attn.o_proj.weight"]
out[f"layers.{i}.feed_forward.w1.weight"] = model[f"model.layers.{i}.mlp.gate_proj.weight"]

View file

@ -110,7 +110,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
invalid_param = true;
break;
}
params.seed = std::stoi(argv[i]);
params.seed = std::stoul(argv[i]);
} else if (arg == "-t" || arg == "--threads") {
if (++i >= argc) {
invalid_param = true;

View file

@ -22,7 +22,7 @@
int32_t get_num_physical_cores();
struct gpt_params {
int32_t seed = -1; // RNG seed
uint32_t seed = -1; // RNG seed
int32_t n_threads = get_num_physical_cores();
int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 512; // context size

View file

@ -210,9 +210,12 @@ llama_token sampling_id(struct MyModel* mymodel) {
const char * sampling(struct MyModel * mymodel) {
llama_context * ctx = mymodel->ctx;
int id = sampling_id(mymodel);
std::string ret;
if (id == llama_token_eos()) ret = "</s>";
else ret = llama_token_to_str(ctx, id);
static std::string ret;
if (id == llama_token_eos()) {
ret = "</s>";
} else {
ret = llama_token_to_str(ctx, id);
}
eval_id(mymodel, id);
return ret.c_str();
}

View file

@ -5,7 +5,6 @@
#include "llama.h"
#include "build-info.h"
extern "C" {
typedef struct MyModel {
@ -14,7 +13,6 @@ typedef struct MyModel {
int n_past = 0;
} MyModel;
struct MyModel* create_mymodel(int argc, char ** argv);
bool eval_float(void* model, float* input, int N);

View file

@ -24,11 +24,11 @@ int main(int argc, char ** argv) {
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
if (params.seed < 0) {
if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = time(NULL);
}
fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
std::mt19937 rng(params.seed);
if (params.random_prompt) {

View file

@ -242,7 +242,7 @@ Example usage: `--logit-bias 29905-inf`
### RNG Seed
- `-s SEED, --seed SEED`: Set the random number generator (RNG) seed (default: -1, < 0 = random seed).
- `-s SEED, --seed SEED`: Set the random number generator (RNG) seed (default: -1, -1 = random seed).
The RNG seed is used to initialize the random number generator that influences the text generation process. By setting a specific seed value, you can obtain consistent and reproducible results across multiple runs with the same input and settings. This can be helpful for testing, debugging, or comparing the effects of different options on the generated text to see when they diverge. If the seed is set to a value less than 0, a random seed will be used, which will result in different outputs on each run.

View file

@ -94,11 +94,11 @@ int main(int argc, char ** argv) {
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
if (params.seed < 0) {
if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = time(NULL);
}
fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
std::mt19937 rng(params.seed);
if (params.random_prompt) {

View file

@ -136,11 +136,11 @@ int main(int argc, char ** argv) {
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
if (params.seed < 0) {
if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = time(NULL);
}
fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
std::mt19937 rng(params.seed);
if (params.random_prompt) {

View file

@ -152,7 +152,7 @@ node .
`mirostat_eta`: Set the Mirostat learning rate, parameter eta (default: 0.1).
`seed`: Set the random number generator (RNG) seed (default: -1, < 0 = random seed).
`seed`: Set the random number generator (RNG) seed (default: -1, -1 = random seed).
`ignore_eos`: Ignore end of stream token and continue generating (default: false).

View file

@ -2671,7 +2671,8 @@ struct train_params {
const char * fn_checkpoint_out;
const char * fn_model_out;
int seed;
uint32_t seed;
int n_ctx;
int n_embd;
int n_mult;
@ -2768,7 +2769,7 @@ void train_print_usage(int /*argc*/, char ** argv, const struct train_params * p
fprintf(stderr, " --checkpoint-in FNAME path from which to load training checkpoint (default '%s')\n", params->fn_checkpoint_in);
fprintf(stderr, " --checkpoint-out FNAME path to save training checkpoint (default '%s')\n", params->fn_checkpoint_out);
fprintf(stderr, " --model-out FNAME path to save ggml model (default '%s')\n", params->fn_model_out);
fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n");
fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for -1)\n");
fprintf(stderr, " -c N, --ctx N Context size used during training (default %d)\n", params->n_ctx);
fprintf(stderr, " --embd N Embedding size used for new models (default %d)\n", params->n_embd);
fprintf(stderr, " --mult N Mult size used for new models, influences feedforward size. (default %d)\n", params->n_mult);
@ -3034,10 +3035,10 @@ int main(int argc, char ** argv) {
return 1;
}
if (params.seed < 0) {
if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = time(NULL);
}
printf("%s: seed: %d\n", __func__, params.seed);
printf("%s: seed: %u\n", __func__, params.seed);
srand(params.seed);
struct llama_context_params llama_params = llama_context_default_params();

View file

@ -269,6 +269,11 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
#endif
struct ggml_tensor_extra_gpu {
void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs
};
static __global__ void add_f32(const float * x, const float * y, float * dst, const int k) {
const int i = blockDim.x*blockIdx.x + threadIdx.x;
@ -2025,7 +2030,6 @@ inline void ggml_cuda_op_add(
} else {
GGML_ASSERT(false);
}
CUDA_CHECK(cudaGetLastError());
(void) src1;
(void) dst;
@ -2057,7 +2061,6 @@ inline void ggml_cuda_op_mul(
// compute
mul_f32_cuda(src0_ddf_i01, src1_ddf_i01, dst_ddf_i01, ne00, ne10, cudaStream_main);
CUDA_CHECK(cudaGetLastError());
}
(void) dst;
@ -2078,7 +2081,6 @@ inline void ggml_cuda_op_silu(
// compute
silu_f32_cuda(src0_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
CUDA_CHECK(cudaGetLastError());
(void) src1;
(void) dst;
@ -2101,7 +2103,6 @@ inline void ggml_cuda_op_rms_norm(
// compute
rms_norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
CUDA_CHECK(cudaGetLastError());
(void) src1;
(void) dst;
@ -2180,7 +2181,6 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
GGML_ASSERT(false);
break;
}
CUDA_CHECK(cudaGetLastError());
#ifdef GGML_CUDA_DMMV_F16
if (src1_convert_f16) {
@ -2257,7 +2257,6 @@ inline void ggml_cuda_op_rope(
// compute
rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
CUDA_CHECK(cudaGetLastError());
(void) dst;
(void) src0_ddq_i;
@ -2281,7 +2280,6 @@ inline void ggml_cuda_op_diag_mask_inf(
// compute
diag_mask_inf_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_past, cudaStream_main);
CUDA_CHECK(cudaGetLastError());
(void) dst;
(void) src0_ddq_i;
@ -2303,7 +2301,6 @@ inline void ggml_cuda_op_soft_max(
// compute
soft_max_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
CUDA_CHECK(cudaGetLastError());
(void) src1;
(void) dst;
@ -2399,10 +2396,11 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0};
size_t dst_asf[GGML_CUDA_MAX_DEVICES] = {0};
// if multiple GPUs are used they need to wait for the main GPU to finish
// if multiple devices are used they need to wait for the main device
// here an event is recorded that signifies that the main device has finished calculating the input data
if (split && g_device_count > 1) {
CUDA_CHECK(cudaSetDevice(g_main_device));
CUDA_CHECK(cudaDeviceSynchronize());
CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device], g_cudaStreams_main[g_main_device]));
}
for (int id = 0; id < g_device_count; ++id) {
@ -2428,6 +2426,12 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
int64_t row_diff = row_high - row_low;
cudaSetDevice(id);
cudaStream_t cudaStream_main = g_cudaStreams_main[id];
// wait for main GPU data if necessary
if (split && id != g_main_device) {
CUDA_CHECK(cudaStreamWaitEvent(cudaStream_main, src0_extra->events[g_main_device]));
}
if (src0_on_device && src0_is_contiguous) {
if (src0_is_f32) {
@ -2503,8 +2507,6 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
}
const int64_t i11 = i13*ne12 + i12;
cudaStream_t cudaStream_main = g_cudaStreams_main[id];
// for split tensors the data begins at i0 == i0_offset_low
char * src0_ddq_i = src0_ddq[id] + (i0 - i0_offset_low)*src0_stride*src0_ts/src0_bs;
float * src0_ddf_i = src0_ddf[id] + (i0 - i0_offset_low)*src0_stride;
@ -2564,6 +2566,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
// do the computation
op(src0, src1, dst, src0_ddq_i, src0_ddf_i, src1_ddf_i, dst_ddf_i, i02, i01_low, i01_high, i11, cudaStream_main);
CUDA_CHECK(cudaGetLastError());
// copy dst to host or other device if necessary
if (!dst_on_device) {
@ -2593,6 +2596,11 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_ddf_i, dst_stride*sizeof(float), kind, cudaStream_main));
}
}
// signify to main device that other device is done
if (split && g_device_count > 1 && id != g_main_device) {
CUDA_CHECK(cudaEventRecord(src0_extra->events[id], cudaStream_main));
}
}
}
}
@ -2604,7 +2612,6 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
}
CUDA_CHECK(cudaSetDevice(id));
CUDA_CHECK(cudaDeviceSynchronize());
if (src0_asq[id] > 0) {
ggml_cuda_pool_free(src0_ddq[id], src0_asq[id]);
@ -2619,6 +2626,21 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
ggml_cuda_pool_free(dst_ddf[id], dst_asf[id]);
}
}
// main device waits for all other devices to be finished
if (split && g_device_count > 1) {
CUDA_CHECK(cudaSetDevice(g_main_device));
for (int id = 0; id < g_device_count; ++id) {
if (id != g_main_device) {
CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams_main[g_main_device], src0_extra->events[id]));
}
}
}
if (dst->backend == GGML_BACKEND_CPU) {
CUDA_CHECK(cudaSetDevice(g_main_device));
CUDA_CHECK(cudaDeviceSynchronize());
}
}
void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@ -2858,6 +2880,10 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice);
extra->data_device[id] = buf;
if (backend == GGML_BACKEND_GPU_SPLIT) {
CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id], cudaEventDisableTiming));
}
}
tensor->extra = extra;
@ -2871,14 +2897,17 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
for (int id = 0; id < g_device_count; ++id) {
if (extra->data_device[id] == nullptr) {
continue;
}
if (extra->data_device[id] != nullptr) {
CUDA_CHECK(cudaSetDevice(id));
CUDA_CHECK(cudaFree(extra->data_device[id]));
}
if (extra->events[id] != nullptr) {
CUDA_CHECK(cudaSetDevice(id));
CUDA_CHECK(cudaEventDestroy(extra->events[id]));
}
}
delete extra;
}

View file

@ -8,10 +8,6 @@ extern "C" {
#define GGML_CUDA_MAX_DEVICES 16
struct ggml_tensor_extra_gpu {
void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
};
void ggml_init_cublas(void);
void ggml_cuda_set_tensor_split(const float * tensor_split);

View file

@ -202,7 +202,9 @@ struct ggml_metal_context * ggml_metal_init(void) {
void ggml_metal_free(struct ggml_metal_context * ctx) {
fprintf(stderr, "%s: deallocating\n", __func__);
for (int i = 0; i < ctx->n_buffers; ++i) {
[ctx->buffers[i].metal release];
}
free(ctx);
}

View file

@ -21,11 +21,19 @@
#define CL_DMMV_BLOCK_SIZE 32
#ifndef K_QUANTS_PER_ITERATION
#define K_QUANTS_PER_ITERATION 1
#else
static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
#endif
#define MULTILINE_QUOTE(...) #__VA_ARGS__
static std::string program_source = MULTILINE_QUOTE(
typedef char int8_t;
typedef uchar uint8_t;
typedef short int16_t;
typedef ushort uint16_t;
typedef int int32_t;
typedef uint uint32_t;
@ -175,7 +183,9 @@ void convert_f16(__global half* x, const int ib, const int iqs, float* v0, float
*v0 = vload_half(0, &x[ib + 0]);
*v1 = vload_half(0, &x[ib + 1]);
}
);
static std::string k_quants_source = MULTILINE_QUOTE(
inline void get_scale_min_k4(int j, const __global uint8_t *q, uint8_t *d, uint8_t *m)
{
if (j < 4)
@ -199,7 +209,7 @@ __kernel void dequantize_block_q2_K(__global const struct block_q2_K *x, __globa
const int is = 8 * n + l / 16;
const uint8_t q = x[i].qs[32 * n + l];
__global float *y = yy + i * 256 + 128 * n;
__global float *y = yy + i * QK_K + 128 * n;
const float dall = vload_half(0, &x[i].d);
const float dmin = vload_half(0, &x[i].dmin);
@ -231,7 +241,7 @@ __kernel void dequantize_block_q3_K(__global const struct block_q3_K *x, __globa
float d_all = vload_half(0, &x[i].d);
float dl = d_all * (us - 32);
__global float *y = yy + i * 256 + 128 * n + 32 * j;
__global float *y = yy + i * QK_K + 128 * n + 32 * j;
const __global uint8_t *q = x[i].qs + 32 * n;
const __global uint8_t *hm = x[i].hmask;
@ -248,7 +258,7 @@ __kernel void dequantize_block_q4_K(__global const struct block_q4_K *x, __globa
const int is = 2 * il;
const int n = 4;
__global float *y = yy + i * 256 + 64 * il + n * ir;
__global float *y = yy + i * QK_K + 64 * il + n * ir;
const float dall = vload_half(0, &x[i].d);
const float dmin = vload_half(0, &x[i].dmin);
@ -277,7 +287,7 @@ __kernel void dequantize_block_q5_K(__global const struct block_q5_K *x, __globa
const int ir = tid % 16;
const int is = 2 * il;
__global float *y = yy + i * 256 + 64 * il + 2 * ir;
__global float *y = yy + i * QK_K + 64 * il + 2 * ir;
const float dall = vload_half(0, &x[i].d);
const float dmin = vload_half(0, &x[i].dmin);
@ -309,7 +319,7 @@ __kernel void dequantize_block_q6_K(__global const struct block_q6_K *x, __globa
const int il = tid - 32 * ip;
const int is = 8 * ip + il / 16;
__global float *y = yy + i * 256 + 128 * ip + il;
__global float *y = yy + i * QK_K + 128 * ip + il;
const float d = vload_half(0, &x[i].d);
@ -323,161 +333,383 @@ __kernel void dequantize_block_q6_K(__global const struct block_q6_K *x, __globa
y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32);
}
__kernel void dequantize_mul_mat_vec_q2_K(__global const struct block_q2_K * xx, __local float* tmp, __global float* yy, __global float* dst, const int ncols) {
void vec_dot_q2_K(__global const struct block_q2_K* x, const int ib, const int iqs, const __global float *yy, float *result) {
const int row = get_group_id(0);
int n = iqs / 128;
int r = iqs - 128 * n;
int l = r / 8;
const int num_blocks_per_row = ncols / QK_K;
const int ib0 = row*num_blocks_per_row;
__global const float *y = yy + 128 * n + l;
__global const uint8_t *q = x[ib].qs + 32 * n + l;
__global const uint8_t *s = x[ib].scales + 8 * n;
__global const struct block_q2_K * x = xx + ib0;
const float dall = vload_half(0, &x[ib].d);
const float dmin = vload_half(0, &x[ib].dmin);
const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION; // 0...31 or 0...15
const int ix = get_local_id(0)%K_QUANTS_PER_ITERATION; // 0 or 0,1
float sum = y[ 0] * (dall * ((s[0] & 0xF) * ((q[ 0] >> 0) & 3)) - dmin * (s[0] >> 4))
+ y[ 32] * (dall * ((s[2] & 0xF) * ((q[ 0] >> 2) & 3)) - dmin * (s[2] >> 4))
+ y[ 64] * (dall * ((s[4] & 0xF) * ((q[ 0] >> 4) & 3)) - dmin * (s[4] >> 4))
+ y[ 96] * (dall * ((s[6] & 0xF) * ((q[ 0] >> 6) & 3)) - dmin * (s[6] >> 4))
+ y[ 16] * (dall * ((s[1] & 0xF) * ((q[16] >> 0) & 3)) - dmin * (s[1] >> 4))
+ y[ 48] * (dall * ((s[3] & 0xF) * ((q[16] >> 2) & 3)) - dmin * (s[3] >> 4))
+ y[ 80] * (dall * ((s[5] & 0xF) * ((q[16] >> 4) & 3)) - dmin * (s[5] >> 4))
+ y[112] * (dall * ((s[7] & 0xF) * ((q[16] >> 6) & 3)) - dmin * (s[7] >> 4));
const int step = 16/K_QUANTS_PER_ITERATION;
const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
const int in = tid - step*im; // 0...15 or 0...7
const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 or 0...14 in steps of 2
const int q_offset = 32*im + l0;
const int s_offset = 8*im;
const int y_offset = 128*im + l0;
tmp[16 * ix + tid] = 0;
uint32_t aux[4];
const uint8_t * d = (const uint8_t *)aux;
const uint8_t * m = (const uint8_t *)(aux + 2);
for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
__global const float * y = yy + i * QK_K + y_offset;
__global const uint8_t * q = x[i].qs + q_offset;
const float dall = vload_half(0, &x[i].d);
const float dmin = vload_half(0, &x[i].dmin);
__global const uint32_t * a = (__global const uint32_t *)(x[i].scales + s_offset);
aux[0] = a[0] & 0x0f0f0f0f;
aux[1] = a[1] & 0x0f0f0f0f;
aux[2] = (a[0] >> 4) & 0x0f0f0f0f;
aux[3] = (a[1] >> 4) & 0x0f0f0f0f;
float sum1 = 0, sum2 = 0;
for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
sum1 += y[l+ 0] * d[0] * ((q[l+ 0] >> 0) & 3)
+ y[l+32] * d[2] * ((q[l+ 0] >> 2) & 3)
+ y[l+64] * d[4] * ((q[l+ 0] >> 4) & 3)
+ y[l+96] * d[6] * ((q[l+ 0] >> 6) & 3)
+ y[l+16] * d[1] * ((q[l+16] >> 0) & 3)
+ y[l+48] * d[3] * ((q[l+16] >> 2) & 3)
+ y[l+80] * d[5] * ((q[l+16] >> 4) & 3)
+y[l+112] * d[7] * ((q[l+16] >> 6) & 3);
sum2 += y[l+ 0] * m[0] + y[l+32] * m[2] + y[l+64] * m[4] + y[ l+96] * m[6]
+ y[l+16] * m[1] + y[l+48] * m[3] + y[l+80] * m[5] + y[l+112] * m[7];
*result = sum;
}
void vec_dot_q3_K(__global const struct block_q3_K* x, const int ib, const int iqs, const __global float *yy, float *result) {
const uint32_t kmask1 = 0x03030303;
const uint32_t kmask2 = 0x0f0f0f0f;
uint32_t aux[3];
uint32_t utmp[4];
int n = iqs/128;
int r = iqs - 128*n;
int l = r/8;
__global const float * y = yy + 128*n + l;
__global const uint8_t * q = x[ib].qs + 32*n + l;
__global const uint8_t * hm = x[ib].hmask + l;
const int8_t * s = (const int8_t *)utmp + 8*n;
aux[0] = x[ib].scales[0] | x[ib].scales[1] << 8 | x[ib].scales[2] << 16 | x[ib].scales[3] << 24;
aux[1] = x[ib].scales[4] | x[ib].scales[5] << 8 | x[ib].scales[6] << 16 | x[ib].scales[7] << 24;
aux[2] = x[ib].scales[8] | x[ib].scales[9] << 8 | x[ib].scales[10] << 16 | x[ib].scales[11] << 24;
utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4);
utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4);
utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4);
const float dall = vload_half(0, &x[ib].d);
const uint8_t m = 1 << (4*n);
float sum = y[ 0] * (s[0] - 32) * (((q[ 0] >> 0) & 3) - (hm[ 0] & (m << 0) ? 0 : 4))
+ y[ 32] * (s[2] - 32) * (((q[ 0] >> 2) & 3) - (hm[ 0] & (m << 1) ? 0 : 4))
+ y[ 64] * (s[4] - 32) * (((q[ 0] >> 4) & 3) - (hm[ 0] & (m << 2) ? 0 : 4))
+ y[ 96] * (s[6] - 32) * (((q[ 0] >> 6) & 3) - (hm[ 0] & (m << 3) ? 0 : 4))
+ y[ 16] * (s[1] - 32) * (((q[16] >> 0) & 3) - (hm[16] & (m << 0) ? 0 : 4))
+ y[ 48] * (s[3] - 32) * (((q[16] >> 2) & 3) - (hm[16] & (m << 1) ? 0 : 4))
+ y[ 80] * (s[5] - 32) * (((q[16] >> 4) & 3) - (hm[16] & (m << 2) ? 0 : 4))
+ y[112] * (s[7] - 32) * (((q[16] >> 6) & 3) - (hm[16] & (m << 3) ? 0 : 4));
*result = sum * dall;
tmp[16 * ix + tid] += dall * sum1 - dmin * sum2;
}
void vec_dot_q4_K(__global const struct block_q4_K* x, const int ib, const int iqs, const __global float *yy, float *result) {
// sum up partial sums and write back result
barrier(CLK_LOCAL_MEM_FENCE);
for (int s=16; s>0; s>>=1) {
if (tid < s) {
tmp[tid] += tmp[tid + s];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if (tid == 0) {
dst[row] = tmp[0];
}
}
const int j = iqs / 64; // j is in 0...3
const int ir = (iqs - 64*j)/2; // ir is in 0...28 in steps of 4
const int is = 2*j; // is is in 0...6 in steps of 2
__kernel void dequantize_mul_mat_vec_q3_K(__global const struct block_q3_K * xx, __local float* tmp, __global float* yy, __global float* dst, const int ncols) {
const uint16_t kmask1 = 0x0303;
const uint16_t kmask2 = 0x0f0f;
__global const float * y = yy + 64*j + ir;
__global const uint8_t * q = x[ib].qs + 32*j + ir;
const int row = get_group_id(0);
const float dall = vload_half(0, &x[ib].d);
const float dmin = vload_half(0, &x[ib].dmin);
const int num_blocks_per_row = ncols / QK_K;
const int ib0 = row*num_blocks_per_row;
uint8_t sc, m;
get_scale_min_k4(is + 0, x[ib].scales, &sc, &m);
const float d1 = dall * sc;
const float m1 = dmin * m;
get_scale_min_k4(is + 1, x[ib].scales, &sc, &m);
const float d2 = dall * sc;
const float m2 = dmin * m;
__global const struct block_q3_K * x = xx + ib0;
const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
const int ix = get_local_id(0)%K_QUANTS_PER_ITERATION; // 0 or 0,1
const int n = K_QUANTS_PER_ITERATION; // iterations in the inner loop
const int step = 16/K_QUANTS_PER_ITERATION;
const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
const int in = tid - step*im; // 0....15 or 0...7
const uint8_t m = 1 << (4*im);
const int l0 = n*in; // 0...15 or 0...14 in steps of 2
const int q_offset = 32*im + l0;
const int y_offset = 128*im + l0;
uint16_t utmp[4];
const int8_t * s = (const int8_t *)utmp;
const uint16_t s_shift = 4*im;
tmp[16 * ix + tid] = 0;
for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
__global const float * y = yy + i * QK_K + y_offset;
__global const uint8_t * q = x[i].qs + q_offset;
__global const uint8_t * h = x[i].hmask + l0;
__global const uint16_t * a = (__global const uint16_t *)x[i].scales;
utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4);
utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4);
utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4);
utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4);
const float d = vload_half(0, &x[i].d);
float sum = 0;
for (int k = 0; k < 4; ++k) {
sum += y[k + 0] * (d1 * (q[k] & 0xF) - m1);
sum += y[k + 32] * (d2 * (q[k] >> 4) - m2);
for (int l = 0; l < n; ++l) {
sum += y[l+ 0] * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4))
+ y[l+32] * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4))
+ y[l+64] * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4))
+ y[l+96] * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 0 : 4));
sum += y[l+16] * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4))
+ y[l+48] * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4))
+ y[l+80] * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4))
+ y[l+112] * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4));
}
tmp[16 * ix + tid] += d * sum;
}
*result = sum;
// sum up partial sums and write back result
barrier(CLK_LOCAL_MEM_FENCE);
for (int s=16; s>0; s>>=1) {
if (tid < s) {
tmp[tid] += tmp[tid + s];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if (tid == 0) {
dst[row] = tmp[0];
}
}
void vec_dot_q5_K(__global const struct block_q5_K* x, const int ib, const int iqs, const __global float *yy, float *result) {
__kernel void dequantize_mul_mat_vec_q4_K(__global const struct block_q4_K * xx, __local float* tmp, __global float* yy, __global float* dst, const int ncols) {
const int j = iqs / 64;
const int ir = (iqs - 64*j)/2;
const int is = 2*j;
//to rename it later, just to test now
const uint16_t kmask1 = 0x3f3f;
const uint16_t kmask2 = 0x0f0f;
const uint16_t kmask3 = 0xc0c0;
__global const float * y = yy + 64*j + ir;
__global const uint8_t * ql = x[ib].qs + 32*j + ir;
__global const uint8_t * qh = x[ib].qh + ir;
const int row = get_group_id(0);
const int num_blocks_per_row = ncols / QK_K;
const int ib0 = row*num_blocks_per_row;
const float dall = vload_half(0, &x[ib].d);
const float dmin = vload_half(0, &x[ib].dmin);
const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION; // 0...15
const int ix = get_local_id(0)%K_QUANTS_PER_ITERATION;
uint8_t sc, m;
get_scale_min_k4(is + 0, x[ib].scales, &sc, &m);
const float d1 = dall * sc;
const float m1 = dmin * m;
get_scale_min_k4(is + 1, x[ib].scales, &sc, &m);
const float d2 = dall * sc;
const float m2 = dmin * m;
const int step = 8/K_QUANTS_PER_ITERATION;
uint8_t hm = 1 << is;
const int il = tid/step; // 0...3
const int ir = tid - step*il;// 0...3
const int n = 2*K_QUANTS_PER_ITERATION;
const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
const int in = il%2;
const int l0 = n*(2*ir + in);
const int q_offset = 32*im + l0;
const int y_offset = 64*im + l0;
uint16_t aux[4];
const uint8_t * sc = (const uint8_t *)aux;
__global const struct block_q4_K * x = xx + ib0;
tmp[16 * ix + tid] = 0;
for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
__global const uint8_t * q1 = x[i].qs + q_offset;
__global const uint8_t * q2 = q1 + 64;
__global const float * y1 = yy + i*QK_K + y_offset;
__global const float * y2 = y1 + 128;
const float dall = vload_half(0, &x[i].d);
const float dmin = vload_half(0, &x[i].dmin);
__global const uint16_t * a = (__global const uint16_t *)x[i].scales;
aux[0] = a[im+0] & kmask1;
aux[1] = a[im+2] & kmask1;
aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
float4 s = (float4)(0.f);
float smin = 0;
for (int l = 0; l < n; ++l) {
s.x += y1[l] * (q1[l] & 0xF); s.y += y1[l+32] * (q1[l] >> 4);
s.z += y2[l] * (q2[l] & 0xF); s.w += y2[l+32] * (q2[l] >> 4);
smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
}
tmp[16 * ix + tid] += dall * (s.x * sc[0] + s.y * sc[1] + s.z * sc[4] + s.w * sc[5]) - dmin * smin;
}
// sum up partial sums and write back result
barrier(CLK_LOCAL_MEM_FENCE);
for (int s=16; s>0; s>>=1) {
if (tid < s) {
tmp[tid] += tmp[tid + s];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if (tid == 0) {
dst[row] = tmp[0];
}
}
__kernel void dequantize_mul_mat_vec_q5_K(__global const struct block_q5_K * xx, __local float* tmp, __global float* yy, __global float* dst, const int ncols) {
const uint16_t kmask1 = 0x3f3f;
const uint16_t kmask2 = 0x0f0f;
const uint16_t kmask3 = 0xc0c0;
const int row = get_group_id(0);
const int num_blocks_per_row = ncols / QK_K;
const int ib0 = row*num_blocks_per_row;
const int tid = get_local_id(0)/2; // 0...15
const int ix = get_local_id(0)%2;
const int il = tid/4; // 0...3
const int ir = tid - 4*il;// 0...3
const int n = 2;
const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
const int in = il%2;
const int l0 = n*(2*ir + in);
const int q_offset = 32*im + l0;
const int y_offset = 64*im + l0;
const uint8_t hm1 = 1 << (2*im);
const uint8_t hm2 = hm1 << 4;
uint16_t aux[4];
const uint8_t * sc = (const uint8_t *)aux;
__global const struct block_q5_K * x = xx + ib0;
tmp[16 * ix + tid] = 0;
for (int i = ix; i < num_blocks_per_row; i += 2) {
__global const uint8_t * ql1 = x[i].qs + q_offset;
__global const uint8_t * ql2 = ql1 + 64;
__global const uint8_t * qh = x[i].qh + l0;
__global const float * y1 = yy + i*QK_K + y_offset;
__global const float * y2 = y1 + 128;
const float dall = vload_half(0, &x[i].d);
const float dmin = vload_half(0, &x[i].dmin);
__global const uint16_t * a = (__global const uint16_t *)x[i].scales;
aux[0] = a[im+0] & kmask1;
aux[1] = a[im+2] & kmask1;
aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
float4 sum = (float4)(0.f);
float smin = 0;
for (int l = 0; l < n; ++l) {
sum.x += y1[l+ 0] * ((ql1[l+ 0] & 0xF) + (qh[l+ 0] & (hm1 << 0) ? 16 : 0))
+ y1[l+16] * ((ql1[l+16] & 0xF) + (qh[l+16] & (hm1 << 0) ? 16 : 0));
sum.y += y1[l+32] * ((ql1[l+ 0] >> 4) + (qh[l+ 0] & (hm1 << 1) ? 16 : 0))
+ y1[l+48] * ((ql1[l+16] >> 4) + (qh[l+16] & (hm1 << 1) ? 16 : 0));
sum.z += y2[l+ 0] * ((ql2[l+ 0] & 0xF) + (qh[l+ 0] & (hm2 << 0) ? 16 : 0))
+ y2[l+16] * ((ql2[l+16] & 0xF) + (qh[l+16] & (hm2 << 0) ? 16 : 0));
sum.w += y2[l+32] * ((ql2[l+ 0] >> 4) + (qh[l+ 0] & (hm2 << 1) ? 16 : 0))
+ y2[l+48] * ((ql2[l+16] >> 4) + (qh[l+16] & (hm2 << 1) ? 16 : 0));
smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3]
+ (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7];
}
tmp[16 * ix + tid] += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin;
}
// sum up partial sums and write back result
barrier(CLK_LOCAL_MEM_FENCE);
for (int s=16; s>0; s>>=1) {
if (tid < s) {
tmp[tid] += tmp[tid + s];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if (tid == 0) {
dst[row] = tmp[0];
}
}
__kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx, __local float* tmp, __global const float * yy, __global float * dst, const int ncols) {
const int row = get_group_id(0);
const int num_blocks_per_row = ncols / QK_K;
const int ib0 = row*num_blocks_per_row;
__global const struct block_q6_K * x = xx + ib0;
const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
const int ix = get_local_id(0)%K_QUANTS_PER_ITERATION; // 0 or 0, 1
const int step = 16/K_QUANTS_PER_ITERATION; // 16 or 8
const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
const int in = tid - step*im; // 0...15 or 0...7
#if K_QUANTS_PER_ITERATION == 1
const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15
const int is = 0;
#else
const int l0 = 4 * in; // 0, 4, 8, ..., 28
const int is = in / 4;
#endif
const int ql_offset = 64*im + l0;
const int qh_offset = 32*im + l0;
const int s_offset = 8*im + is;
const int y_offset = 128*im + l0;
tmp[16 * ix + tid] = 0; // partial sum for thread in warp
for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
__global const float * y = yy + i * QK_K + y_offset;
__global const uint8_t * ql = x[i].ql + ql_offset;
__global const uint8_t * qh = x[i].qh + qh_offset;
__global const int8_t * s = x[i].scales + s_offset;
const float d = vload_half(0, &x[i].d);
#if K_QUANTS_PER_ITERATION == 1
float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32)
+ y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32)
+ y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32)
+ y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32)
+ y[64] * s[4] * d * ((int8_t)((ql[ 0] >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32)
+ y[80] * s[5] * d * ((int8_t)((ql[16] >> 4) | ((qh[16] & 0x30) >> 0)) - 32)
+ y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32)
+y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32);
tmp[16 * ix + tid] += sum;
#else
float sum = 0;
for (int k = 0; k < 4; ++k) {
sum += y[k + 0] * (d1 * ((ql[k] & 0xF) + (qh[k] & hm ? 16 : 0)) - m1);
for (int l = 0; l < 4; ++l) {
sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32)
+ y[l+32] * s[2] * d * ((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32)
+ y[l+64] * s[4] * d * ((int8_t)((ql[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32)
+ y[l+96] * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32);
}
hm <<= 1;
for (int k = 0; k < 4; ++k) {
sum += y[k + 32] * (d2 * ((ql[k] >> 4) + (qh[k] & hm ? 16 : 0)) - m2);
}
*result = sum;
tmp[16 * ix + tid] += sum;
#endif
}
void vec_dot_q6_K(__global const struct block_q6_K* x, const int ib, const int iqs, const __global float *yy, float *result) {
const int ip = iqs / 128; // 0 or 1
const int il = (iqs - 128*ip)/8; // 0...15
const int is = 8*ip;
__global const float * y = yy + 128*ip + il;
const float d = vload_half(0, &x[ib].d);
__global const uint8_t * ql = x[ib].ql + 64*ip + il;
__global const uint8_t * qh = x[ib].qh + 32*ip + il;
__global const int8_t * sc = x[ib].scales + is;
*result = y[ 0] * d * sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh[ 0] >> 0) & 3) << 4)) - 32)
+ y[ 32] * d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh[ 0] >> 2) & 3) << 4)) - 32)
+ y[ 64] * d * sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh[ 0] >> 4) & 3) << 4)) - 32)
+ y[ 96] * d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh[ 0] >> 6) & 3) << 4)) - 32)
+ y[ 16] * d * sc[1] * ((int8_t)((ql[16] & 0xF) | (((qh[16] >> 0) & 3) << 4)) - 32)
+ y[ 48] * d * sc[3] * ((int8_t)((ql[48] & 0xF) | (((qh[16] >> 2) & 3) << 4)) - 32)
+ y[ 80] * d * sc[5] * ((int8_t)((ql[16] >> 4) | (((qh[16] >> 4) & 3) << 4)) - 32)
+ y[112] * d * sc[7] * ((int8_t)((ql[48] >> 4) | (((qh[16] >> 6) & 3) << 4)) - 32);
// sum up partial sums and write back result
barrier(CLK_LOCAL_MEM_FENCE);
for (int s=16; s>0; s>>=1) {
if (tid < s) {
tmp[tid] += tmp[tid + s];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if (tid == 0) {
dst[row] = tmp[0];
}
}
);
@ -549,44 +781,6 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float
}
);
std::string dequant_mul_mat_vec_k_template = MULTILINE_QUOTE(
__kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) {
const int block_size = get_local_size(0);
const int row = get_group_id(0);
const int tid = get_local_id(0);
const int iter_stride = 256;
const int vals_per_iter = iter_stride / block_size;
const int num_blocks_per_row = ncols / 256;
const int ib0 = row*num_blocks_per_row;
tmp[tid] = 0;
for (int i = 0; i < ncols; i += iter_stride) {
const int col = i + vals_per_iter*tid;
const int ib = ib0 + col/256; // x block index
const int iqs = col%256; // x quant index
const int iybs = col - col%256; // y block start index
// dequantize
float v;
DOT_KERNEL(x, ib, iqs, y + iybs, &v);
tmp[tid] += v;
}
// sum up partial sums and write back result
barrier(CLK_LOCAL_MEM_FENCE);
for (int s=block_size/2; s>0; s>>=1) {
if (tid < s) {
tmp[tid] += tmp[tid + s];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if (tid == 0) {
dst[row] = tmp[0];
}
}
);
std::string mul_template = MULTILINE_QUOTE(
__kernel void KERNEL_NAME(__global TYPE* x, const int x_offset, __global TYPE* y, const int y_offset, __global TYPE* dst, const int dst_offset, const int ky) {
@ -649,18 +843,6 @@ std::array<std::string, 2> mul_str_values = {
"mul_f32", "float"
};
std::array<std::string, 3> dmmv_k_str_keys = {
"KERNEL_NAME", "X_TYPE", "DOT_KERNEL"
};
std::array<std::string, 15> dmmv_k_str_values = {
"dequantize_mul_mat_vec_q2_K", "struct block_q2_K", "vec_dot_q2_K",
"dequantize_mul_mat_vec_q3_K", "struct block_q3_K", "vec_dot_q3_K",
"dequantize_mul_mat_vec_q4_K", "struct block_q4_K", "vec_dot_q4_K",
"dequantize_mul_mat_vec_q5_K", "struct block_q5_K", "vec_dot_q5_K",
"dequantize_mul_mat_vec_q6_K", "struct block_q6_K", "vec_dot_q6_K",
};
std::string& replace(std::string& s, const std::string& from, const std::string& to) {
size_t pos = 0;
while ((pos = s.find(from, pos)) != std::string::npos) {
@ -673,6 +855,7 @@ std::string& replace(std::string& s, const std::string& from, const std::string&
std::string generate_kernels() {
std::stringstream src;
src << program_source << '\n';
src << k_quants_source << '\n';
for (size_t i = 0; i < dequant_str_values.size(); i += dequant_str_keys.size()) {
std::string dequant_kernel = dequant_template;
std::string dmmv_kernel = dequant_mul_mat_vec_template;
@ -690,13 +873,6 @@ std::string generate_kernels() {
}
src << mul_kernel << '\n';
}
for (size_t i = 0; i < dmmv_k_str_values.size(); i += dmmv_k_str_keys.size()) {
std::string dmmv_k_kernel = dequant_mul_mat_vec_k_template;
for (size_t j = 0; j < dmmv_k_str_keys.size(); j++) {
replace(dmmv_k_kernel, dmmv_k_str_keys[j], dmmv_k_str_values[i + j]);
}
src << dmmv_k_kernel << '\n';
}
return src.str();
}
@ -729,10 +905,11 @@ static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, co
exit(1);
}
const char* compile_opts = "-cl-mad-enable -cl-unsafe-math-optimizations -cl-finite-math-only -cl-fast-relaxed-math "
"-DQK4_0=32 -DQR4_0=2 -DQK4_1=32 -DQR4_1=2 -DQK5_0=32 -DQR5_0=2 -DQK5_1=32 -DQR5_1=2 -DQK8_0=32 -DQR8_0=1";
std::string compile_opts = "-cl-mad-enable -cl-unsafe-math-optimizations -cl-finite-math-only -cl-fast-relaxed-math "
"-DQK4_0=32 -DQR4_0=2 -DQK4_1=32 -DQR4_1=2 -DQK5_0=32 -DQR5_0=2 -DQK5_1=32 -DQR5_1=2 -DQK8_0=32 -DQR8_0=1 "
"-DQK_K=256 -DK_QUANTS_PER_ITERATION=" + std::to_string(K_QUANTS_PER_ITERATION);
err = clBuildProgram(p, 0, NULL, compile_opts, NULL, NULL);
err = clBuildProgram(p, 0, NULL, compile_opts.c_str(), NULL, NULL);
if(err < 0) {
clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);

48
ggml.c
View file

@ -3848,6 +3848,41 @@ static_assert(GGML_OP_COUNT == 64, "GGML_OP_COUNT != 64");
static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
// WARN:
// Mis-confguration can lead to problem that's hard to reason about:
// * At best it crash or talks nosense.
// * At worst it talks slightly difference but hard to perceive.
//
// An op has to enable INIT or FINALIZE when any of it's branch needs that pass.
// Take care about compile options (e.g., GGML_USE_xxx).
static bool GGML_OP_HAS_INIT [GGML_OP_COUNT] = { 0 };
static bool GGML_OP_HAS_FINALIZE[GGML_OP_COUNT] = { 0 };
static void ggml_setup_op_has_task_pass(void) {
{ // INIT
bool * p = GGML_OP_HAS_INIT;
p[GGML_OP_ACC ] = true;
p[GGML_OP_MUL_MAT ] = true;
p[GGML_OP_OUT_PROD ] = true;
p[GGML_OP_SET ] = true;
p[GGML_OP_GET_ROWS_BACK ] = true;
p[GGML_OP_DIAG_MASK_INF ] = true;
p[GGML_OP_DIAG_MASK_ZERO ] = true;
p[GGML_OP_CONV_1D_S1_PH ] = true;
p[GGML_OP_CONV_1D_S2_PH ] = true;
p[GGML_OP_CONV_2D_SK_P0 ] = true;
p[GGML_OP_FLASH_ATTN_BACK ] = true;
p[GGML_OP_CROSS_ENTROPY_LOSS ] = true;
}
{ // FINALIZE
bool * p = GGML_OP_HAS_FINALIZE;
p[GGML_OP_CROSS_ENTROPY_LOSS ] = true;
}
}
//
// ggml context
//
@ -4269,6 +4304,8 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
ggml_cl_init();
#endif
ggml_setup_op_has_task_pass();
is_first_call = false;
}
@ -16793,10 +16830,12 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
if (node_n != -1) {
/* FINALIZE */
struct ggml_tensor * node = state->shared->cgraph->nodes[node_n];
if (GGML_OP_HAS_FINALIZE[node->op]) {
params.nth = node->n_tasks;
ggml_compute_forward(&params, node);
ggml_graph_compute_perf_stats_node(node, state->shared);
}
}
// distribute new work or execute it direct if 1T
while (++node_n < cgraph->n_nodes) {
@ -16807,10 +16846,13 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
state->shared->perf_node_start_cycles = ggml_perf_cycles();
state->shared->perf_node_start_time_us = ggml_perf_time_us();
/* INIT */
params.type = GGML_TASK_INIT;
params.nth = node->n_tasks;
/* INIT */
if (GGML_OP_HAS_INIT[node->op]) {
params.type = GGML_TASK_INIT;
ggml_compute_forward(&params, node);
}
if (node->n_tasks == 1) {
// TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1,
@ -16818,9 +16860,11 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
params.type = GGML_TASK_COMPUTE;
ggml_compute_forward(&params, node);
if (GGML_OP_HAS_FINALIZE[node->op]) {
params.type = GGML_TASK_FINALIZE;
ggml_compute_forward(&params, node);
ggml_graph_compute_perf_stats_node(node, state->shared);
}
} else {
break;
}

3
ggml.h
View file

@ -444,6 +444,9 @@ extern "C" {
// compute types
// NOTE: the INIT or FINALIZE pass is not scheduled unless explicitly enabled.
// This behavior was changed since https://github.com/ggerganov/llama.cpp/pull/1995.
enum ggml_task_type {
GGML_TASK_INIT = 0,
GGML_TASK_COMPUTE,

View file

@ -66,6 +66,7 @@ enum e_model {
MODEL_65B,
};
static const size_t kB = 1024;
static const size_t MB = 1024*1024;
// computed for n_ctx == 2048
@ -129,6 +130,34 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
return k_sizes;
}
// amount of VRAM needed per batch size to hold temporary results
// the values for 3b and 65b are not derived from testing but instead chosen conservatively
static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
{
static std::map<e_model, size_t> k_sizes = {
{ MODEL_3B, 512ull * kB },
{ MODEL_7B, 512ull * kB },
{ MODEL_13B, 640ull * kB },
{ MODEL_30B, 768ull * kB },
{ MODEL_65B, 1536ull * kB },
};
return k_sizes;
}
// amount of VRAM needed per batch size and context to hold temporary results
// the values for 3b and 65b are not derived from testing but instead chosen conservatively
static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
{
static std::map<e_model, size_t> k_sizes = {
{ MODEL_3B, 128ull },
{ MODEL_7B, 128ull },
{ MODEL_13B, 160ull },
{ MODEL_30B, 208ull },
{ MODEL_65B, 416ull },
};
return k_sizes;
}
// default hparams (LLaMA 7B)
struct llama_hparams {
uint32_t n_vocab = 32000;
@ -253,7 +282,13 @@ struct llama_model {
struct llama_context {
llama_context(const llama_model & model, const llama_vocab & vocab) : model(model), vocab(vocab), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
#ifdef GGML_USE_METAL
~llama_context() {
if (ctx_metal) {
ggml_metal_free(ctx_metal);
}
}
#endif
std::mt19937 rng;
bool has_evaluated_once = false;
@ -777,7 +812,7 @@ static bool kv_cache_init(
struct llama_context_params llama_context_default_params() {
struct llama_context_params result = {
/*.seed =*/ -1,
/*.seed =*/ LLAMA_DEFAULT_SEED,
/*.n_ctx =*/ 512,
/*.n_batch =*/ 512,
/*.gpu_layers =*/ 0,
@ -1112,11 +1147,14 @@ static void llama_model_load_internal(
fprintf(stderr, "%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
ggml_cuda_set_scratch_size(0); // disable scratch
} else {
vram_scratch = n_batch * MB;
const size_t vram_scratch_base = VRAM_REQ_SCRATCH_BASE().at(model.type);
const size_t vram_scratch_per_context = VRAM_REQ_SCRATCH_PER_CONTEXT().at(model.type);
vram_scratch = n_batch * (vram_scratch_base + n_ctx * vram_scratch_per_context);
ggml_cuda_set_scratch_size(vram_scratch);
if (n_gpu_layers > 0) {
fprintf(stderr, "%s: allocating batch_size x 1 MB = %zd MB VRAM for the scratch buffer\n",
__func__, vram_scratch / MB);
fprintf(stderr, "%s: allocating batch_size x (%zd kB + n_ctx x %zd B) = %zd MB VRAM for the scratch buffer\n",
__func__, vram_scratch_base / kB, vram_scratch_per_context,
(vram_scratch + MB - 1) / MB); // round up
}
}
#endif // GGML_USE_CUBLAS
@ -2541,7 +2579,7 @@ struct llama_context * llama_new_context_with_model(
llama_context * ctx = new llama_context(*model, model->vocab);
if (params.seed < 0) {
if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = time(NULL);
}
@ -2974,8 +3012,8 @@ int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
#define LLAMA_MAX_RNG_STATE (64*1024)
void llama_set_rng_seed(struct llama_context * ctx, int seed) {
if (seed < 0) {
void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed) {
if (seed == LLAMA_DEFAULT_SEED) {
seed = time(NULL);
}
ctx->rng.seed(seed);
@ -3219,7 +3257,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
return nread;
}
bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
static bool llama_load_session_file_internal(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
llama_file file(path_session, "rb");
// sanity checks
@ -3273,6 +3311,15 @@ bool llama_load_session_file(struct llama_context * ctx, const char * path_sessi
return true;
}
bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
try {
return llama_load_session_file_internal(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
} catch (const std::exception & err) {
fprintf(stderr, "error loading session file: %s\n", err.what());
return false;
}
}
bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
llama_file file(path_session, "wb");

14
llama.h
View file

@ -46,6 +46,8 @@
#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
#define LLAMA_SESSION_VERSION 1
#define LLAMA_DEFAULT_SEED 0xFFFFFFFF
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
#define LLAMA_SUPPORTS_GPU_OFFLOAD
@ -81,11 +83,11 @@ extern "C" {
typedef void (*llama_progress_callback)(float progress, void *ctx);
struct llama_context_params {
int seed; // RNG seed, -1 for random
int n_ctx; // text context
int n_batch; // prompt processing batch size
int n_gpu_layers; // number of layers to store in VRAM
int main_gpu; // the GPU that is used for scratch and small tensors
uint32_t seed; // RNG seed, -1 for random
int32_t n_ctx; // text context
int32_t n_batch; // prompt processing batch size
int32_t n_gpu_layers; // number of layers to store in VRAM
int32_t main_gpu; // the GPU that is used for scratch and small tensors
float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
// called with a progress value between 0 and 1, pass NULL to disable
llama_progress_callback progress_callback;
@ -196,7 +198,7 @@ extern "C" {
LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
// Sets the current rng seed.
LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed);
LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed);
// Returns the maximum size in bytes of the state (rng, logits, embedding
// and kv_cache) - will often be smaller after compacting tokens