Merge branch 'master' into speculative-tree

This commit is contained in:
Georgi Gerganov 2023-10-17 17:24:11 +03:00 committed by GitHub
commit 010c52ec59
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
9 changed files with 95 additions and 56 deletions

View file

@ -98,6 +98,8 @@ gguf_writer.add_embedding_length(hparams["d_model"])
gguf_writer.add_block_count(block_count)
gguf_writer.add_feed_forward_length(4 * hparams["d_model"])
gguf_writer.add_head_count(hparams["n_heads"])
if kv_n_heads := hparams["attn_config"].get("kv_n_heads"):
gguf_writer.add_head_count_kv(kv_n_heads)
gguf_writer.add_layer_norm_eps(1e-05)
if hparams["attn_config"]["clip_qkv"] is not None:
gguf_writer.add_clamp_kqv(hparams["attn_config"]["clip_qkv"])

View file

@ -529,13 +529,14 @@ static void init_lora(const struct my_llama_model * model, struct my_llama_lora
set_param_lora(lora);
// measure data size
struct ggml_allocr * alloc = NULL;
alloc = ggml_allocr_new_measure(tensor_alignment);
alloc_lora(alloc, lora);
size_t size = 0;
for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
size += GGML_PAD(ggml_nbytes(t), tensor_alignment);
}
// allocate data
lora->data.resize(ggml_allocr_max_size(alloc) + tensor_alignment);
ggml_allocr_free(alloc);
struct ggml_allocr * alloc = NULL;
lora->data.resize(size + tensor_alignment);
alloc = ggml_allocr_new(lora->data.data(), lora->data.size(), tensor_alignment);
alloc_lora(alloc, lora);
ggml_allocr_free(alloc);
@ -1714,11 +1715,9 @@ int main(int argc, char ** argv) {
struct ggml_tensor * target_probs = ggml_new_tensor_3d(ctx_input, GGML_TYPE_F32, n_vocab, n_tokens, n_batch);
// measure required memory for input tensors
alloc = ggml_allocr_new_measure(tensor_alignment);
ggml_allocr_alloc(alloc, tokens_input);
ggml_allocr_alloc(alloc, target_probs);
size_t max_input_size = ggml_allocr_max_size(alloc) + tensor_alignment;
ggml_allocr_free(alloc);
size_t max_input_size = GGML_PAD(ggml_nbytes(tokens_input), tensor_alignment) +
GGML_PAD(ggml_nbytes(target_probs), tensor_alignment) +
tensor_alignment;
printf("%s: input_size = %zu bytes (%.1f MB)\n", __func__, max_input_size, (float) max_input_size / (1024.0f*1024.0f));
// allocate input tensors

View file

@ -49,9 +49,9 @@ inline bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) {
return eval_tokens(ctx_llama, tokens, 1, n_past);
}
inline bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past){
inline bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){
std::string str2 = str;
std::vector<llama_token> embd_inp = ::llama_tokenize(ctx_llama, str2, true);
std::vector<llama_token> embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos);
eval_tokens(ctx_llama, embd_inp, n_batch, n_past);
return true;
}

View file

@ -79,7 +79,13 @@ int main(int argc, char ** argv) {
llama_backend_init(params.numa);
llama_model_params model_params = llama_model_default_params();
llama_model_params model_params = llama_model_default_params();
model_params.n_gpu_layers = params.n_gpu_layers;
model_params.main_gpu = params.main_gpu;
model_params.tensor_split = params.tensor_split;
model_params.use_mmap = params.use_mmap;
model_params.use_mlock = params.use_mlock;
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
if (model == NULL) {
fprintf(stderr , "%s: error: unable to load model\n" , __func__);
@ -91,6 +97,7 @@ int main(int argc, char ** argv) {
ctx_params.n_ctx = params.n_ctx < 2048 ? 2048 : params.n_ctx; // we need a longer context size to process image embeddings
ctx_params.n_threads = params.n_threads;
ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
ctx_params.seed = params.seed;
llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
@ -100,7 +107,8 @@ int main(int argc, char ** argv) {
}
// make sure that the correct mmproj was used, i.e., compare apples to apples
int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama));
const int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama));
if (n_img_embd != n_llama_embd) {
printf("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_img_embd, n_llama_embd);
@ -119,14 +127,14 @@ int main(int argc, char ** argv) {
const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
// GG: are we sure that there should be a trailing whitespace at the end of this string?
eval_string(ctx_llama, "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER: ", params.n_batch, &n_past);
eval_string(ctx_llama, "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:", params.n_batch, &n_past, true);
eval_image_embd(ctx_llama, image_embd, n_img_pos, params.n_batch, &n_past);
eval_string(ctx_llama, params.prompt.c_str(), params.n_batch, &n_past);
eval_string(ctx_llama, "\nASSISTANT:", params.n_batch, &n_past);
eval_string(ctx_llama, (params.prompt + "\nASSISTANT:").c_str(), params.n_batch, &n_past, false);
// generate the response
printf("\n");
printf("prompt: '%s'\n", params.prompt.c_str());
printf("\n");
for (int i = 0; i < max_tgt_len; i++) {

View file

@ -19,7 +19,7 @@
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
#define CL_DMMV_BLOCK_SIZE 32
#define CL_DMMV_LOCAL_SIZE 32
#ifndef K_QUANTS_PER_ITERATION
#define K_QUANTS_PER_ITERATION 1
@ -338,7 +338,7 @@ __kernel void dequantize_mul_mat_vec_q2_K(__global const struct block_q2_K * xx,
const int row = get_group_id(0);
const int num_blocks_per_row = ncols / QK_K;
const int ib0 = row*num_blocks_per_row;
const int ib0 = row*num_blocks_per_row + get_global_offset(0);
__global const struct block_q2_K * x = xx + ib0;
@ -413,7 +413,7 @@ __kernel void dequantize_mul_mat_vec_q3_K(__global const struct block_q3_K * xx,
const int row = get_group_id(0);
const int num_blocks_per_row = ncols / QK_K;
const int ib0 = row*num_blocks_per_row;
const int ib0 = row*num_blocks_per_row + get_global_offset(0);
__global const struct block_q3_K * x = xx + ib0;
@ -489,7 +489,7 @@ __kernel void dequantize_mul_mat_vec_q4_K(__global const struct block_q4_K * xx,
const int row = get_group_id(0);
const int num_blocks_per_row = ncols / QK_K;
const int ib0 = row*num_blocks_per_row;
const int ib0 = row*num_blocks_per_row + get_global_offset(0);
const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION; // 0...15
const int ix = get_local_id(0)%K_QUANTS_PER_ITERATION;
@ -562,7 +562,7 @@ __kernel void dequantize_mul_mat_vec_q5_K(__global const struct block_q5_K * xx,
const int row = get_group_id(0);
const int num_blocks_per_row = ncols / QK_K;
const int ib0 = row*num_blocks_per_row;
const int ib0 = row*num_blocks_per_row + get_global_offset(0);
const int tid = get_local_id(0)/2; // 0...15
const int ix = get_local_id(0)%2;
@ -641,7 +641,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
const int row = get_group_id(0);
const int num_blocks_per_row = ncols / QK_K;
const int ib0 = row*num_blocks_per_row;
const int ib0 = row*num_blocks_per_row + get_global_offset(0);
__global const struct block_q6_K * x = xx + ib0;
@ -745,19 +745,21 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __global float* y) {
std::string dequant_mul_mat_vec_template = MULTILINE_QUOTE(
__kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) {
const int block_size = get_local_size(0);
const int local_size = get_local_size(0);
const int row = get_group_id(0);
const int tid = get_local_id(0);
const uint qk = QUANT_K;
const uint qr = QUANT_R;
const int col_step = local_size * 2;
const int y_offset = qr == 1 ? 1 : qk/2;
x += get_global_offset(0);
tmp[tid] = 0;
for (int i = 0; i < ncols/block_size; i += 2) {
const int col = i*block_size + 2*tid;
for (int col = tid*2; col < ncols; col += col_step) {
const int ib = (row*ncols + col)/qk; // block index
const int iqs = (col%qk)/qr; // quant index
const int iybs = col - col%qk; // y block start index
@ -773,7 +775,7 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float
// sum up partial sums and write back result
barrier(CLK_LOCAL_MEM_FENCE);
for (int s=block_size/2; s>0; s>>=1) {
for (int s=local_size/2; s>0; s>>=1) {
if (tid < s) {
tmp[tid] += tmp[tid + s];
}
@ -1704,7 +1706,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
const int nb2 = dst->nb[2];
const int nb3 = dst->nb[3];
const ggml_type type = src0->type;
const bool mul_mat_vec = ne11 == 1;
const bool mul_mat_vec = ne11 == 1 && ne00%2 == 0;
const int64_t r2 = ne12 / ne02;
const int64_t r3 = ne13 / ne03;
@ -1737,7 +1739,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
GGML_ASSERT(to_fp32_cl != nullptr);
const size_t global_denom = ggml_cl_global_denom(type);
const size_t local = ggml_cl_local_size(type);
const size_t local = mul_mat_vec ? CL_DMMV_LOCAL_SIZE : ggml_cl_local_size(type);
size_t ev_idx = 0;
std::vector<cl_event> events;
@ -1770,8 +1772,8 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++));
// compute
const size_t global = ne01 * CL_DMMV_BLOCK_SIZE;
const size_t local = CL_DMMV_BLOCK_SIZE;
const size_t global = ne01 * local;
const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
const cl_int ncols = ne00;
events.emplace_back();
CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
@ -1779,7 +1781,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y));
CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D));
CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols));
CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, NULL, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, &offset, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
} else { // general dequantization kernel + CLBlast matrix matrix multiplication
// convert src0 to fp32 on device
const size_t global = x_ne / global_denom;

34
ggml.c
View file

@ -5494,6 +5494,39 @@ struct ggml_tensor * ggml_view_tensor(
return result;
}
struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx) {
struct ggml_object * obj = ctx->objects_begin;
char * const mem_buffer = ctx->mem_buffer;
while (obj != NULL) {
if (obj->type == GGML_OBJECT_TENSOR) {
return (struct ggml_tensor *)(mem_buffer + obj->offs);
}
obj = obj->next;
}
return NULL;
}
struct ggml_tensor * ggml_get_next_tensor(struct ggml_context * ctx, struct ggml_tensor * tensor) {
struct ggml_object * obj = (struct ggml_object *) ((char *)tensor - GGML_OBJECT_SIZE);
obj = obj->next;
char * const mem_buffer = ctx->mem_buffer;
while (obj != NULL) {
if (obj->type == GGML_OBJECT_TENSOR) {
return (struct ggml_tensor *)(mem_buffer + obj->offs);
}
obj = obj->next;
}
return NULL;
}
struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name) {
struct ggml_object * obj = ctx->objects_begin;
@ -8647,6 +8680,7 @@ void ggml_set_param(
GGML_ASSERT(tensor->grad == NULL);
tensor->grad = ggml_dup_tensor(ctx, tensor);
ggml_format_name(tensor->grad, "%s (grad)", tensor->name);
}
// ggml_compute_forward_dup

3
ggml.h
View file

@ -704,6 +704,9 @@ extern "C" {
GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src);
// Context tensor enumeration and lookup
GGML_API struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx);
GGML_API struct ggml_tensor * ggml_get_next_tensor (struct ggml_context * ctx, struct ggml_tensor * tensor);
GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);

View file

@ -462,12 +462,9 @@ void quantize_row_q2_K(const float * restrict x, void * restrict vy, int k) {
}
size_t ggml_quantize_q2_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
const int nb = k / QK_K;
(void)hist; // TODO: collect histograms
// TODO - collect histograms - although, at a second thought, I don't really care about them
(void)hist;
for (int j = 0; j < nb; j += k) {
for (int j = 0; j < n; j += k) {
block_q2_K * restrict y = (block_q2_K *)dst + j/QK_K;
quantize_row_q2_K_reference(src + j, y, k);
}
@ -678,12 +675,9 @@ void quantize_row_q3_K(const float * restrict x, void * restrict vy, int k) {
}
size_t ggml_quantize_q3_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
const int nb = k / QK_K;
(void)hist; // TODO: collect histograms
// TODO - collect histograms - although, at a second thought, I don't really care about them
(void)hist;
for (int j = 0; j < nb; j += k) {
for (int j = 0; j < n; j += k) {
block_q3_K * restrict y = (block_q3_K *)dst + j/QK_K;
quantize_row_q3_K_reference(src + j, y, k);
}
@ -846,9 +840,9 @@ void quantize_row_q4_K(const float * restrict x, void * restrict vy, int k) {
size_t ggml_quantize_q4_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
assert(k % QK_K == 0);
const int nb = k / QK_K;
(void)hist; // TODO: collect histograms
for (int j = 0; j < nb; j += k) {
for (int j = 0; j < n; j += k) {
block_q4_K * restrict y = (block_q4_K *)dst + j/QK_K;
quantize_row_q4_K_reference(src + j, y, k);
}
@ -1052,9 +1046,9 @@ void quantize_row_q5_K(const float * restrict x, void * restrict vy, int k) {
size_t ggml_quantize_q5_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
assert(k % QK_K == 0);
const int nb = k / QK_K;
(void)hist;
for (int j = 0; j < nb; j += k) {
(void)hist; // TODO: collect histograms
for (int j = 0; j < n; j += k) {
block_q5_K * restrict y = (block_q5_K *)dst + j/QK_K;
quantize_row_q5_K_reference(src + j, y, k);
}
@ -1200,11 +1194,9 @@ void quantize_row_q6_K(const float * restrict x, void * restrict vy, int k) {
size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist) {
assert(k % QK_K == 0);
const int nb = k / QK_K;
(void)hist; // TODO: collect histograms
(void)hist; // TODO
for (int j = 0; j < nb; j += k) {
for (int j = 0; j < n; j += k) {
block_q6_K * restrict y = (block_q6_K *)dst + j/QK_K;
quantize_row_q6_K_reference(src + j, y, k);
}

View file

@ -2845,8 +2845,8 @@ static void llm_load_tensors(
auto & layer = model.layers[i];
layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, 3*n_embd}, backend_split);
layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
@ -5374,7 +5374,7 @@ static struct ggml_cgraph * llm_build_mpt(
const int64_t n_layer = hparams.n_layer;
const int64_t n_ctx = cparams.n_ctx;
const int64_t n_head = hparams.n_head;
const int64_t n_head_kv = hparams.n_head_kv; // == n_head for MPT, as there's no MQA/GQA
const int64_t n_head_kv = hparams.n_head_kv;
const int64_t n_embd_head = hparams.n_embd_head();
const int64_t n_embd_gqa = hparams.n_embd_gqa();
@ -5727,7 +5727,6 @@ static struct ggml_cgraph * llama_build_graph(
//
// - lctx: llama context
// - batch: batch to evaluate
// - n_threads: number of threads to use
//
// return 0 on success
// return positive int on warning