Add support for quantized models

commit 7136adac8a
parent ac3fbe492a

2 changed files with 179 additions and 6 deletions

ggml.c (170 changed lines)

@@ -5830,13 +5830,13 @@ static void ggml_compute_forward_add_f16_f32(
     const int n = ggml_nrows(src0);
     const int nc = src0->ne[0];

-    const size_t nb00 = src0->nb[0];
+    //const size_t nb00 = src0->nb[0];
     const size_t nb01 = src0->nb[1];

     const size_t nb10 = src1->nb[0];
     const size_t nb11 = src1->nb[1];

-    const size_t nb0 = dst->nb[0];
+    //const size_t nb0 = dst->nb[0];
     const size_t nb1 = dst->nb[1];

     GGML_ASSERT(src0->type == GGML_TYPE_F16);
@@ -5848,12 +5848,163 @@ static void ggml_compute_forward_add_f16_f32(
         ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + j*nb01);
         for (int i = 0; i < nc; i++) {
             float * src1_ptr = (float *) ((char *) src1->data + j*nb11 + i*nb10);

             dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + *src1_ptr);
         }
     }
 }

+static void ggml_compute_forward_add_f16_f16(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
+        struct ggml_tensor * dst) {
+    GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int n = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    //const size_t nb00 = src0->nb[0];
+    const size_t nb01 = src0->nb[1];
+
+    const size_t nb10 = src1->nb[0];
+    const size_t nb11 = src1->nb[1];
+
+    //const size_t nb0 = dst->nb[0];
+    const size_t nb1 = dst->nb[1];
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F16);
+    GGML_ASSERT(dst->type == GGML_TYPE_F16);
+
+    for (int j = ith; j < n; j += nth) {
+        ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + j*nb1);
+        ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + j*nb01);
+        for (int i = 0; i < nc; i++) {
+            ggml_fp16_t * src1_ptr = (ggml_fp16_t *) ((char *) src1->data + j*nb11 + i*nb10);
+            dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + GGML_FP16_TO_FP32(*src1_ptr));
+        }
+    }
+}
+
+static void ggml_compute_forward_add_q_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
+        struct ggml_tensor * dst) {
+    GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
+
+    //const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
+
+    const int64_t ne0 = dst->ne[0];
+    const int64_t ne1 = dst->ne[1];
+    const int64_t ne2 = dst->ne[2];
+    const int64_t ne3 = dst->ne[3];
+
+    const int nb00 = src0->nb[0];
+    const int nb01 = src0->nb[1];
+    const int nb02 = src0->nb[2];
+    const int nb03 = src0->nb[3];
+
+    const int nb10 = src1->nb[0];
+    const int nb11 = src1->nb[1];
+    const int nb12 = src1->nb[2];
+    const int nb13 = src1->nb[3];
+
+    const int nb0 = dst->nb[0];
+    const int nb1 = dst->nb[1];
+    const int nb2 = dst->nb[2];
+    const int nb3 = dst->nb[3];
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    GGML_ASSERT(ne02 == ne12);
+    GGML_ASSERT(ne03 == ne13);
+    GGML_ASSERT(ne2 == ne12);
+    GGML_ASSERT(ne3 == ne13);
+
+    const enum ggml_type type = src0->type;
+    dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q;
+    quantize_row_q_t const quantize_row_q = quantize_fns[type].quantize_row_q;
+
+    // we don't support permuted src0 or src1
+    GGML_ASSERT(nb00 == (int) GGML_TYPE_SIZE[type]);
+    GGML_ASSERT(nb10 == sizeof(float));
+
+    // dst cannot be transposed or permuted
+    GGML_ASSERT(nb0 <= nb1);
+    GGML_ASSERT(nb1 <= nb2);
+    GGML_ASSERT(nb2 <= nb3);
+
+    GGML_ASSERT(ne0 == ne01);
+    GGML_ASSERT(ne1 == ne11);
+    GGML_ASSERT(ne2 == ne02);
+    GGML_ASSERT(ne3 == ne03);
+
+    GGML_ASSERT(src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1);
+    GGML_ASSERT(dst->type == src0->type);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
+    // total rows in src0
+    const int nr = ne01*ne02*ne03;
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int ir = ir0; ir < ir1; ++ir) {
+        // src0 indices
+        const int i03 = ir/(ne02*ne01);
+        const int i02 = (ir - i03*ne02*ne01)/ne01;
+        const int i01 = (ir - i03*ne02*ne01 - i02*ne01);
+
+        // src1 and dst are same shape as src0 => same indices
+        const int i13 = i03;
+        const int i12 = i02;
+        const int i11 = i01;
+
+        const int i3 = i03;
+        const int i2 = i02;
+        const int i1 = i01;
+
+        void * src0_row = (void *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03));
+        float * src1_row = (float *)((char *) src1->data + (i11*nb11 + i12*nb12 + i13*nb13));
+        void * dst_row = (void *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb0));
+
+        assert(ne00 % 32 == 0);
+
+        // unquantize row from src0 to temp buffer
+        float tmp[ne00];
+        dequantize_row_q(src0_row, tmp, ne00);
+        // add src1
+        ggml_vec_acc_f32(ne00, tmp, src1_row);
+        // quantize row to dst
+        quantize_row_q(tmp, dst_row, ne00);
+    }
+}
+
 static void ggml_compute_forward_add(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
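Note: the new ggml_compute_forward_add_q_f32 above splits the rows of src0 across threads and then recovers the (i01, i02, i03) coordinates from the flat row index. A minimal standalone sketch of that arithmetic, with made-up tensor sizes and thread count (not values from the commit):

    #include <stdio.h>

    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    int main(void) {
        const int ne01 = 4096, ne02 = 2, ne03 = 1; // example tensor dims (rows, dim2, dim3)
        const int nth  = 8;                        // example thread count

        const int nr = ne01*ne02*ne03;             // total rows
        const int dr = (nr + nth - 1)/nth;         // rows per thread, rounded up

        for (int ith = 0; ith < nth; ++ith) {
            const int ir0 = dr*ith;                // first row for this thread
            const int ir1 = MIN(ir0 + dr, nr);     // one past the last row
            printf("thread %d: rows [%d, %d)\n", ith, ir0, ir1);
        }

        // recover per-dimension indices from a flat row index, as the kernel does
        const int ir  = 5000;
        const int i03 = ir/(ne02*ne01);
        const int i02 = (ir - i03*ne02*ne01)/ne01;
        const int i01 = (ir - i03*ne02*ne01 - i02*ne01);
        printf("ir=%d -> i01=%d i02=%d i03=%d\n", ir, i01, i02, i03); // prints 904, 1, 0
        return 0;
    }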
@@ -5866,7 +6017,20 @@ static void ggml_compute_forward_add(
             } break;
         case GGML_TYPE_F16:
             {
+                if (src1->type == GGML_TYPE_F16) {
+                    ggml_compute_forward_add_f16_f16(params, src0, src1, dst);
+                }
+                else if (src1->type == GGML_TYPE_F32) {
                 ggml_compute_forward_add_f16_f32(params, src0, src1, dst);
+                }
+                else {
+                    GGML_ASSERT(false);
+                }
+            } break;
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+            {
+                ggml_compute_forward_add_q_f32(params, src0, src1, dst);
             } break;
         default:
             {
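With the dispatch change above, ggml_add now accepts a Q4_0 or Q4_1 src0 together with an F32 src1 and produces a dst of the same quantized type: each row is dequantized, accumulated in float, and re-quantized. A minimal usage sketch, assuming the ggml public API as of this commit (ggml_init, ggml_new_tensor_1d, ggml_add, ggml_build_forward, ggml_graph_compute); the buffer size, tensor length, and thread count are illustrative only:

    #include <string.h>
    #include "ggml.h"

    int main(void) {
        struct ggml_init_params ip = { .mem_size = 16*1024*1024, .mem_buffer = NULL };
        struct ggml_context * ctx = ggml_init(ip);

        // a: quantized weights, b: float32 values of the same shape (zero-filled here)
        struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_Q4_0, 4096);
        struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4096);
        memset(a->data, 0, ggml_nbytes(a));
        memset(b->data, 0, ggml_nbytes(b));

        // dst inherits src0's type, so c is Q4_0 and is computed by the new quantized add path
        struct ggml_tensor * c = ggml_add(ctx, a, b);

        struct ggml_cgraph gf = ggml_build_forward(c);
        gf.n_threads = 4;
        ggml_graph_compute(ctx, &gf);

        ggml_free(ctx);
        return 0;
    }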
llama.cpp (13 changed lines)

@@ -1887,14 +1887,23 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
                 return 1;
             }

-            // w = w + BA
+            // w = w + BA*s
             ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraB, loraA);
-            ggml_tensor * r = ggml_add_inplace(lora_ctx, tensor, BA);
+
+            //if (true) {
+            //    ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, 1.0f);
+            //    BA = ggml_scale(lora_ctx, BA, scale_tensor);
+            //}
+            ggml_tensor * r = ggml_add(lora_ctx, tensor, BA);
+            //r = ggml_cpy(lora_ctx, r, tensor);

             struct ggml_cgraph gf = ggml_build_forward(r);
             gf.n_threads = n_threads;
             ggml_graph_compute(lora_ctx, &gf);

+            // hack until ggml_cpy supports quantized tensors
+            memcpy(tensor->data, r->data, ggml_nbytes(tensor));
+
             // we won't need these tensors again, reset the context to save memory
             ggml_free(lora_ctx);
             lora_ctx = ggml_init(params);
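The llama.cpp change above is what the quantized add enables: the LoRA delta BA = loraB·loraA is computed in f32 and merged into the weight as w' = w + BA (the updated comment anticipates a scale factor s, but the scaling code is still commented out here, so effectively s = 1). Because the weight may now be Q4_0/Q4_1, the merged result is copied back with a raw memcpy until ggml_cpy supports quantized destinations. If the commented-out scaling were enabled, it would presumably look like the fragment below, reusing lora_ctx, BA, and tensor from the hunk above; the scale value (for LoRA typically lora_alpha/r) is an assumption, not something this commit sets:

    const float s = 1.0f; // assumed scale; with 1.0f this matches what the commit computes
    ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, s);
    BA = ggml_scale(lora_ctx, BA, scale_tensor);
    ggml_tensor * r = ggml_add(lora_ctx, tensor, BA);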