diff --git a/ggml.c b/ggml.c
index a486cad67..b9bfa2f5e 100644
--- a/ggml.c
+++ b/ggml.c
@@ -5830,13 +5830,13 @@ static void ggml_compute_forward_add_f16_f32(
     const int n = ggml_nrows(src0);
     const int nc = src0->ne[0];
 
-    const size_t nb00 = src0->nb[0];
+    //const size_t nb00 = src0->nb[0];
     const size_t nb01 = src0->nb[1];
 
     const size_t nb10 = src1->nb[0];
     const size_t nb11 = src1->nb[1];
 
-    const size_t nb0 = dst->nb[0];
+    //const size_t nb0 = dst->nb[0];
     const size_t nb1 = dst->nb[1];
 
     GGML_ASSERT(src0->type == GGML_TYPE_F16);
@@ -5848,12 +5848,169 @@ static void ggml_compute_forward_add_f16_f32(
         ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + j*nb01);
         for (int i = 0; i < nc; i++) {
             float * src1_ptr = (float *) ((char *) src1->data + j*nb11 + i*nb10);
-
             dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + *src1_ptr);
         }
     }
 }
 
+// dst = src0 + src1 where src0, src1 and dst are all F16 and have the same shape.
+// Each pair of elements is converted to F32, summed, and converted back to F16; thread ith handles rows ith, ith+nth, ...
+static void ggml_compute_forward_add_f16_f16(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
+        struct ggml_tensor * dst) {
+    GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int n = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    //const size_t nb00 = src0->nb[0];
+    const size_t nb01 = src0->nb[1];
+
+    const size_t nb10 = src1->nb[0];
+    const size_t nb11 = src1->nb[1];
+
+    //const size_t nb0 = dst->nb[0];
+    const size_t nb1 = dst->nb[1];
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F16);
+    GGML_ASSERT(dst->type == GGML_TYPE_F16);
+
+    for (int j = ith; j < n; j += nth) {
+        ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + j*nb1);
+        ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + j*nb01);
+        for (int i = 0; i < nc; i++) {
+            ggml_fp16_t * src1_ptr = (ggml_fp16_t *) ((char *) src1->data + j*nb11 + i*nb10);
+            dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + GGML_FP16_TO_FP32(*src1_ptr));
+        }
+    }
+}
+
+// dst = src0 + src1 where src0 and dst are quantized (Q4_0 or Q4_1), src1 is F32, and all three have the same shape.
+// Each row of src0 is dequantized into a temporary F32 buffer, the matching src1 row is accumulated into it,
+// and the sum is quantized into the matching dst row. Rows are split into contiguous ranges, one range per thread.
+static void ggml_compute_forward_add_q_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
+        struct ggml_tensor * dst) {
+    GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
+
+    //const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
+
+    const int64_t ne0 = dst->ne[0];
+    const int64_t ne1 = dst->ne[1];
+    const int64_t ne2 = dst->ne[2];
+    const int64_t ne3 = dst->ne[3];
+
+    const int nb00 = src0->nb[0];
+    const int nb01 = src0->nb[1];
+    const int nb02 = src0->nb[2];
+    const int nb03 = src0->nb[3];
+
+    const int nb10 = src1->nb[0];
+    const int nb11 = src1->nb[1];
+    const int nb12 = src1->nb[2];
+    const int nb13 = src1->nb[3];
+
+    const int nb0 = dst->nb[0];
+    const int nb1 = dst->nb[1];
+    const int nb2 = dst->nb[2];
+    const int nb3 = dst->nb[3];
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    GGML_ASSERT(ne02 == ne12);
+    GGML_ASSERT(ne03 == ne13);
+    GGML_ASSERT(ne2 == ne12);
+    GGML_ASSERT(ne3 == ne13);
+
+    const enum ggml_type type = src0->type;
+    dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q;
+    quantize_row_q_t const quantize_row_q = quantize_fns[type].quantize_row_q;
+
+    // we don't support permuted src0 or src1
+    GGML_ASSERT(nb00 == (int) GGML_TYPE_SIZE[type]);
+    GGML_ASSERT(nb10 == sizeof(float));
+
+    // dst cannot be transposed or permuted
+    GGML_ASSERT(nb0 <= nb1);
+    GGML_ASSERT(nb1 <= nb2);
+    GGML_ASSERT(nb2 <= nb3);
+
+    // element-wise op: dst matches src0 in every dimension (row length ne0 == ne00, not ne01 as in mul_mat)
+    GGML_ASSERT(ne0 == ne00);
+    GGML_ASSERT(ne1 == ne11);
+    GGML_ASSERT(ne2 == ne02);
+    GGML_ASSERT(ne3 == ne03);
+
+    GGML_ASSERT(src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1);
+    GGML_ASSERT(dst->type == src0->type);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
+    // total rows in src0
+    const int nr = ne01*ne02*ne03;
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int ir = ir0; ir < ir1; ++ir) {
+        // src0 indices
+        const int i03 = ir/(ne02*ne01);
+        const int i02 = (ir - i03*ne02*ne01)/ne01;
+        const int i01 = (ir - i03*ne02*ne01 - i02*ne01);
+
+        // src1 and dst are same shape as src0 => same indices
+        const int i13 = i03;
+        const int i12 = i02;
+        const int i11 = i01;
+
+        const int i3 = i03;
+        const int i2 = i02;
+        const int i1 = i01;
+
+        void * src0_row = (void *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03));
+        float * src1_row = (float *)((char *) src1->data + (i11*nb11 + i12*nb12 + i13*nb13));
+        void * dst_row = (void *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3));
+
+        assert(ne00 % 32 == 0);
+
+        // unquantize row from src0 to temp buffer
+        float tmp[ne00];
+        dequantize_row_q(src0_row, tmp, ne00);
+        // add src1
+        ggml_vec_acc_f32(ne00, tmp, src1_row);
+        // quantize row to dst
+        quantize_row_q(tmp, dst_row, ne00);
+    }
+}
+
 static void ggml_compute_forward_add(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
@@ -5866,7 +6023,20 @@ static void ggml_compute_forward_add(
             } break;
         case GGML_TYPE_F16:
             {
-                ggml_compute_forward_add_f16_f32(params, src0, src1, dst);
+                if (src1->type == GGML_TYPE_F16) {
+                    ggml_compute_forward_add_f16_f16(params, src0, src1, dst);
+                }
+                else if (src1->type == GGML_TYPE_F32) {
+                    ggml_compute_forward_add_f16_f32(params, src0, src1, dst);
+                }
+                else {
+                    GGML_ASSERT(false);
+                }
+            } break;
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+            {
+                ggml_compute_forward_add_q_f32(params, src0, src1, dst);
             } break;
         default:
             {
diff --git a/llama.cpp b/llama.cpp
index bb7d3e2d9..4fcd2ecfb 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1887,14 +1887,23 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
                 return 1;
             }
 
-            // w = w + BA
+            // w = w + BA*s (scaling by s is currently disabled, see the commented-out ggml_scale block below)
             ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraB, loraA);
-            ggml_tensor * r = ggml_add_inplace(lora_ctx, tensor, BA);
+
+            //if (true) {
+            //    ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, 1.0f);
+            //    BA = ggml_scale(lora_ctx, BA, scale_tensor);
+            //}
+            ggml_tensor * r = ggml_add(lora_ctx, tensor, BA);
+            //r = ggml_cpy(lora_ctx, r, tensor);
 
             struct ggml_cgraph gf = ggml_build_forward(r);
             gf.n_threads = n_threads;
             ggml_graph_compute(lora_ctx, &gf);
 
+            // hack until ggml_cpy supports quantized tensors: copy the bytes of the result r back over tensor's data
+            memcpy(tensor->data, r->data, ggml_nbytes(tensor));
+
             // we won't need these tensors again, reset the context to save memory
             ggml_free(lora_ctx);
             lora_ctx = ggml_init(params);