From 73ac18d856f3de32e724ef87c69dfef82a31610d Mon Sep 17 00:00:00 2001
From: xaedes
Date: Mon, 1 May 2023 02:39:54 +0200
Subject: [PATCH] implement 8 of 14 missing backward pass operations used by
 llama

- GGML_OP_ADD_AT
- GGML_OP_CPY
- GGML_OP_MUL_MAT (src0.grad)
- GGML_OP_PERMUTE
- GGML_OP_RESHAPE
- GGML_OP_SCALE
- GGML_OP_TRANSPOSE
- GGML_OP_VIEW

implement additional ggml operation GGML_OP_ADD_AT, which is necessary for
the backward pass of GGML_OP_VIEW.

this operation adds src1 to src0 at the given data offset, i.e. to
view(src0, ..., offset). the values are returned in a tensor of the size of
src0. values outside of [data+offset:data+offset+nbytes(src1)] are just the
original values from src0.

still missing backward passes for llama:

- GGML_OP_DIAG_MASK_INF
- GGML_OP_GET_ROWS
- GGML_OP_RMS_NORM
- GGML_OP_ROPE
- GGML_OP_SILU
- GGML_OP_SOFT_MAX
---
 ggml.c | 595 +++++++++++++++++++++++++++++++++++++++++++++++++++++----
 ggml.h |  13 ++
 2 files changed, 568 insertions(+), 40 deletions(-)

diff --git a/ggml.c b/ggml.c
index 8cc48344e..4dbaabb59 100644
--- a/ggml.c
+++ b/ggml.c
@@ -4966,6 +4966,47 @@ struct ggml_tensor * ggml_add_inplace(
     return ggml_add_impl(ctx, a, b, true);
 }
 
+struct ggml_tensor * ggml_add_at_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        size_t offset,
+        bool inplace) {
+    GGML_ASSERT(ggml_are_same_shape(a, b));
+
+    bool is_node = false;
+
+    if (!inplace && (a->grad || b->grad)) {
+        is_node = true;
+    }
+
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    result->op   = GGML_OP_ADD_AT;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src0 = a;
+    result->src1 = b;
+    memcpy(result->padding, &offset, sizeof(size_t));
+
+    return result;
+}
+
+struct ggml_tensor * ggml_add_at(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        size_t offset) {
+    return ggml_add_at_impl(ctx, a, b, offset, false);
+}
+
+struct ggml_tensor * ggml_add_at_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        size_t offset) {
+    return ggml_add_at_impl(ctx, a, b, offset, true);
+}
+
 // ggml_sub
 
 struct ggml_tensor * ggml_sub_impl(
@@ -5577,7 +5618,6 @@ struct ggml_tensor * ggml_scale_impl(
     bool is_node = false;
 
     if (!inplace && (a->grad || b->grad)) {
-        GGML_ASSERT(false); // TODO: implement backward
         is_node = true;
     }
 
@@ -5619,7 +5659,6 @@ struct ggml_tensor * ggml_cpy_impl(
     bool is_node = false;
 
     if (!inplace && (a->grad || b->grad)) {
-        GGML_ASSERT(false); // TODO: implement backward
         is_node = true;
     }
 
@@ -5695,11 +5734,15 @@ struct ggml_tensor * ggml_reshape(
 
     bool is_node = false;
 
-    if (a->grad || b->grad) {
-        GGML_ASSERT(false); // TODO: implement backward
+    if (a->grad) {
         is_node = true;
     }
 
+    if (b->grad) {
+        // gradient propagation is not supported
+        GGML_ASSERT(false);
+    }
+
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a->data);
 
     result->op   = GGML_OP_RESHAPE;
@@ -5721,7 +5764,6 @@ struct ggml_tensor * ggml_reshape_2d(
     bool is_node = false;
 
     if (a->grad) {
-        GGML_ASSERT(false); // TODO: implement backward
         is_node = true;
     }
 
@@ -5748,7 +5790,6 @@ struct ggml_tensor * ggml_reshape_3d(
     bool is_node = false;
 
     if (a->grad) {
-        GGML_ASSERT(false); // TODO: implement backward
         is_node = true;
     }
 
@@ -5770,16 +5811,23 @@ struct ggml_tensor * ggml_view_1d(
         struct ggml_tensor * a,
         int64_t ne0,
         size_t offset) {
+
+    bool is_node = false;
+
     if (a->grad) {
-        GGML_ASSERT(false); // gradient propagation is not supported
+        is_node = true;
     }
 
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, &ne0, (char *) a->data + offset);
 
     result->op   = GGML_OP_VIEW;
-    result->grad = NULL;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src0 = a;
-    result->src1 = NULL; // TODO: maybe store the offset here?
+    result->src1 = NULL;
+
+    if (is_node) {
+        memcpy(result->padding, &offset, sizeof(size_t));
+    }
 
     return result;
 }
@@ -5793,8 +5841,11 @@ struct ggml_tensor * ggml_view_2d(
         int64_t ne1,
         size_t nb1,
         size_t offset) {
+
+    bool is_node = false;
+
     if (a->grad) {
-        GGML_ASSERT(false); // gradient propagation is not supported
+        is_node = true;
     }
 
     const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, 1, 1 };
@@ -5806,9 +5857,13 @@ struct ggml_tensor * ggml_view_2d(
     result->nb[3] = result->nb[2];
 
     result->op   = GGML_OP_VIEW;
-    result->grad = NULL;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src0 = a;
-    result->src1 = NULL; // TODO: maybe store the offset here?
+    result->src1 = NULL;
+
+    if (is_node) {
+        memcpy(result->padding, &offset, sizeof(size_t));
+    }
 
     return result;
 }
@@ -5824,8 +5879,11 @@ struct ggml_tensor * ggml_view_3d(
         size_t nb1,
         size_t nb2,
         size_t offset) {
+
+    bool is_node = false;
+
     if (a->grad) {
-        GGML_ASSERT(false); // gradient propagation is not supported
+        is_node = true;
     }
 
     const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, 1 };
@@ -5837,9 +5895,13 @@ struct ggml_tensor * ggml_view_3d(
     result->nb[3] = result->nb[2]*ne2;
 
     result->op   = GGML_OP_VIEW;
-    result->grad = NULL;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src0 = a;
-    result->src1 = NULL; // TODO: maybe store the offset here?
+    result->src1 = NULL;
+
+    if (is_node) {
+        memcpy(result->padding, &offset, sizeof(size_t));
+    }
 
     return result;
 }
@@ -5868,7 +5930,6 @@ struct ggml_tensor * ggml_permute(
     bool is_node = false;
 
     if (a->grad) {
-        GGML_ASSERT(false); // TODO: implement backward
         is_node = true;
     }
 
@@ -5900,7 +5961,14 @@ struct ggml_tensor * ggml_permute(
     result->op   = GGML_OP_PERMUTE;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src0 = a;
-    result->src1 = NULL; // TODO: maybe store the permutation here?
+    result->src1 = NULL;
+
+    if (is_node) {
+        result->padding[0] = axis0;
+        result->padding[1] = axis1;
+        result->padding[2] = axis2;
+        result->padding[3] = axis3;
+    }
 
     return result;
 }
@@ -5913,7 +5981,6 @@ struct ggml_tensor * ggml_transpose(
     bool is_node = false;
 
     if (a->grad) {
-        GGML_ASSERT(false); // TODO: implement backward
         is_node = true;
     }
 
@@ -7206,6 +7273,318 @@ static void ggml_compute_forward_add(
     }
 }
 
+
+// ggml_compute_forward_add_at
+
+static void ggml_compute_forward_add_at_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
+        struct ggml_tensor * dst,
+        size_t offset) {
+    // GGML_ASSERT(ggml_are_same_shape(src0, src1)); // TODO: assert that offset+len(src1) <= len(src0)
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    const size_t nb00 = src0->nb[0];
+    const size_t nb01 = src0->nb[1];
+
+    const size_t nb10 = src1->nb[0];
+    const size_t nb11 = src1->nb[1];
+
+    const size_t nb0 = dst->nb[0];
+    const size_t nb1 = dst->nb[1];
+
+    GGML_ASSERT( nb0 == sizeof(float));
+    GGML_ASSERT(nb00 == sizeof(float));
+
+    if (nb10 == sizeof(float)) {
+        for (int j = ith; j < n; j += nth) {
+#ifdef GGML_USE_ACCELERATE
+            vDSP_vadd(
+                    (float *) ((char *) src0->data + j*nb01 + offset), 1,
+                    (float *) ((char *) src1->data + j*nb11), 1,
+                    (float *) ((char *) dst->data  + j*nb1 + offset), 1, nc);
+#else
+            ggml_vec_add_f32(nc,
+                    (float *) ((char *) dst->data  + j*nb1 + offset),
+                    (float *) ((char *) src0->data + j*nb01 + offset),
+                    (float *) ((char *) src1->data + j*nb11));
+#endif
+        }
+    } else {
+        // src1 is not contiguous
+        for (int j = ith; j < n; j += nth) {
+            float * dst_ptr  = (float *) ((char *) dst->data  + j*nb1 + offset);
+            float * src0_ptr = (float *) ((char *) src0->data + j*nb01 + offset);
+            for (int i = 0; i < nc; i++) {
+                float * src1_ptr = (float *) ((char *) src1->data + j*nb11 + i*nb10);
+
+                dst_ptr[i] = src0_ptr[i] + *src1_ptr;
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_add_at_f16_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
+        struct ggml_tensor * dst,
+        size_t offset) {
+    // GGML_ASSERT(ggml_are_same_shape(src0, src1)); // TODO: assert that offset+len(src1) <= len(src0)
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    const size_t nb00 = src0->nb[0];
+    const size_t nb01 = src0->nb[1];
+
+    const size_t nb10 = src1->nb[0];
+    const size_t nb11 = src1->nb[1];
+
+    const size_t nb0 = dst->nb[0];
+    const size_t nb1 = dst->nb[1];
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type  == GGML_TYPE_F16);
+
+    GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
+    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
+
+    if (nb10 == sizeof(float)) {
+        for (int j = ith; j < n; j += nth) {
+            ggml_fp16_t * dst_ptr  = (ggml_fp16_t *) ((char *) dst->data  + j*nb1 + offset);
+            ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + j*nb01 + offset);
+            for (int i = 0; i < nc; i++) {
+                float * src1_ptr = (float *) ((char *) src1->data + j*nb11 + i*nb10);
+                dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + *src1_ptr);
+            }
+        }
+    }
+    else {
+        // src1 is not contiguous
+        GGML_ASSERT(false);
+    }
+}
+
+static void ggml_compute_forward_add_at_f16_f16(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
+        struct ggml_tensor * dst,
+        size_t offset) {
+    // GGML_ASSERT(ggml_are_same_shape(src0, src1)); // TODO: assert that offset+len(src1) <= len(src0)
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    const size_t nb00 = src0->nb[0];
+    const size_t nb01 = src0->nb[1];
+
+    const size_t nb10 = src1->nb[0];
+    const size_t nb11 = src1->nb[1];
+
+    const size_t nb0 = dst->nb[0];
+    const size_t nb1 = dst->nb[1];
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F16);
+    GGML_ASSERT(dst->type  == GGML_TYPE_F16);
+
+    GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
+    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
+
+    if (nb10 == sizeof(ggml_fp16_t)) {
+        for (int j = ith; j < n; j += nth) {
+            ggml_fp16_t * dst_ptr  = (ggml_fp16_t *) ((char *) dst->data  + j*nb1 + offset);
+            ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + j*nb01 + offset);
+            for (int i = 0; i < nc; i++) {
+                ggml_fp16_t * src1_ptr = (ggml_fp16_t *) ((char *) src1->data + j*nb11 + i*nb10);
+                dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + GGML_FP16_TO_FP32(*src1_ptr));
+            }
+        }
+    }
+    else {
+        // src1 is not contiguous
+        GGML_ASSERT(false);
+    }
+}
+
+static void ggml_compute_forward_add_at_q_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
+        struct ggml_tensor * dst,
+        size_t offset) {
+    // GGML_ASSERT(ggml_are_same_shape(src0, src1)); // TODO: assert that offset+len(src1) <= len(src0)
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
+
+    //const int64_t ne10 = src1->ne[0];
+    //const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
+
+    //const int64_t ne0 = dst->ne[0];
+    //const int64_t ne1 = dst->ne[1];
+    const int64_t ne2 = dst->ne[2];
+    const int64_t ne3 = dst->ne[3];
+
+    const int nb00 = src0->nb[0];
+    const int nb01 = src0->nb[1];
+    const int nb02 = src0->nb[2];
+    const int nb03 = src0->nb[3];
+
+    const int nb10 = src1->nb[0];
+    const int nb11 = src1->nb[1];
+    const int nb12 = src1->nb[2];
+    const int nb13 = src1->nb[3];
+
+    const int nb0 = dst->nb[0];
+    const int nb1 = dst->nb[1];
+    const int nb2 = dst->nb[2];
+    const int nb3 = dst->nb[3];
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    GGML_ASSERT(ne02 == ne12);
+    GGML_ASSERT(ne03 == ne13);
+    GGML_ASSERT(ne2  == ne12);
+    GGML_ASSERT(ne3  == ne13);
+
+    const enum ggml_type type = src0->type;
+    dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q;
+    quantize_row_q_t const quantize_row_q = quantize_fns[type].quantize_row_q;
+
+    // we don't support permuted src0 or src1
+    GGML_ASSERT(nb00 == (int) GGML_TYPE_SIZE[type]);
+    GGML_ASSERT(nb10 == sizeof(float));
+
+    // dst cannot be transposed or permuted
+    GGML_ASSERT(nb0 <= nb1);
+    GGML_ASSERT(nb1 <= nb2);
+    GGML_ASSERT(nb2 <= nb3);
+
+    GGML_ASSERT(ggml_is_quantized(src0->type));
+    GGML_ASSERT(dst->type == src0->type);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
+    // total rows in src0
+    const int nr = ne01*ne02*ne03;
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    float * wdata = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith;
+
+    for (int ir = ir0; ir < ir1; ++ir) {
+        // src0 indices
+        const int i03 = ir/(ne02*ne01);
+        const int i02 = (ir - i03*ne02*ne01)/ne01;
+        const int i01 = (ir - i03*ne02*ne01 - i02*ne01);
+
+        // src1 and dst are same shape as src0 => same indices
+        const int i13 = i03;
+        const int i12 = i02;
+        const int i11 = i01;
+
+        const int i3 = i03;
+        const int i2 = i02;
+        const int i1 = i01;
+
+        void  * src0_row = (void  *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03) + offset);
+        float * src1_row = (float *) ((char *) src1->data + (i11*nb11 + i12*nb12 + i13*nb13));
+        void  * dst_row  = (void  *) ((char *)  dst->data + ( i1*nb1  +  i2*nb2  +  i3*nb3) + offset);
+
+        assert(ne00 % 32 == 0);
+
+        // unquantize row from src0 to temp buffer
+        dequantize_row_q(src0_row, wdata, ne00);
+        // add src1
+        ggml_vec_acc_f32(ne00, wdata, src1_row);
+        // quantize row to dst
+        quantize_row_q(wdata, dst_row, ne00);
+    }
+}
+
+static void ggml_compute_forward_add_at(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
+        struct ggml_tensor * dst) {
+    size_t offset;
+    memcpy(&offset, dst->padding, sizeof(size_t));
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_add_at_f32(params, src0, src1, dst, offset);
+            } break;
+        case GGML_TYPE_F16:
+            {
+                if (src1->type == GGML_TYPE_F16) {
+                    ggml_compute_forward_add_at_f16_f16(params, src0, src1, dst, offset);
+                }
+                else if (src1->type == GGML_TYPE_F32) {
+                    ggml_compute_forward_add_at_f16_f32(params, src0, src1, dst, offset);
+                }
+                else {
+                    GGML_ASSERT(false);
+                }
+            } break;
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_Q4_2:
+        case GGML_TYPE_Q4_3:
+            {
+                ggml_compute_forward_add_at_q_f32(params, src0, src1, dst, offset);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
 // ggml_compute_forward_sub
 
 static void ggml_compute_forward_sub_f32(
@@ -9220,44 +9599,45 @@ static void ggml_compute_forward_soft_max_f32(
     const int ir1 = MIN(ir0 + dr, nr);
 
     for (int i1 = ir0; i1 < ir1; i1++) {
-        float *p = (float *)((char *) dst->data + i1*dst->nb[1]);
+        float *sp = (float *)((char *) src0->data + i1*src0->nb[1]);
+        float *dp = (float *)((char *) dst->data  + i1*dst->nb[1]);
 
 #ifndef NDEBUG
         for (int i = 0; i < nc; ++i) {
-            //printf("p[%d] = %f\n", i, p[i]);
-            assert(!isnan(p[i]));
+            //printf("sp[%d] = %f\n", i, sp[i]);
+            assert(!isnan(sp[i]));
         }
 #endif
 
         float max = -INFINITY;
-        ggml_vec_max_f32(nc, &max, p);
+        ggml_vec_max_f32(nc, &max, sp);
 
         ggml_float sum = 0.0;
 
         uint16_t scvt;
         for (int i = 0; i < nc; i++) {
-            //printf("p[%3d] = %8.4f\n", i, p[i]);
-            if (p[i] == -INFINITY) {
-                p[i] = 0.0f;
+            //printf("sp[%3d] = %8.4f\n", i, sp[i]);
+            if (sp[i] == -INFINITY) {
+                dp[i] = 0.0f;
             } else {
                 //const float val = (p[i] == -INFINITY) ? 0.0 : exp(p[i] - max);
-                ggml_fp16_t s = GGML_FP32_TO_FP16(p[i] - max);
+                ggml_fp16_t s = GGML_FP32_TO_FP16(sp[i] - max);
                 memcpy(&scvt, &s, sizeof(scvt));
                 const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]);
                 sum += (ggml_float)val;
-                p[i] = val;
+                dp[i] = val;
             }
         }
 
         assert(sum > 0.0);
 
         sum = 1.0/sum;
-        ggml_vec_scale_f32(nc, p, sum);
+        ggml_vec_scale_f32(nc, dp, sum);
 
 #ifndef NDEBUG
         for (int i = 0; i < nc; ++i) {
-            assert(!isnan(p[i]));
-            assert(!isinf(p[i]));
+            assert(!isnan(dp[i]));
+            assert(!isinf(dp[i]));
         }
 #endif
     }
@@ -10956,6 +11336,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_add(params, tensor->src0, tensor->src1, tensor);
             } break;
+        case GGML_OP_ADD_AT:
+            {
+                ggml_compute_forward_add_at(params, tensor->src0, tensor->src1, tensor);
+            } break;
         case GGML_OP_SUB:
            {
                ggml_compute_forward_sub(params, tensor->src0, tensor->src1, tensor);
            } break;
@@ -11140,6 +11524,28 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                     src1->grad = ggml_add_impl(ctx, src1->grad, tensor->grad, inplace);
                 }
             } break;
+        case GGML_OP_ADD_AT:
+            {
+                if (src0->grad) {
+                    src0->grad = ggml_add_impl(ctx, src0->grad, tensor->grad, inplace);
+                }
+                if (src1->grad) {
+                    size_t offset;
+                    memcpy(&offset, tensor->padding, sizeof(size_t));
+                    src1->grad =
+                        ggml_add_impl(ctx,
+                                src1->grad,
+                                ggml_view_3d(ctx,
+                                    tensor->grad,
+                                    tensor->ne[0],
+                                    tensor->ne[1],
+                                    tensor->ne[2],
+                                    tensor->nb[1],
+                                    tensor->nb[2],
+                                    offset),
+                                inplace);
+                }
+            } break;
         case GGML_OP_SUB:
             {
                 if (src0->grad) {
@@ -11284,6 +11690,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             } break;
         case GGML_OP_SILU:
             {
+                // necessary for llama
                 GGML_ASSERT(false); // TODO: not implemented
             } break;
         case GGML_OP_NORM:
             {
@@ -11292,31 +11699,83 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             } break;
         case GGML_OP_RMS_NORM:
             {
+                // necessary for llama
                 GGML_ASSERT(false); // TODO: not implemented
             } break;
         case GGML_OP_MUL_MAT:
             {
+                // https://cs231n.github.io/optimization-2/#staged
+                // # forward pass
+                // s0 = np.random.randn(5, 10)
+                // s1 = np.random.randn(10, 3)
+                // t = s0.dot(s1)
+
+                // # now suppose we had the gradient on t from above in the circuit
+                // dt = np.random.randn(*t.shape) # same shape as t
+                // ds0 = dt.dot(s1.T) #.T gives the transpose of the matrix
+                // ds1 = s0.T.dot(dt)
+
+                // tensor.T == (src0 @ src1.T).T
+                // tensor.shape [m,p]
+                // src0.shape   [n,m]
+                // src1.shape   [n,p]
+
+                // necessary for llama
                 if (src0->grad) {
-                    // TODO: this requires outer product - ggml_out_prod(ctx, src1, tensor->grad);
-                    GGML_ASSERT(false);
+                    src0->grad =
+                        ggml_add_impl(ctx,
+                                src0->grad,
+                                // ds0 = dt.dot(s1.T)
+                                // ggml_out_prod(ctx, // [n,m]
+                                //     src1,          // [n,p]
+                                //     tensor->grad), // [m,p]
+                                // for now just using A*B==(B.T*A.T).T
+                                ggml_cont(ctx,            // [n,m] not necessary TODO: investigate influence on speed
+                                    ggml_transpose(ctx,   // [n,m]
+                                        ggml_mul_mat(ctx, // [m,n]
+                                            ggml_cont(ctx, ggml_transpose(ctx, tensor->grad)), // [p,m]
+                                            ggml_cont(ctx, ggml_transpose(ctx, src1))))),      // [p,n]
+                                inplace);
                 }
                 if (src1->grad) {
                     src1->grad =
                         ggml_add_impl(ctx,
                                 src1->grad,
-                                ggml_mul_mat(ctx,
-                                    ggml_cont(ctx, ggml_transpose(ctx, src0)),
-                                    tensor->grad),
+                                // ds1 = s0.T.dot(dt):
+                                ggml_mul_mat(ctx,                              // [n,p]
+                                    ggml_cont(ctx, ggml_transpose(ctx, src0)), // [m,n]
+                                    tensor->grad),                             // [m,p]
                                 inplace);
                 }
             } break;
         case GGML_OP_SCALE:
             {
-                GGML_ASSERT(false); // TODO: not implemented
+                // necessary for llama
+                if (src0->grad) {
+                    src0->grad =
+                        ggml_add_impl(ctx,
+                                src0->grad,
+                                ggml_scale_impl(ctx, tensor->grad, src1, false),
+                                inplace);
+                }
+                if (src1->grad) {
+                    src1->grad =
+                        ggml_add_impl(ctx,
+                                src1->grad,
+                                ggml_sum(ctx, ggml_mul_impl(ctx, tensor->grad, src0, false)),
+                                inplace);
+                }
             } break;
         case GGML_OP_CPY:
             {
-                GGML_ASSERT(false); // TODO: not implemented
+                // necessary for llama
+                // cpy overwrites the values of src1 with src0 and returns a view of src1,
+                // so the gradient flows only to src0
+                if (src0->grad) {
+                    src0->grad = ggml_add_impl(ctx, src0->grad, tensor->grad, inplace);
+                }
+                if (src1->grad) {
+                    // noop: the old values of src1 do not influence the result
+                }
             } break;
         case GGML_OP_CONT:
             {
@@ -11324,34 +11783,78 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             } break;
         case GGML_OP_RESHAPE:
             {
-                GGML_ASSERT(false); // TODO: not implemented
+                // necessary for llama
+                if (src0->grad) {
+                    src0->grad =
+                        ggml_add_impl(ctx, src0->grad,
+                            ggml_reshape(ctx, tensor->grad, src0->grad),
+                            inplace);
+                }
+                if (src1->grad) {
+                    // noop
+                }
             } break;
         case GGML_OP_VIEW:
             {
-                GGML_ASSERT(false); // not supported
+                // necessary for llama
+                if (src0->grad) {
+                    size_t offset;
+                    memcpy(&offset, tensor->padding, sizeof(size_t));
+                    src0->grad = ggml_add_at_impl(ctx, src0->grad, tensor->grad, offset, inplace);
+                }
             } break;
         case GGML_OP_PERMUTE:
             {
-                GGML_ASSERT(false); // TODO: not implemented
+                // necessary for llama
+                if (src0->grad) {
+                    int axis0 = tensor->padding[0] & 0x3;
+                    int axis1 = tensor->padding[1] & 0x3;
+                    int axis2 = tensor->padding[2] & 0x3;
+                    int axis3 = tensor->padding[3] & 0x3;
+                    int axes_backward[4] = {0,0,0,0};
+                    axes_backward[axis0] = 0;
+                    axes_backward[axis1] = 1;
+                    axes_backward[axis2] = 2;
+                    axes_backward[axis3] = 3;
+                    src0->grad =
+                        ggml_add_impl(ctx, src0->grad,
+                            ggml_permute(ctx,
+                                tensor->grad,
+                                axes_backward[0],
+                                axes_backward[1],
+                                axes_backward[2],
+                                axes_backward[3]),
+                            inplace);
+                }
             } break;
         case GGML_OP_TRANSPOSE:
             {
-                GGML_ASSERT(false); // TODO: not implemented
+                // necessary for llama
+                if (src0->grad) {
+                    src0->grad =
+                        ggml_add_impl(ctx, src0->grad,
+                            ggml_transpose(ctx, tensor->grad),
+                            inplace);
+                }
             } break;
         case GGML_OP_GET_ROWS:
             {
+                // necessary for llama
                 GGML_ASSERT(false); // TODO: not implemented
             } break;
         case GGML_OP_DIAG_MASK_INF:
             {
+                // necessary for llama
                 GGML_ASSERT(false); // TODO: not implemented
             } break;
        case GGML_OP_SOFT_MAX:
            {
+                // necessary for llama
                 GGML_ASSERT(false); // TODO: not implemented
             } break;
         case GGML_OP_ROPE:
             {
+                // necessary for llama
                 GGML_ASSERT(false); // TODO: not implemented
             } break;
         case GGML_OP_CONV_1D_1S:
@@ -11715,6 +12218,18 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                         cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src0->ne[0] * n_threads;
                     }
 
+                    work_size = MAX(work_size, cur);
+                } break;
+            case GGML_OP_ADD_AT:
+                {
+                    node->n_tasks = n_threads;
+
+                    size_t cur = 0;
+
+                    if (ggml_is_quantized(node->src0->type)) {
+                        cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src1->ne[0] * n_threads;
+                    }
+
                     work_size = MAX(work_size, cur);
                 } break;
             case GGML_OP_SUB:
diff --git a/ggml.h b/ggml.h
index d6feacd78..4843fd6fc 100644
--- a/ggml.h
+++ b/ggml.h
@@ -252,6 +252,7 @@ extern "C" {
 
         GGML_OP_DUP,
         GGML_OP_ADD,
+        GGML_OP_ADD_AT,
         GGML_OP_SUB,
         GGML_OP_MUL,
         GGML_OP_DIV,
@@ -480,6 +481,18 @@ extern "C" {
             struct ggml_tensor * a,
             struct ggml_tensor * b);
 
+    GGML_API struct ggml_tensor * ggml_add_at(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            size_t offset);
+
+    GGML_API struct ggml_tensor * ggml_add_at_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            size_t offset);
+
     GGML_API struct ggml_tensor * ggml_sub(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
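
For reference, the new backward passes can be exercised with a small gradient check in the style of what later became tests/test-grad0.c. The sketch below is not part of the patch; it assumes only the public ggml API of this revision (ggml_set_param, ggml_build_forward, ggml_build_backward, ggml_graph_reset, ggml_graph_compute), and the tensor sizes and values are illustrative:

// check gradients of f = sum(scale(x, c)):
// expected df/dx[i] = c and df/dc = sum(x)
#include "ggml.h"
#include <stdio.h>

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
    struct ggml_tensor * c = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
    ggml_set_param(ctx, x);
    ggml_set_param(ctx, c);

    // f = sum(scale(x, c)), using the newly implemented GGML_OP_SCALE backward
    struct ggml_tensor * f = ggml_sum(ctx, ggml_scale(ctx, x, c));

    struct ggml_cgraph gf = ggml_build_forward (f);
    struct ggml_cgraph gb = ggml_build_backward(ctx, &gf, false);

    for (int i = 0; i < 4; ++i) {
        ggml_set_f32_1d(x, i, (float)(i + 1)); // x = [1 2 3 4]
    }
    ggml_set_f32(c, 2.0f);

    ggml_graph_compute(ctx, &gf);
    printf("f = %f\n", (double) ggml_get_f32_1d(f, 0)); // 2*(1+2+3+4) = 20

    // reset gradients, seed df/df = 1 and run the backward graph
    ggml_graph_reset(&gf);
    ggml_set_f32(f->grad, 1.0f);
    ggml_graph_compute(ctx, &gb);

    for (int i = 0; i < 4; ++i) {
        printf("df/dx[%d] = %f\n", i, (double) ggml_get_f32_1d(x->grad, i)); // 2
    }
    printf("df/dc = %f\n", (double) ggml_get_f32_1d(c->grad, 0)); // 1+2+3+4 = 10

    ggml_free(ctx);
    return 0;
}

With x = [1 2 3 4] and c = 2 the expected results are f = 20, df/dx = [2 2 2 2] (the incoming gradient scaled by c) and df/dc = sum(tensor->grad * x) = 10; note that the reduction over all elements is why the scale backward uses ggml_sum rather than ggml_mean.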