finetune : add -ngl parameter (#3762)
* Add '-ngl' support to finetune.cpp

* Add fprintf in ggml_cuda_op_add

When I tried CUDA offloading during finetuning following the readme, I got an assert here. This probably isn't an important case because inference later gives a warning saying you should use f16 or f32 instead when using lora

* Add 'finetune.sh', which currently fails when using GPU "error: operator (): Finetuning on tensors with type 'f16' is not yet supported"

* tweak finetune.sh

* Suppress some warnings in ggml.c

* Add f16 implementation to ggml_compute_forward_add_f16_f32

* Add an f16 case to ggml_add_cast_impl and llama_build_lora_finetune_graphs

* finetune.sh: Edit comments

* Add "add_f16_f32_f32_cuda"

* Tweak an error message

* finetune.sh: Add an optional LLAMA_MODEL_DIR variable

* finetune.sh: Add an optional LLAMA_TRAINING_DIR variable

* train : minor

* tabs to spaces

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: cebtenzzre <cebtenzzre@gmail.com>
This commit is contained in:
parent
f0e209324a
commit
73bdcb395e
8 changed files with 106 additions and 15 deletions
49
ggml.c
49
ggml.c
|
@@ -3153,7 +3153,7 @@ static struct ggml_tensor * ggml_add_cast_impl(
|
|||
// TODO: support less-strict constraint
|
||||
// GGML_ASSERT(ggml_can_repeat(b, a));
|
||||
GGML_ASSERT(ggml_can_repeat_rows(b, a));
|
||||
GGML_ASSERT(ggml_is_quantized(a->type)); // currently only supported for quantized input
|
||||
GGML_ASSERT(ggml_is_quantized(a->type) || a->type == GGML_TYPE_F16); // currently only supported for quantized input and f16
|
||||
|
||||
bool is_node = false;
|
||||
|
||||
|
@@ -6927,9 +6927,15 @@ static void ggml_compute_forward_add_f16_f32(
|
|||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F16);
|
||||
|
||||
GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
|
||||
if (dst->type == GGML_TYPE_F32) {
|
||||
GGML_ASSERT( nb0 == sizeof(float));
|
||||
}
|
||||
else {
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
|
||||
}
|
||||
|
||||
GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
|
||||
|
||||
// rows per thread
|
||||
|
@@ -6940,18 +6946,35 @@ static void ggml_compute_forward_add_f16_f32(
|
|||
const int ir1 = MIN(ir0 + dr, nr);
|
||||
|
||||
if (nb10 == sizeof(float)) {
|
||||
for (int ir = ir0; ir < ir1; ++ir) {
|
||||
// src0, src1 and dst are same shape => same indices
|
||||
const int i3 = ir/(ne2*ne1);
|
||||
const int i2 = (ir - i3*ne2*ne1)/ne1;
|
||||
const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
|
||||
if (dst->type == GGML_TYPE_F16) {
|
||||
for (int ir = ir0; ir < ir1; ++ir) {
|
||||
// src0, src1 and dst are same shape => same indices
|
||||
const int i3 = ir/(ne2*ne1);
|
||||
const int i2 = (ir - i3*ne2*ne1)/ne1;
|
||||
const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
|
||||
|
||||
ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1);
|
||||
ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
|
||||
float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11);
|
||||
ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1);
|
||||
ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
|
||||
float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11);
|
||||
|
||||
for (int i = 0; i < ne0; i++) {
|
||||
dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + src1_ptr[i]);
|
||||
for (int i = 0; i < ne0; i++) {
|
||||
dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + src1_ptr[i]);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (int ir = ir0; ir < ir1; ++ir) {
|
||||
// src0, src1 and dst are same shape => same indices
|
||||
const int i3 = ir/(ne2*ne1);
|
||||
const int i2 = (ir - i3*ne2*ne1)/ne1;
|
||||
const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
|
||||
|
||||
float * dst_ptr = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1);
|
||||
ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
|
||||
float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11);
|
||||
|
||||
for (int i = 0; i < ne0; i++) {
|
||||
dst_ptr[i] = GGML_FP16_TO_FP32(src0_ptr[i]) + src1_ptr[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue