de-duplicate ggml_compute_forward_dup code, taking care of contiguous tensors of the same type.

With this we can duplicate tensors of any type as long as they are contiguous.
parent 38675e537c
commit c1a8893de3

1 changed file with 36 additions and 22 deletions

 ggml.c | 58 ++++++++++++++++++++++++++++++++++++----------------------
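The idea behind the change: once src0 and dst are both contiguous and of the same type, duplication no longer depends on the element type at all; it reduces to a flat byte copy. A minimal standalone sketch of that idea (plain C with a hypothetical tensor struct, not ggml's actual API):

```c
#include <assert.h>
#include <stdio.h>
#include <string.h>

// Hypothetical stand-in for a contiguous tensor; ggml's real struct
// also carries dimensions, byte strides (nb) and a type enum.
struct tensor {
    void * data;
    int    ne;        // number of elements
    size_t type_size; // bytes per element
};

// Same shape of checks as ggml_compute_forward_dup_same_cont:
// equal element counts and equal types, then one flat memcpy.
static void dup_same_cont(const struct tensor * src, struct tensor * dst) {
    assert(dst->ne == src->ne);
    assert(dst->type_size == src->type_size);
    memcpy(dst->data, src->data, (size_t) src->ne * src->type_size);
}

int main(void) {
    float a[4] = {1.0f, 2.0f, 3.0f, 4.0f};
    float b[4] = {0};
    struct tensor src = { a, 4, sizeof(float) };
    struct tensor dst = { b, 4, sizeof(float) };
    dup_same_cont(&src, &dst);
    printf("%.1f %.1f %.1f %.1f\n", b[0], b[1], b[2], b[3]); // 1.0 2.0 3.0 4.0
    return 0;
}
```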
@@ -6638,6 +6638,36 @@ void ggml_set_param(
 
 // ggml_compute_forward_dup
 
+static void ggml_compute_forward_dup_same_cont(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        struct ggml_tensor * dst) {
+    GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
+    GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
+    GGML_ASSERT(src0->type == dst->type);
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    const size_t nb00 = src0->nb[0];
+    const size_t nb0 = dst->nb[0];
+
+    const int ith = params->ith; // thread index
+    const int nth = params->nth; // number of threads
+
+    // parallelize by elements
+    const int ne = ggml_nelements(dst);
+    const int dr = (ne + nth - 1) / nth;
+    const int ie0 = dr * ith;
+    const int ie1 = MIN(ie0 + dr, ne);
+
+    memcpy(
+        ((char *) dst->data + ie0*nb0),
+        ((char *) src0->data + ie0*nb00),
+        (ie1 - ie0) * GGML_TYPE_SIZE[src0->type]);
+
+}
 static void ggml_compute_forward_dup_f16(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
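The helper splits the flat element range across threads with a ceiling division: every thread gets at most dr elements and the last range is clamped to ne. A small self-contained check of that arithmetic (the values of ne and nth are illustrative, not from the commit):

```c
#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

// Prints the half-open element range [ie0, ie1) each thread copies,
// using the same ceil-division split as ggml_compute_forward_dup_same_cont.
int main(void) {
    const int ne  = 10; // total elements (illustrative value)
    const int nth = 4;  // number of threads (illustrative value)

    for (int ith = 0; ith < nth; ith++) {
        const int dr  = (ne + nth - 1) / nth; // elements per thread, rounded up
        const int ie0 = dr * ith;             // first element for this thread
        const int ie1 = MIN(ie0 + dr, ne);    // clamp the last range to ne
        printf("thread %d: [%d, %d)\n", ith, ie0, ie1);
    }
    return 0;
}
```

With ne = 10 and nth = 4 this prints [0, 3), [3, 6), [6, 9) and [9, 10): the ranges are disjoint and cover every element exactly once.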
@@ -6672,17 +6702,7 @@ static void ggml_compute_forward_dup_f16(
     const int nth = params->nth; // number of threads
 
     if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
-        // parallelize by elements
-        const int ne = ggml_nelements(dst);
-        const int dr = (ne + nth - 1) / nth;
-        const int ie0 = dr * ith;
-        const int ie1 = MIN(ie0 + dr, ne);
-
-        memcpy(
-            ((char *) dst->data + ie0*nb0),
-            ((char *) src0->data + ie0*nb00),
-            (ie1 - ie0) * GGML_TYPE_SIZE[src0->type]);
-
+        ggml_compute_forward_dup_same_cont(params, src0, dst);
         return;
     }
 
@@ -6971,17 +6991,7 @@ static void ggml_compute_forward_dup_f32(
     const int nth = params->nth; // number of threads
 
     if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
-        // parallelize by elements
-        const int ne = ggml_nelements(dst);
-        const int dr = (ne + nth - 1) / nth;
-        const int ie0 = dr * ith;
-        const int ie1 = MIN(ie0 + dr, ne);
-
-        memcpy(
-            ((char *) dst->data + ie0*nb0),
-            ((char *) src0->data + ie0*nb00),
-            (ie1 - ie0) * GGML_TYPE_SIZE[src0->type]);
-
+        ggml_compute_forward_dup_same_cont(params, src0, dst);
         return;
     }
 
@@ -7236,6 +7246,10 @@ static void ggml_compute_forward_dup(
        const struct ggml_compute_params * params,
        const struct ggml_tensor * src0,
        struct ggml_tensor * dst) {
+    if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
+        ggml_compute_forward_dup_same_cont(params, src0, dst);
+        return;
+    }
    switch (src0->type) {
        case GGML_TYPE_F16:
            {
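Since ggml_compute_forward_dup now short-circuits before the type switch, every contiguous same-type duplication, quantized types included, ends up in the thread-partitioned memcpy. A self-contained simulation of those per-thread slice copies (type_size stands in for GGML_TYPE_SIZE[src0->type], and the loop plays the role of the thread pool; names and values are illustrative):

```c
#include <assert.h>
#include <stdio.h>
#include <string.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

int main(void) {
    enum { NE = 10, NTH = 4 };
    const size_t type_size = sizeof(float); // contiguous f32 example

    float src[NE], dst[NE];
    for (int i = 0; i < NE; i++) { src[i] = (float) i; dst[i] = -1.0f; }

    // Each "thread" ith copies its half-open element range [ie0, ie1),
    // offset in bytes exactly as the helper does with nb0 and nb00.
    for (int ith = 0; ith < NTH; ith++) {
        const int dr  = (NE + NTH - 1) / NTH;
        const int ie0 = dr * ith;
        const int ie1 = MIN(ie0 + dr, NE);
        if (ie0 < ie1) { // skip empty ranges (possible when NE < NTH)
            memcpy((char *) dst + ie0*type_size,
                   (char *) src + ie0*type_size,
                   (size_t)(ie1 - ie0) * type_size);
        }
    }

    // The disjoint slices jointly reproduce the whole buffer.
    for (int i = 0; i < NE; i++) { assert(dst[i] == src[i]); }
    printf("all %d elements duplicated\n", NE);
    return 0;
}
```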