llama : keep same graph topology even when n_outputs == 0
This commit is contained in:
parent 711b0bcb11
commit d100502251
2 changed files with 12 additions and 30 deletions
ggml.c (19 changes)
@@ -2542,6 +2542,11 @@ static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
         tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }
 
+static inline bool ggml_is_empty(const struct ggml_tensor * tensor) {
+    // nb[3] depends on the previous nb and ne
+    return tensor->nb[3] == 0 || tensor->ne[3] == 0;
+}
+
 bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
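Why checking only `nb[3]` and `ne[3]` is enough: as the in-code comment says, `nb[3]` depends on the previous strides and extents, so for a contiguous tensor a zero extent in any of the first three dimensions propagates into `nb[3]`, while an empty last dimension is caught by `ne[3]` directly. A minimal standalone sketch of the predicate (toy struct and shape chosen for illustration, not the real `ggml_tensor`):

```c
#include <stdbool.h>
#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

struct toy_tensor { int64_t ne[4]; size_t nb[4]; };

// same check as the new ggml_is_empty, on a toy type
static bool toy_is_empty(const struct toy_tensor * t) {
    return t->nb[3] == 0 || t->ne[3] == 0;
}

int main(void) {
    // e.g. a get_rows result that selected 0 rows: ne = {4096, 0, 1, 1}
    struct toy_tensor t = { .ne = {4096, 0, 1, 1} };
    t.nb[0] = sizeof(float);
    t.nb[1] = t.nb[0]*t.ne[0];
    t.nb[2] = t.nb[1]*t.ne[1]; // ne[1] == 0, so nb[2] == 0
    t.nb[3] = t.nb[2]*t.ne[2]; // and nb[3] stays 0
    printf("empty: %d\n", toy_is_empty(&t)); // prints: empty: 1
    return 0;
}
```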
@@ -2556,11 +2561,11 @@ bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
 static inline bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
-    return
-        (t1->ne[0]%t0->ne[0] == 0) &&
-        (t1->ne[1]%t0->ne[1] == 0) &&
-        (t1->ne[2]%t0->ne[2] == 0) &&
-        (t1->ne[3]%t0->ne[3] == 0);
+    return ggml_is_empty(t0) ||
+        ((t1->ne[0]%t0->ne[0] == 0) &&
+         (t1->ne[1]%t0->ne[1] == 0) &&
+         (t1->ne[2]%t0->ne[2] == 0) &&
+         (t1->ne[3]%t0->ne[3] == 0));
 }
 
 static inline bool ggml_can_repeat_rows(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
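The `ggml_is_empty(t0)` short-circuit is load-bearing: now that empty tensors can flow through graphs, some `t0->ne[i]` may legitimately be 0, and `%` with a zero divisor is undefined behavior in C. A minimal sketch of the guard on plain integers (hypothetical helper, not ggml code):

```c
#include <stdio.h>

// can_tile(n0, n1): can a dimension of extent n0 be repeated to extent n1?
// Checking n0 == 0 first avoids the undefined-behavior case `n1 % 0`.
static int can_tile(int n0, int n1) {
    return n0 == 0 || (n1 % n0 == 0);
}

int main(void) {
    printf("%d\n", can_tile(0, 7)); // 1: an empty source "repeats" trivially
    printf("%d\n", can_tile(3, 9)); // 1: 9 is a multiple of 3
    printf("%d\n", can_tile(3, 7)); // 0
    return 0;
}
```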
@@ -16047,7 +16052,7 @@ static void ggml_compute_forward_cross_entropy_loss_back(
 static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
     GGML_ASSERT(params);
 
-    if (tensor->op == GGML_OP_NONE) {
+    if (tensor->op == GGML_OP_NONE || ggml_is_empty(tensor)) {
         return;
     }
 
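This is the piece that makes empty nodes free at run time: an empty tensor is dispatched exactly like `GGML_OP_NONE`, i.e. not at all. A toy sketch of the dispatch guard (hypothetical types and shapes, not the real ggml dispatch):

```c
#include <stdbool.h>
#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

enum toy_op { TOY_OP_NONE, TOY_OP_GET_ROWS };
struct toy_node { enum toy_op op; int64_t ne[4]; size_t nb[4]; };

static bool toy_is_empty(const struct toy_node * t) {
    return t->nb[3] == 0 || t->ne[3] == 0;
}

static void toy_compute_forward(const struct toy_node * t) {
    if (t->op == TOY_OP_NONE || toy_is_empty(t)) {
        return; // empty node: stays in the graph, costs nothing
    }
    printf("running kernel for op %d\n", (int) t->op);
}

int main(void) {
    struct toy_node skipped = { .op = TOY_OP_GET_ROWS, .ne = {4096, 0, 1, 1}, .nb = {0, 0, 0, 0} };
    struct toy_node ran     = { .op = TOY_OP_GET_ROWS, .ne = {4096, 2, 1, 1}, .nb = {4, 16384, 32768, 32768} };
    toy_compute_forward(&skipped); // prints nothing
    toy_compute_forward(&ran);     // prints: running kernel for op 1
    return 0;
}
```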
@@ -18011,7 +18016,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_threads) {
             {
                 // FIXME: the cost of launching additional threads decreases performance with GPU offloading
                 //n_tasks = MIN(n_threads, ggml_nelements(node->src[1]));
-                n_tasks = MIN(n_cur_threads, ggml_nelements(node->src[1]));
+                n_tasks = MIN(n_cur_threads, MAX(ggml_nelements(node->src[1]), 1));
             } break;
         case GGML_OP_SCALE:
         case GGML_OP_SET:
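With empty tensors now reaching the scheduler, `ggml_nelements(node->src[1])` can be 0, so `MIN(n_cur_threads, 0)` would produce `n_tasks == 0`; clamping with `MAX(..., 1)` guarantees at least one task, presumably keeping the per-op work split well defined in the threading code. A small before/after sketch of the arithmetic:

```c
#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))

int main(void) {
    int       n_cur_threads = 8;
    long long nelements     = 0; // an empty src tensor, now a legal input

    // before: n_tasks could become 0 for an empty tensor
    int before = MIN(n_cur_threads, (int) nelements);
    // after: always at least one task
    int after  = MIN(n_cur_threads, MAX((int) nelements, 1));

    printf("before=%d after=%d\n", before, after); // prints: before=0 after=1
    return 0;
}
```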
llama.cpp (23 changes)
@@ -5894,7 +5894,6 @@ struct llm_build_context {
 
         if (il == n_layer - 1) {
             // skip computing output for unused tokens
-            if (n_outputs == 0) { return gf; }
             struct ggml_tensor * inp_out_ids = build_inp_out_ids();
             n_tokens = n_outputs;
             cur = ggml_get_rows(ctx0, cur, inp_out_ids);
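This is the core of the commit, repeated once per architecture in the hunks below: instead of returning a truncated graph when a batch needs no outputs, the builder always emits the `ggml_get_rows(cur, inp_out_ids)` node; with zero output ids that node is empty and, thanks to the ggml.c changes above, costs nothing. Because the node count and wiring no longer depend on `n_outputs`, backend schedulers can reuse their existing graph plan instead of re-splitting and re-allocating whenever `n_outputs` flips between 0 and non-zero. A toy node-count sketch (hypothetical numbers, not the real builder):

```c
#include <stdio.h>

// Hypothetical builder: returns how many nodes the graph would contain.
static int node_count(int n_layer, int n_outputs, int keep_topology) {
    int nodes = 0;
    for (int il = 0; il < n_layer; ++il) {
        nodes += 10; // pretend each layer contributes 10 ops
        if (il == n_layer - 1) {
            if (!keep_topology && n_outputs == 0) {
                return nodes; // old behavior: bail out, graph shape changes
            }
            nodes += 1; // new behavior: always emit ggml_get_rows(inp_out_ids)
        }
    }
    return nodes + 2; // final norm + output projection
}

int main(void) {
    printf("old: n_outputs=0 -> %d, n_outputs=1 -> %d\n",
           node_count(32, 0, 0), node_count(32, 1, 0)); // 320 vs 323
    printf("new: n_outputs=0 -> %d, n_outputs=1 -> %d\n",
           node_count(32, 0, 1), node_count(32, 1, 1)); // 323 vs 323
    return 0;
}
```

All of the remaining llama.cpp hunks make the same one-line removal in each architecture's build function.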
@@ -6082,7 +6081,6 @@ struct llm_build_context {
 
         if (il == n_layer - 1) {
             // skip computing output for unused tokens
-            if (n_outputs == 0) { return gf; }
             struct ggml_tensor * inp_out_ids = build_inp_out_ids();
             cur = ggml_get_rows(ctx0, cur, inp_out_ids);
             inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
@@ -6205,7 +6203,6 @@ struct llm_build_context {
 
         if (il == n_layer - 1) {
             // skip computing output for unused tokens
-            if (n_outputs == 0) { return gf; }
             struct ggml_tensor * inp_out_ids = build_inp_out_ids();
             cur = ggml_get_rows(ctx0, cur, inp_out_ids);
             inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
@@ -6308,7 +6305,6 @@ struct llm_build_context {
 
         if (il == n_layer - 1) {
             // skip computing output for unused tokens
-            if (n_outputs == 0) { return gf; }
             struct ggml_tensor * inp_out_ids = build_inp_out_ids();
             cur = ggml_get_rows(ctx0, cur, inp_out_ids);
             inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
@@ -6513,7 +6509,6 @@ struct llm_build_context {
 
         if (il == n_layer - 1) {
             // skip computing output for unused tokens
-            if (n_outputs == 0) { return gf; }
             struct ggml_tensor * inp_out_ids = build_inp_out_ids();
             cur = ggml_get_rows(ctx0, cur, inp_out_ids);
             residual = ggml_get_rows(ctx0, residual, inp_out_ids);
@@ -6610,7 +6605,6 @@ struct llm_build_context {
 
         if (il == n_layer - 1) {
             // skip computing output for unused tokens
-            if (n_outputs == 0) { return gf; }
             struct ggml_tensor * inp_out_ids = build_inp_out_ids();
             cur = ggml_get_rows(ctx0, cur, inp_out_ids);
             inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
@@ -6775,7 +6769,6 @@ struct llm_build_context {
 
         if (il == n_layer - 1 && pooling_type == LLAMA_POOLING_TYPE_NONE) {
             // skip computing output for unused tokens
-            if (n_outputs == 0) { return gf; }
             struct ggml_tensor * inp_out_ids = build_inp_out_ids();
             cur = ggml_get_rows(ctx0, cur, inp_out_ids);
             inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
@@ -6905,7 +6898,6 @@ struct llm_build_context {
 
         if (il == n_layer - 1) {
             // skip computing output for unused tokens
-            if (n_outputs == 0) { return gf; }
             struct ggml_tensor * inp_out_ids = build_inp_out_ids();
             cur = ggml_get_rows(ctx0, cur, inp_out_ids);
             inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
@@ -7011,7 +7003,6 @@ struct llm_build_context {
 
         if (il == n_layer - 1) {
             // skip computing output for unused tokens
-            if (n_outputs == 0) { return gf; }
             struct ggml_tensor * inp_out_ids = build_inp_out_ids();
             cur = ggml_get_rows(ctx0, cur, inp_out_ids);
             inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
@@ -7132,7 +7123,6 @@ struct llm_build_context {
 
         if (il == n_layer - 1) {
             // skip computing output for unused tokens
-            if (n_outputs == 0) { return gf; }
             struct ggml_tensor * inp_out_ids = build_inp_out_ids();
             cur = ggml_get_rows(ctx0, cur, inp_out_ids);
             inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
@@ -7246,7 +7236,6 @@ struct llm_build_context {
 
         if (il == n_layer - 1) {
             // skip computing output for unused tokens
-            if (n_outputs == 0) { return gf; }
             struct ggml_tensor * inp_out_ids = build_inp_out_ids();
             cur = ggml_get_rows(ctx0, cur, inp_out_ids);
             inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
@@ -7366,7 +7355,6 @@ struct llm_build_context {
 
         if (il == n_layer - 1) {
             // skip computing output for unused tokens
-            if (n_outputs == 0) { return gf; }
             struct ggml_tensor * inp_out_ids = build_inp_out_ids();
             cur = ggml_get_rows(ctx0, cur, inp_out_ids);
             inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
@@ -7492,7 +7480,6 @@ struct llm_build_context {
 
         if (il == n_layer - 1) {
             // skip computing output for unused tokens
-            if (n_outputs == 0) { return gf; }
             struct ggml_tensor * inp_out_ids = build_inp_out_ids();
             cur = ggml_get_rows(ctx0, cur, inp_out_ids);
             inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
@@ -7598,7 +7585,6 @@ struct llm_build_context {
 
         if (il == n_layer - 1) {
             // skip computing output for unused tokens
-            if (n_outputs == 0) { return gf; }
             struct ggml_tensor * inp_out_ids = build_inp_out_ids();
             cur = ggml_get_rows(ctx0, cur, inp_out_ids);
             sa_out = ggml_get_rows(ctx0, sa_out, inp_out_ids);
@@ -7699,7 +7685,6 @@ struct llm_build_context {
 
         if (il == n_layer - 1) {
             // skip computing output for unused tokens
-            if (n_outputs == 0) { return gf; }
             struct ggml_tensor * inp_out_ids = build_inp_out_ids();
             cur = ggml_get_rows(ctx0, cur, inp_out_ids);
             inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
@@ -7807,7 +7792,6 @@ struct llm_build_context {
 
         if (il == n_layer - 1) {
             // skip computing output for unused tokens
-            if (n_outputs == 0) { return gf; }
             struct ggml_tensor * inp_out_ids = build_inp_out_ids();
             cur = ggml_get_rows(ctx0, cur, inp_out_ids);
             inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
@@ -7924,7 +7908,6 @@ struct llm_build_context {
 
         if (il == n_layer - 1) {
             // skip computing output for unused tokens
-            if (n_outputs == 0) { return gf; }
             struct ggml_tensor * inp_out_ids = build_inp_out_ids();
             cur = ggml_get_rows(ctx0, cur, inp_out_ids);
             inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
@@ -8042,7 +8025,6 @@ struct llm_build_context {
 
         if (il == n_layer - 1) {
             // skip computing output for unused tokens
-            if (n_outputs == 0) { return gf; }
             struct ggml_tensor * inp_out_ids = build_inp_out_ids();
             cur = ggml_get_rows(ctx0, cur, inp_out_ids);
             inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
@@ -8173,7 +8155,6 @@ struct llm_build_context {
 
         if (il == n_layer - 1) {
             // skip computing output for unused tokens
-            if (n_outputs == 0) { return gf; }
             struct ggml_tensor * inp_out_ids = build_inp_out_ids();
             cur = ggml_get_rows(ctx0, cur, inp_out_ids);
             inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
@@ -8295,7 +8276,6 @@ struct llm_build_context {
 
         if (il == n_layer - 1) {
             // skip computing output for unused tokens
-            if (n_outputs == 0) { return gf; }
             struct ggml_tensor * inp_out_ids = build_inp_out_ids();
             cur = ggml_get_rows(ctx0, cur, inp_out_ids);
             inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
@@ -8416,7 +8396,6 @@ struct llm_build_context {
 
         if (il == n_layer - 1) {
             // skip computing output for unused tokens
-            if (n_outputs == 0) { return gf; }
             struct ggml_tensor * inp_out_ids = build_inp_out_ids();
             cur = ggml_get_rows(ctx0, cur, inp_out_ids);
             inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
@@ -8571,7 +8550,6 @@ struct llm_build_context {
 
         if (il == n_layer - 1) {
             // skip computing output for unused tokens
-            if (n_outputs == 0) { return gf; }
             struct ggml_tensor * inp_out_ids = build_inp_out_ids();
             x = ggml_get_rows(ctx0, x, inp_out_ids);
             y = ggml_get_rows(ctx0, y, inp_out_ids);
@@ -8683,7 +8661,6 @@ struct llm_build_context {
 
         if (il == n_layer - 1) {
             // skip computing output for unused tokens
-            if (n_outputs == 0) { return gf; }
             struct ggml_tensor * inp_out_ids = build_inp_out_ids();
             cur = ggml_get_rows(ctx0, cur, inp_out_ids);
             inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);