ggml : reduce hash table reset cost (#8698)

* ggml : reduce hash table reset cost
* fix unreachable code warnings after GGML_ASSERT(false)
* GGML_ASSERT(false) -> GGML_ABORT("fatal error")
* GGML_ABORT use format string
2024-07-27 04:41:55 +02:00 · 2024-07-27 04:41:55 +02:00 · 2b1f616b20
commit 2b1f616b20
parent 01245f5b16
46 changed files with 851 additions and 754 deletions
--- a/src/llama-grammar.cpp
+++ b/src/llama-grammar.cpp
@@ -221,7 +221,7 @@ static void llama_grammar_advance_stack(
            // end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range
            // (LLAMA_GRETYPE_CHAR_ALT, LLAMA_GRETYPE_CHAR_RNG_UPPER); stack should never be left on
            // those
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
    }
 }

@@ -517,7 +517,7 @@ void llama_grammar_accept_token_impl(struct llama_grammar * grammar, const struc
                return;
            }
        }
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
    }

    const std::string & piece = vocab->cache_token_to_piece.at(token);
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -152,14 +152,14 @@ static uint8_t llama_token_to_byte(const llama_vocab & vocab, llama_token id) {
            return strtol(buf.c_str(), NULL, 16);
        }
        case LLAMA_VOCAB_TYPE_BPE: {
-            GGML_ASSERT(false);
-            return unicode_utf8_to_byte(token_data.text); // TODO: why is this here after GGML_ASSERT?
+            GGML_ABORT("fatal error");
+            //return unicode_utf8_to_byte(token_data.text); // TODO: why is this here after GGML_ASSERT?
        }
        case LLAMA_VOCAB_TYPE_WPM: {
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
        }
        default:
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
    }
 }

@@ -1396,7 +1396,7 @@ std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab,
                }
            } break;
        case LLAMA_VOCAB_TYPE_NONE:
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
    }

    return output;
@@ -1422,7 +1422,7 @@ llama_token llama_byte_to_token_impl(const llama_vocab & vocab, uint8_t ch) {
            return vocab.token_to_id.at(unicode_byte_to_utf8(ch));
        }
        default:
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
    }
 }

@@ -1606,7 +1606,7 @@ int32_t llama_token_to_piece_impl(const struct llama_vocab & vocab, llama_token
                break;
            }
            default:
-                GGML_ASSERT(false);
+                GGML_ABORT("fatal error");
        }
    }

--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -2259,8 +2259,7 @@ struct llama_hparams {
            return n_head_arr[il];
        }

-        GGML_ASSERT(false);
-        return 0;
+        GGML_ABORT("fatal error");
    }

    uint32_t n_head_kv(uint32_t il = 0) const {
@@ -2268,8 +2267,7 @@ struct llama_hparams {
            return n_head_kv_arr[il];
        }

-        GGML_ASSERT(false);
-        return 0;
+        GGML_ABORT("fatal error");
    }

    uint32_t n_ff(uint32_t il = 0) const {
@@ -2277,8 +2275,7 @@ struct llama_hparams {
            return n_ff_arr[il];
        }

-        GGML_ASSERT(false);
-        return 0;
+        GGML_ABORT("fatal error");
    }

    uint32_t n_gqa(uint32_t il = 0) const {
@@ -8072,7 +8069,7 @@ static struct ggml_tensor * llm_build_moe_ffn(
                cb(gate, "ffn_moe_gelu", il);
            } break;
        default:
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
    }

    ggml_tensor * par = ggml_mul(ctx, up, gate); // [n_ff, n_expert_used, n_tokens]
@@ -8635,8 +8632,8 @@ struct llm_build_context {
                } break;
            default:
                {
-                    GGML_ASSERT(false && "unknown pooling type");
-                } break;
+                    GGML_ABORT("unknown pooling type");
+                }
        }

        cb(cur, "result_embd_pooled", -1);
@@ -8891,7 +8888,7 @@ struct llm_build_context {
                        Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd/n_head, n_head, n_tokens);
                        break;
                    default:
-                        GGML_ASSERT(false);
+                        GGML_ABORT("fatal error");
                }
                cb(Qcur, "Qcur", il);
                cb(Kcur, "Kcur", il);
@@ -11723,7 +11720,7 @@ struct llm_build_context {
                switch (model.type) {
                    case e_model::MODEL_9B:  Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k)));   break;
                    case e_model::MODEL_27B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd / n_head))); break;
-                    default: GGML_ASSERT(false);
+                    default: GGML_ABORT("fatal error");
                };
                cb(Qcur, "Qcur_scaled", il);

@@ -13888,7 +13885,7 @@ static struct ggml_cgraph * llama_build_graph(
                result = llm.build_jais();
            } break;
        default:
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
    }

    // add on pooling layer
@@ -14687,8 +14684,8 @@ static int llama_decode_internal(
                    } break;
                case LLAMA_POOLING_TYPE_UNSPECIFIED:
                    {
-                        GGML_ASSERT(false && "unknown pooling type");
-                    } break;
+                        GGML_ABORT("unknown pooling type");
+                    }
            }
        }
        n_outputs_prev += lctx.n_outputs;
@@ -15079,7 +15076,7 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) {
    // apply K-shift if needed
    if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE && lctx.kv_self.has_shift) {
        if (lctx.model.arch == LLM_ARCH_DEEPSEEK2) { // not supported due to MLA
-            GGML_ASSERT(false && "Deepseek2 does not support K-shift");
+            GGML_ABORT("Deepseek2 does not support K-shift");
        }

        {
@@ -15218,7 +15215,7 @@ static void llama_tensor_dequantize_internal(
        } else if (ggml_is_quantized(tensor->type)) {
            qtype.to_float(tensor->data, f32_output, nelements);
        } else {
-            GGML_ASSERT(false); // unreachable
+            GGML_ABORT("fatal error"); // unreachable
        }
        return;
    }
@@ -16904,8 +16901,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {

        // all model arches should be listed explicitly here
        case LLM_ARCH_UNKNOWN:
-            GGML_ASSERT(false && "unknown architecture");
-            break;
+            GGML_ABORT("unknown architecture");
    }

    return LLAMA_ROPE_TYPE_NONE;
@@ -18469,7 +18465,7 @@ float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
 #ifndef NDEBUG
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
 #endif
        return nullptr;
    }
@@ -18514,7 +18510,7 @@ float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: invalid embeddings id %d, reason: %s\n", __func__, i, err.what());
 #ifndef NDEBUG
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
 #endif
        return nullptr;
    }