diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index 60362c5d9..577dc6e82 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -40,7 +40,6 @@ struct llama_hparams {
     uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head
     uint32_t n_expert = 0;
     uint32_t n_expert_used = 0;
-    uint32_t n_vocab_type = 0; // for BERT-style token types
     uint32_t n_rel_attn_bkts = 0;
 
     // for WavTokenizer
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 1d06af60c..8563652a0 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -497,8 +497,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
         hparams.n_embd_head_v = 0;
     }
 
+    // for differentiating model types
     uint32_t n_vocab = 0;
-    ml.get_key(LLM_KV_VOCAB_SIZE, n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, n_vocab, false);
 
     // arch-specific KVs
@@ -622,7 +622,6 @@
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
-               ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
 
                switch (hparams.n_layer) {
@@ -645,7 +644,6 @@
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
-               ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
 
                hparams.f_max_alibi_bias = 8.0f;
@@ -659,7 +657,6 @@
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
-               ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
 
                if (hparams.n_layer == 12 && hparams.n_embd == 768) {
@@ -1367,7 +1364,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     const int64_t n_ff = hparams.n_ff();
     const int64_t n_embd_gqa = n_embd_v_gqa;
     const int64_t n_vocab = vocab.n_vocab();
-    const int64_t n_vocab_type = hparams.n_vocab_type;
+    const int64_t n_token_types = vocab.n_token_types();
     const int64_t n_rot = hparams.n_rot;
     const int64_t n_expert = hparams.n_expert;
     const int64_t n_expert_used = hparams.n_expert_used;
@@ -1812,7 +1809,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
        case LLM_ARCH_NOMIC_BERT:
            {
                tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-               type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type}, 0);
+               type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0);
 
                if (arch == LLM_ARCH_BERT) {
                    pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
@@ -1866,7 +1863,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
        case LLM_ARCH_JINA_BERT_V2:
            {
                tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); // word_embeddings
-               type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type}, 0); // token_type_embeddings
+               type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0); // token_type_embeddings
 
                tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0); // LayerNorm
                tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0); //LayerNorm bias
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 1ae959b95..ee93eeab5 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1205,6 +1205,7 @@ struct fragment_buffer_variant {
 struct llama_vocab::impl {
     uint32_t n_vocab = 0;
+    uint32_t n_token_types = 0; // for BERT-style token types
 
     std::unordered_map<std::string, llama_token> token_to_id;
     std::vector<token_data> id_to_token;
@@ -1286,6 +1287,7 @@ void llama_vocab::load(llama_model_loader & ml, const LLM_KV & kv) {
     struct gguf_context * ctx = ml.meta.get();
 
     auto & n_vocab = pimpl->n_vocab;
+    auto & n_token_types = pimpl->n_token_types;
     auto & id_to_token = pimpl->id_to_token;
     auto & token_to_id = pimpl->token_to_id;
     auto & special_eog_ids = pimpl->special_eog_ids;
@@ -1300,6 +1302,8 @@ void llama_vocab::load(llama_model_loader & ml, const LLM_KV & kv) {
     ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model);
     ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
 
+    ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, n_token_types, false);
+
     if (tokenizer_model == "no_vocab" || tokenizer_model == "none") {
         type = LLAMA_VOCAB_TYPE_NONE;
 
@@ -2013,6 +2017,10 @@ uint32_t llama_vocab::n_vocab() const {
     return (uint32_t) pimpl->id_to_token.size();
 }
 
+uint32_t llama_vocab::n_token_types() const {
+    return (uint32_t) pimpl->n_token_types;
+}
+
 std::string llama_vocab::type_name() const{
     switch (type) {
         case LLAMA_VOCAB_TYPE_NONE: return "no vocab";
diff --git a/src/llama-vocab.h b/src/llama-vocab.h
index 84bd7c440..710464f21 100644
--- a/src/llama-vocab.h
+++ b/src/llama-vocab.h
@@ -24,8 +24,8 @@ struct llama_vocab {
    enum llama_vocab_type get_type() const;
    enum llama_vocab_pre_type get_pre_type() const;
 
-   // TODO: how to deduplicate with llama_hparams.n_vocab ?
    uint32_t n_vocab() const;
+   uint32_t n_token_types() const;
 
    std::string type_name() const;
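
Note: after this patch the token-type count is owned by llama_vocab (read optionally from LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT during llama_vocab::load, defaulting to 0) instead of llama_hparams. A minimal sketch of how internal code might consume the new accessor; the helper name has_token_types is hypothetical, and the snippet assumes the in-tree header llama-vocab.h:

    #include "llama-vocab.h"

    // BERT-style models size their token-type embedding matrix as
    // {n_embd, n_token_types}; a count of 0 means the model declares no token types.
    static bool has_token_types(const llama_vocab & vocab) {
        return vocab.n_token_types() > 0;
    }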