diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index 60362c5d9..577dc6e82 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -40,7 +40,6 @@ struct llama_hparams {
     uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head
     uint32_t n_expert = 0;
     uint32_t n_expert_used = 0;
-    uint32_t n_vocab_type = 0; // for BERT-style token types
     uint32_t n_rel_attn_bkts = 0;
 
     // for WavTokenizer
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 1d06af60c..8563652a0 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -497,8 +497,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
         hparams.n_embd_head_v = 0;
     }
 
+    // for differentiating model types
     uint32_t n_vocab = 0;
-    ml.get_key(LLM_KV_VOCAB_SIZE, n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, n_vocab, false);
 
     // arch-specific KVs
@@ -622,7 +622,6 @@
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
-               ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
 
                switch (hparams.n_layer) {
@@ -645,7 +644,6 @@
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
-               ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
 
                hparams.f_max_alibi_bias = 8.0f;
@@ -659,7 +657,6 @@
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
-               ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
 
                if (hparams.n_layer == 12 && hparams.n_embd == 768) {
@@ -1367,7 +1364,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     const int64_t n_ff = hparams.n_ff();
     const int64_t n_embd_gqa = n_embd_v_gqa;
     const int64_t n_vocab = vocab.n_vocab();
-    const int64_t n_vocab_type = hparams.n_vocab_type;
+    const int64_t n_token_types = vocab.n_token_types();
     const int64_t n_rot = hparams.n_rot;
     const int64_t n_expert = hparams.n_expert;
     const int64_t n_expert_used = hparams.n_expert_used;
@@ -1812,7 +1809,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
        case LLM_ARCH_NOMIC_BERT:
            {
                tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-               type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type}, 0);
+               type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0);
 
                if (arch == LLM_ARCH_BERT) {
                    pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
@@ -1866,7 +1863,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
        case LLM_ARCH_JINA_BERT_V2:
            {
                tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); // word_embeddings
-               type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type}, 0); // token_type_embeddings
+               type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0); // token_type_embeddings
 
                tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0); // LayerNorm
                tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0); //LayerNorm bias
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 1ae959b95..ee93eeab5 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1205,6 +1205,7 @@ struct fragment_buffer_variant {
 struct llama_vocab::impl {
     uint32_t n_vocab = 0;
+    uint32_t n_token_types = 0; // for BERT-style token types
 
     std::unordered_map<std::string, llama_token> token_to_id;
     std::vector<token_data> id_to_token;
@@ -1286,6 +1287,7 @@ void llama_vocab::load(llama_model_loader & ml, const LLM_KV & kv) {
     struct gguf_context * ctx = ml.meta.get();
 
     auto & n_vocab = pimpl->n_vocab;
+    auto & n_token_types = pimpl->n_token_types;
     auto & id_to_token = pimpl->id_to_token;
     auto & token_to_id = pimpl->token_to_id;
     auto & special_eog_ids = pimpl->special_eog_ids;
@@ -1300,6 +1302,8 @@ void llama_vocab::load(llama_model_loader & ml, const LLM_KV & kv) {
     ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model);
     ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
 
+    ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, n_token_types, false);
+
     if (tokenizer_model == "no_vocab" || tokenizer_model == "none") {
         type = LLAMA_VOCAB_TYPE_NONE;
 
@@ -2013,6 +2017,10 @@ uint32_t llama_vocab::n_vocab() const {
     return (uint32_t) pimpl->id_to_token.size();
 }
 
+uint32_t llama_vocab::n_token_types() const {
+    return (uint32_t) pimpl->n_token_types;
+}
+
 std::string llama_vocab::type_name() const{
     switch (type) {
         case LLAMA_VOCAB_TYPE_NONE: return "no vocab";
diff --git a/src/llama-vocab.h b/src/llama-vocab.h
index 84bd7c440..710464f21 100644
--- a/src/llama-vocab.h
+++ b/src/llama-vocab.h
@@ -24,8 +24,8 @@ struct llama_vocab {
    enum llama_vocab_type get_type() const;
    enum llama_vocab_pre_type get_pre_type() const;
 
-   // TODO: how to deduplicate with llama_hparams.n_vocab ?
    uint32_t n_vocab() const;
+   uint32_t n_token_types() const;
 
    std::string type_name() const;
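
Note: after this patch the token-type count is owned by llama_vocab (read optionally from LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT during llama_vocab::load, defaulting to 0) instead of llama_hparams. A minimal sketch of how internal code might consume the new accessor; the helper name has_token_types is hypothetical, and the snippet assumes the in-tree header llama-vocab.h:

    #include "llama-vocab.h"

    // BERT-style models size their token-type embedding matrix as
    // {n_embd, n_token_types}; a count of 0 means the model declares no token types.
    static bool has_token_types(const llama_vocab & vocab) {
        return vocab.n_token_types() > 0;
    }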