diff --git a/convert.py b/convert.py
index f680f8596..4ba36f280 100755
--- a/convert.py
+++ b/convert.py
@@ -241,17 +241,19 @@ class BpeVocab:
             added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
         else:
             added_tokens = {}
+
         vocab_size: int = len(self.bpe_tokenizer)
-        expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
-        actual_ids = sorted(added_tokens.values())
+        expected_ids    = list(range(vocab_size, vocab_size + len(added_tokens)))
+        actual_ids      = sorted(added_tokens.values())
         if expected_ids != actual_ids:
             raise Exception(f"Expected added token IDs to be sequential and start at {len(added_tokens)}; got {actual_ids}")
+
         items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
-        self.added_tokens_list = [text for (text, idx) in items]
+        self.added_tokens_list    = [text for (text, idx) in items]
         self.vocab_size_base: int = vocab_size
-        self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
-        self.fname_tokenizer = fname_tokenizer
-        self.fname_added_tokens = fname_added_tokens
+        self.vocab_size: int      = self.vocab_size_base + len(self.added_tokens_list)
+        self.fname_tokenizer      = fname_tokenizer
+        self.fname_added_tokens   = fname_added_tokens
 
     def bpe_tokens(self) -> Iterable[Tuple[bytes, float]]:
         tokenizer = self.bpe_tokenizer
@@ -261,12 +263,12 @@ class BpeVocab:
         for i, item in enumerate(tokenizer):
             text: bytes = item.encode("utf-8")
             score: float = -i
-            yield text, score, 4
+            yield text, score, gguf.TokenType.USER_DEFINED
 
     def added_tokens(self) -> Iterable[Tuple[bytes, float]]:
         for text in self.added_tokens_list:
             score = -1000.0
-            yield text.encode("utf-8"), score, 4
+            yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED
 
     def all_tokens(self) -> Iterable[Tuple[bytes, float]]:
         yield from self.bpe_tokens()
@@ -304,27 +306,27 @@ class SentencePieceVocab:
             text: bytes = piece.encode("utf-8")
             score: float = tokenizer.get_score(i)
 
-            toktype = 1  # defualt to normal token type
+            toktype = gguf.TokenType.NORMAL
             if tokenizer.is_unknown(i):
-                toktype = 2
+                toktype = gguf.TokenType.UNKNOWN
             if tokenizer.is_control(i):
-                toktype = 3
+                toktype = gguf.TokenType.CONTROL
 
             # NOTE: I think added_tokens are user defined.
             # ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
-            # if tokenizer.is_user_defined(i): toktype = 4
+            # if tokenizer.is_user_defined(i): toktype = gguf.TokenType.USER_DEFINED
 
             if tokenizer.is_unused(i):
-                toktype = 5
+                toktype = gguf.TokenType.UNUSED
             if tokenizer.is_byte(i):
-                toktype = 6
+                toktype = gguf.TokenType.BYTE
 
             yield text, score, toktype
 
     def added_tokens(self) -> Iterable[Tuple[bytes, float]]:
         for text in self.added_tokens_list:
             score = -1000.0
-            yield text.encode("utf-8"), score, 4
+            yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED
 
     def all_tokens(self) -> Iterable[Tuple[bytes, float]]:
         yield from self.sentencepiece_tokens()
@@ -725,6 +727,7 @@ class OutputFile:
         self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
 
     def add_meta_arch(self, params: Params) -> None:
+        self.gguf.add_name                ("llama")
         self.gguf.add_context_length      (params.n_ctx)
         self.gguf.add_embedding_length    (params.n_embd)
         self.gguf.add_block_count         (params.n_layer)
diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
index af493e15b..469d6e3de 100644
--- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
+++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
@@ -139,14 +139,16 @@ void print_sample_weights(TransformerWeights *w){
 struct llama_vocab {
     using id    = int32_t;
     using token = std::string;
+    using ttype = llama_token_type;
 
-    struct token_score {
-        token tok;
+    struct token_data {
+        token text;
         float score;
+        ttype type;
     };
 
     std::unordered_map<token, id> token_to_id;
-    std::vector<token_score> id_to_token;
+    std::vector<token_data> id_to_token;
 };
 
 struct my_llama_hparams {
@@ -516,36 +518,30 @@ void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab)
         struct llama_model * lmodel = llama_load_model_from_file(filename, llama_params);
         struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params);
 
-        std::vector<const char *> strings;
-        std::vector<float> scores;
-        int n_vocab = llama_n_vocab(lctx);
-        strings.resize(n_vocab, NULL);
-        scores.resize(n_vocab, 0);
-        n_vocab = llama_get_vocab(lctx, strings.data(), scores.data(), n_vocab);
-        GGML_ASSERT(n_vocab == llama_n_vocab(lctx));
+        const int n_vocab = llama_n_vocab(lctx);
         vocab->id_to_token.resize(n_vocab);
         for (int i=0; i<n_vocab; ++i) {
-            std::string tok   = std::string(strings[i]);
-            float       score = scores[i];
-            vocab->id_to_token[i].tok   = tok;
-            vocab->id_to_token[i].score = score;
-            vocab->token_to_id.emplace(tok, i);
+            vocab->id_to_token[i].text  = llama_token_get_text(lctx, i);
+            vocab->id_to_token[i].score = llama_token_get_score(lctx, i);
+            vocab->id_to_token[i].type  = llama_token_get_type(lctx, i);
+            vocab->token_to_id.emplace(vocab->id_to_token[i].text, i);
         }
         llama_free(lctx);
         llama_free_model(lmodel);
     } else { // assume llama2.c vocabulary
         printf("Assuming llama2.c vocabulary since %s is not a ggml file\n", filename);
         llama_file file(filename, "rb");
-        uint32_t n_vocab = config->vocab_size;
+        const int  n_vocab = config->vocab_size;
         /* uint32_t max_token_length =  */ file.read_u32(); // unused
         vocab->id_to_token.resize(n_vocab);
-        for (uint32_t i=0; i<n_vocab; ++i) {
+        for (int i=0; i<n_vocab; ++i) {
             float_t score = file.read_f32();
             uint32_t len = file.read_u32();
-            std::string tok = file.read_string(len);
-            vocab->id_to_token[i].tok = tok;
+            std::string text = file.read_string(len);
+            vocab->id_to_token[i].text = text;
             vocab->id_to_token[i].score = score;
-            vocab->token_to_id.emplace(tok, i);
+            vocab->id_to_token[i].type = LLAMA_TOKEN_TYPE_UNDEFINED;
+            vocab->token_to_id.emplace(text, i);
         }
     }
 }
@@ -611,10 +607,10 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
 //    // write_vocab - for now we are just writing the existing BPE voc. assuming karpathy's vocabulary is the same. idk.
 //    uint32_t n_vocab = model->hparams.n_vocab;
 //    for (uint32_t i = 0; i < n_vocab; i++) {
-//        const auto & token_score = vocab->id_to_token.at(i);
-//        file.write_u32((uint32_t) token_score.tok.size());
-//        file.write_raw(token_score.tok.data(), token_score.tok.size());
-//        file.write_raw(&token_score.score, sizeof(token_score.score));
+//        const auto & token_data = vocab->id_to_token.at(i);
+//        file.write_u32((uint32_t) token_data.text.size());
+//        file.write_raw(token_data.text.data(), token_data.text.size());
+//        file.write_raw(&token_data.score, sizeof(token_data.score));
 //    }
 //
 //    // stuff AK weights into GG weights one by one.
diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp
index 922518da4..31d6620a2 100644
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@@ -170,14 +170,16 @@ struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struc
 struct llama_vocab {
     using id    = int32_t;
     using token = std::string;
+    using ttype = llama_token_type;
 
-    struct token_score {
-        token tok;
+    struct token_data {
+        token text;
         float score;
+        ttype type;
     };
 
     std::unordered_map<token, id> token_to_id;
-    std::vector<token_score> id_to_token;
+    std::vector<token_data> id_to_token;
 };
 
 struct my_llama_hparams {
@@ -2629,10 +2631,10 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
 //    // write_vocab
 //    uint32_t n_vocab = model->hparams.n_vocab;
 //    for (uint32_t i = 0; i < n_vocab; i++) {
-//        const auto & token_score = vocab->id_to_token.at(i);
-//        file.write_u32((uint32_t) token_score.tok.size());
-//        file.write_raw(token_score.tok.data(), token_score.tok.size());
-//        file.write_raw(&token_score.score, sizeof(token_score.score));
+//        const auto & token_data = vocab->id_to_token.at(i);
+//        file.write_u32((uint32_t) token_data.text.size());
+//        file.write_raw(token_data.text.data(), token_data.text.size());
+//        file.write_raw(&token_data.score, sizeof(token_data.score));
 //    }
 //    // write tensors
 //    write_tensor(&file, model->tok_embeddings);
@@ -3055,20 +3057,13 @@ int main(int argc, char ** argv) {
 
     struct llama_vocab vocab;
     {
-        std::vector<const char *> strings;
-        std::vector<float> scores;
-        int n_vocab = llama_n_vocab(lctx);
-        strings.resize(n_vocab, NULL);
-        scores.resize(n_vocab, 0);
-        n_vocab = llama_get_vocab(lctx, strings.data(), scores.data(), n_vocab);
-        GGML_ASSERT(n_vocab == llama_n_vocab(lctx));
+        const int n_vocab = llama_n_vocab(lctx);
         vocab.id_to_token.resize(n_vocab);
         for (int i=0; i<n_vocab; ++i) {
-            std::string tok   = std::string(strings[i]);
-            float       score = scores[i];
-            vocab.id_to_token[i].tok   = tok;
-            vocab.id_to_token[i].score = score;
-            vocab.token_to_id.emplace(tok, i);
+            vocab.id_to_token[i].text  = llama_token_get_text(lctx, i);
+            vocab.id_to_token[i].score = llama_token_get_score(lctx, i);
+            vocab.id_to_token[i].type  = llama_token_get_type(lctx, i);
+            vocab.token_to_id.emplace(vocab.id_to_token[i].text, i);
         }
     }
 
diff --git a/gguf.py b/gguf.py
index 60ee52f09..9776649c7 100644
--- a/gguf.py
+++ b/gguf.py
@@ -61,6 +61,7 @@ KEY_TOKENIZER_PAD_ID     = "tokenizer.ggml.padding_token_id"
 KEY_TOKENIZER_HF_JSON    = "tokenizer.huggingface.json"
 KEY_TOKENIZER_RWKV       = "tokenizer.rwkv.world"
 
+
 #
 # recommended mapping of model tensor names for storage in gguf
 #
@@ -319,6 +320,15 @@ def get_tensor_name_map(arch: MODEL_ARCH, n_blocks: int) -> dict:
 
     return tensor_map
 
+
+class TokenType(IntEnum):  # per-token type ids; values 1-6 equal the same-named LLAMA_TOKEN_TYPE_* constants in llama.h
+    NORMAL       = 1
+    UNKNOWN      = 2
+    CONTROL      = 3
+    USER_DEFINED = 4
+    UNUSED       = 5
+    BYTE         = 6
+
 #
 # implementation
 #
diff --git a/llama.cpp b/llama.cpp
index 1785025f0..c97aaee69 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -771,11 +771,12 @@ struct llama_vocab {
 
     using id    = int32_t;
     using token = std::string;
+    using ttype = llama_token_type;
 
     struct token_data {
-        token tok;
+        token text;
         float score;
-        int toktype;
+        ttype type;
     };
 
     llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
@@ -1521,12 +1522,12 @@ static void llama_model_load_internal(
             vocab.token_to_id[word] = i;
 
             auto & token_data = vocab.id_to_token[i];
-            token_data.tok = std::move(word);
+            token_data.text  = std::move(word);
             token_data.score = scores[i];
-            token_data.toktype = toktypes[i];
+            token_data.type  = (llama_token_type) toktypes[i];
 
             // determine the newline token: 0x0A == 10 == '\n'
-            if (token_data.tok == "<0x0A>") {
+            if (token_data.text == "<0x0A>") {
                 vocab.linefeed_id = i;
             }
         }
@@ -1558,12 +1559,12 @@ static void llama_model_load_internal(
         LLAMA_LOG_INFO("%s: general.name = %s\n",    __func__, general_name.c_str());
 
         // special tokens
-        if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].tok.c_str() ); }
-        if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].tok.c_str() ); }
-        if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].tok.c_str() ); }
-        if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].tok.c_str() ); }
-        if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].tok.c_str() ); }
-        if (vocab.linefeed_id    != -1) { LLAMA_LOG_INFO( "%s: LF token  = %d '%s'\n", __func__, vocab.linefeed_id,    vocab.id_to_token[vocab.linefeed_id].tok.c_str() );    }
+        if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
+        if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
+        if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
+        if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
+        if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
+        if (vocab.linefeed_id    != -1) { LLAMA_LOG_INFO( "%s: LF token  = %d '%s'\n", __func__, vocab.linefeed_id,    vocab.id_to_token[vocab.linefeed_id].text.c_str() );    }
     }
 
     if (vocab_only) {
@@ -2355,15 +2356,27 @@ static enum llama_vocab_type llama_vocab_get_type(const llama_vocab & vocab) {
 }
 
 static bool llama_is_normal_token(const llama_vocab & vocab, llama_token id) {
-    return vocab.id_to_token[id].toktype == 1;
+    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_NORMAL;
 }
 
 static bool llama_is_unknown_token(const llama_vocab & vocab, llama_token id) {
-    return vocab.id_to_token[id].toktype == 2;
+    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_UNKNOWN;
 }
 
 static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) {
-    return vocab.id_to_token[id].toktype == 3;
+    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_CONTROL;
+}
+
+static bool llama_is_user_defined_token(const llama_vocab & vocab, llama_token id) {
+    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_USER_DEFINED;
+}
+
+static bool llama_is_unused_token(const llama_vocab & vocab, llama_token id) {
+    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_UNUSED;
+}
+
+static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
+    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_BYTE;
 }
 
 static bool llama_is_bos_token(const llama_vocab & vocab, llama_token id) {
@@ -2381,22 +2394,10 @@ static bool llama_is_pad_token(const llama_vocab & vocab, llama_token id ) {
     return id == vocab.special_pad_id;
 }
 
-static bool llama_is_user_defined_token(const llama_vocab & vocab, llama_token id) {
-    return vocab.id_to_token[id].toktype == 4;
-}
-
-static bool llama_is_unused_token(const llama_vocab & vocab, llama_token id) {
-    return vocab.id_to_token[id].toktype == 5;
-}
-
-static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
-    return vocab.id_to_token[id].toktype == 6;
-}
-
 static uint8_t llama_token_to_byte(const llama_vocab & vocab, llama_token id) {
     GGML_ASSERT(llama_is_byte_token(vocab, id));
     const auto& token_data = vocab.id_to_token.at(id);
-    auto buf = token_data.tok.substr(3, 2);
+    auto buf = token_data.text.substr(3, 2);
     return strtol(buf.c_str(), NULL, 16);
 }
 
@@ -2709,6 +2710,7 @@ static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
 
     bool found            = false;
     bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR;
+
     GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT); // NOLINT
 
     do {
@@ -4957,25 +4959,16 @@ float * llama_get_embeddings(struct llama_context * ctx) {
     return ctx->embedding.data();
 }
 
-int llama_get_vocab(
-        const struct llama_context * ctx,
-        const char * * strings,
-        float  * scores,
-        int capacity) {
-    return llama_model_get_vocab(&ctx->model, strings, scores, capacity);
+const char * llama_token_get_text(const struct llama_context * ctx, llama_token token) {
+    return ctx->model.vocab.id_to_token[token].text.c_str();
 }
 
-int llama_model_get_vocab(
-        const struct llama_model * model,
-        const char * * strings,
-        float  * scores,
-        int capacity) {
-    int n = std::min(capacity, (int) model->vocab.id_to_token.size());
-    for (int i = 0; i<n; ++i) {
-        strings[i] = model->vocab.id_to_token[i].tok.c_str();
-        scores[i]  = model->vocab.id_to_token[i].score;
-    }
-    return n;
+float llama_token_get_score(const struct llama_context * ctx, llama_token token) {
+    return ctx->model.vocab.id_to_token[token].score;
+}
+
+llama_token_type llama_token_get_type(const struct llama_context * ctx, llama_token token) {
+    return ctx->model.vocab.id_to_token[token].type;
 }
 
 llama_token llama_token_bos(const struct llama_context * ctx) {
@@ -5046,7 +5039,7 @@ int llama_token_to_str(const struct llama_context * ctx, llama_token token, char
 
 int llama_token_to_str_bpe(const struct llama_context * ctx, llama_token token, char * buf, int length) {
     if (0 <= token && token < llama_model_n_vocab(&ctx->model)) {
-        std::string result = ctx->model.vocab.id_to_token[token].tok;
+        std::string result = ctx->model.vocab.id_to_token[token].text;
         if (length < (int) result.length()) {
             return -result.length();
         }
@@ -5060,7 +5053,7 @@ int llama_token_to_str_bpe(const struct llama_context * ctx, llama_token token,
 int llama_token_to_str_with_model(const struct llama_model * model, llama_token token, char * buf, int length) {
     if (0 <= token && token < llama_model_n_vocab(model)) {
         if (llama_is_normal_token(model->vocab, token)) {
-            std::string result = model->vocab.id_to_token[token].tok;
+            std::string result = model->vocab.id_to_token[token].text;
             if (llama_vocab_get_type(model->vocab) == LLAMA_VOCAB_TYPE_SPM) {
                 result = llama_unescape_whitespace(result);
             }
diff --git a/llama.h b/llama.h
index 0ea65c1b5..aa5b7d69c 100644
--- a/llama.h
+++ b/llama.h
@@ -72,6 +72,16 @@ extern "C" {
         LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding
     };
 
+    enum llama_token_type {
+        LLAMA_TOKEN_TYPE_UNDEFINED    = 0,
+        LLAMA_TOKEN_TYPE_NORMAL       = 1,
+        LLAMA_TOKEN_TYPE_UNKNOWN      = 2,
+        LLAMA_TOKEN_TYPE_CONTROL      = 3,
+        LLAMA_TOKEN_TYPE_USER_DEFINED = 4,
+        LLAMA_TOKEN_TYPE_UNUSED       = 5,
+        LLAMA_TOKEN_TYPE_BYTE         = 6,
+    };
+
     // model file types
     enum llama_ftype {
         LLAMA_FTYPE_ALL_F32              = 0,
@@ -330,19 +340,11 @@ extern "C" {
     // Vocab
     //
 
-    // Get the vocabulary as output parameters.
-    // Returns number of results.
-    LLAMA_API int llama_get_vocab(
-            const struct llama_context * ctx,
-                          const char * * strings,
-                                 float * scores,
-                                   int   capacity);
+    LLAMA_API const char * llama_token_get_text(const struct llama_context * ctx, llama_token token);
 
-    LLAMA_API int llama_model_get_vocab(
-              const struct llama_model * model,
-                          const char * * strings,
-                                 float * scores,
-                                   int   capacity);
+    LLAMA_API float llama_token_get_score(const struct llama_context * ctx, llama_token token);
+
+    LLAMA_API llama_token_type llama_token_get_type(const struct llama_context * ctx, llama_token token);
 
     // Special tokens
     LLAMA_API llama_token llama_token_bos(const struct llama_context * ctx);  // beginning-of-sentence