diff --git a/convert.py b/convert.py
index d4750df9f..ba1522990 100755
--- a/convert.py
+++ b/convert.py
@@ -237,12 +237,12 @@ class SentencePieceVocab:
     def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path], vocabtype: Optional[str]) -> None:
         self.vocabtype = vocabtype
         if self.vocabtype == "bpe":
-          self.sentencepiece_tokenizer = json.loads(open(str(fname_tokenizer)).read())
+            self.sentencepiece_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read())
         else:
-          self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
+            self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
         added_tokens: Dict[str, int]
         if fname_added_tokens is not None:
-            added_tokens = json.load(open(fname_added_tokens))
+            added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
         else:
             added_tokens = {}
         if self.vocabtype == "bpe":
@@ -267,8 +267,9 @@ class SentencePieceVocab:
           byte_encoder = tokenization_gpt2.bytes_to_unicode()
           byte_decoder = {v: k for k, v in byte_encoder.items()}
           for i, item in enumerate(tokenizer):
-            text: bytes
-            text = b''.join([x.to_bytes(1, byteorder='big') for x in [byte_decoder[y] for y in item]])
+            # text: bytes
+            # text = b''.join([x.to_bytes(1, byteorder='big') for x in [byte_decoder[y] for y in item]])
+            text: bytes = item.encode("utf-8")
             score: float = -i
             yield text, score
         else:
diff --git a/llama.cpp b/llama.cpp
index 53f73f6b8..797bbd760 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -539,7 +539,7 @@ struct llama_file_loader {
             float score = 0.0f;
             file.read_raw(&score, sizeof(score));
 
-            assert(vocab.token_to_id.find(word) == vocab.token_to_id.end());
+            GGML_ASSERT(vocab.token_to_id.find(word) == vocab.token_to_id.end());
             vocab.token_to_id[word] = i;
 
             auto & tok_score = vocab.id_to_token[i];
@@ -1832,36 +1832,77 @@ static bool llama_eval_internal(
 // tokenizer
 //
 
-bool llama_is_normal_token(llama_token token) {
-    return token >= 259;
+static std::string llama_vocab_type(const llama_vocab& vocab) { // heuristic: exactly 32000 entries -> "spm", any other size -> "bpe"
+    return std::string(vocab.token_to_id.size() != 32000 ? "bpe" : "spm");
 }
 
-bool llama_is_unknown_token(llama_token token) {
-    return token == 0;
+static bool llama_is_normal_token(const llama_vocab& vocab, llama_token token) {
+    // A token is "normal" when its id is >= 259 ("spm") or >= 95 ("bpe").
+    // Any other vocabulary type reports no normal tokens.
+    const std::string type = llama_vocab_type(vocab);
+    if (type == "spm") { return token >= 259; }
+    if (type == "bpe") { return token >= 95; }
+    return false;
 }
 
-bool llama_is_control_token(llama_token token) {
-    return token == 1 || token == 2;
+static bool llama_is_unknown_token(const llama_vocab& vocab, llama_token token) {
+    // Returns true only for id 0 of an "spm" vocabulary.
+    // TODO: improve? (every other vocabulary type reports no unknown token)
+    const bool is_spm = llama_vocab_type(vocab) == "spm";
+    // short-circuit keeps the non-spm result at false
+    return is_spm && token == 0;
 }
 
-bool llama_is_bos_token(llama_token token) {
-    return token == 1;
+static bool llama_is_control_token(const llama_vocab& vocab, llama_token token) {
+    // Returns true only for ids 1 and 2 of an "spm" vocabulary.
+    // TODO: improve? (every other vocabulary type reports no control tokens)
+    const bool is_spm = llama_vocab_type(vocab) == "spm";
+    // short-circuit keeps the non-spm result at false
+    return is_spm && (token == 1 || token == 2);
 }
 
-bool llama_is_eos_token(llama_token token) {
-    return token == 2;
+static bool llama_is_bos_token(const llama_vocab& vocab, llama_token token) {
+    // Returns true only for id 1 of an "spm" vocabulary.
+    // TODO: improve? (every other vocabulary type reports no BOS token)
+    const bool is_spm = llama_vocab_type(vocab) == "spm";
+    // short-circuit keeps the non-spm result at false
+    return is_spm && token == 1;
 }
 
-bool llama_is_user_defined_token(llama_token token) {
+static bool llama_is_eos_token(const llama_vocab& vocab, llama_token token) {
+    // Returns true only for id 2 of an "spm" vocabulary.
+    // TODO: improve? (every other vocabulary type reports no EOS token)
+    const bool is_spm = llama_vocab_type(vocab) == "spm";
+    // short-circuit keeps the non-spm result at false
+    return is_spm && token == 2;
+}
+
+static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token token) {
+    // TODO: improve? Placeholder: both arguments are ignored and no token is ever reported as user-defined.
     return false;
 }
 
-bool llama_is_unused_token(llama_token token) {
+static bool llama_is_unused_token(const llama_vocab& vocab, llama_token token) {
+    // TODO: improve? Placeholder: both arguments are ignored and no token is ever reported as unused.
     return false;
 }
 
-bool llama_is_byte_token(llama_token token) {
-    return 3 <= token && token < 259;
+static bool llama_is_byte_token(const llama_vocab& vocab, llama_token token) {
+    // Byte tokens occupy a fixed id window: [3, 259) for "spm", [1, 95) for "bpe".
+    // Any other vocabulary type reports no byte tokens.
+    const std::string type = llama_vocab_type(vocab);
+    if (type == "spm") { return token >= 3 && token < 259; }
+    if (type == "bpe") { return token >= 1 && token < 95; }
+    return false;
+}
+
+static llama_token llama_byte_to_char(const llama_vocab& vocab, uint8_t byte) { // returns llama_token (not uint8_t) so spm ids 256..258 are not wrapped to 0..2
+    if(llama_vocab_type(vocab) == "spm")
+        return byte + 3; // spm: byte b maps to id b + 3 (computed as int, range 3..258)
+    else if(llama_vocab_type(vocab) == "bpe")
+        return byte + 32; // NOTE(review): result can exceed the bpe byte-token window [1, 95) - confirm the intended direction of this offset
+    else
+        return 0; // was `return false`; not reachable while llama_vocab_type only yields "spm" or "bpe"
 }
 
 static std::string llama_escape_whitespace(const std::string& text) {
@@ -1932,6 +1973,7 @@ struct llama_tokenizer {
         while (offs < text.size()) {
             llama_sp_symbol sym;
             size_t len = utf8_len(text[offs]);
+            GGML_ASSERT(offs + len <= text.size());
             sym.text = text.c_str() + offs;
             sym.n = len;
             offs += len;
@@ -1999,7 +2041,7 @@ private:
         if (p == rev_merge.end()) {
             // output any symbols that did not form tokens as bytes.
             for (int j = 0; j < (int)symbol.n; ++j) {
-                llama_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
+                llama_vocab::id token_id = llama_byte_to_char(vocab_, symbol.text[j]);
                 output.push_back(token_id);
             }
             return;
@@ -4054,7 +4096,7 @@ int llama_tokenize_with_model(
                  llama_token * tokens,
                          int   n_max_tokens,
                         bool   add_bos) {
-    auto res = llama_tokenize(model->vocab, text, add_bos, true);
+    auto res = llama_tokenize(model->vocab, text, add_bos, llama_vocab_type(model->vocab) == "spm");
 
     if (n_max_tokens < (int) res.size()) {
         return -((int) res.size());
@@ -4186,29 +4228,32 @@ float * llama_get_embeddings(struct llama_context * ctx) {
 
 int llama_token_to_str_with_model(const struct llama_model * model, llama_token token, char * str, int length) {
     if (0 <= token && token < llama_n_vocab_from_model(model)) {
-        if (llama_is_normal_token(token)) {
-            std::string result = llama_unescape_whitespace(model->vocab.id_to_token[token].tok);
+        if (llama_is_normal_token(model->vocab, token)) { // normal token: copy its text out
+            std::string result = model->vocab.id_to_token[token].tok;
+            if(llama_vocab_type(model->vocab) == "spm") { // whitespace unescaping is applied to spm vocabularies only
+                result = llama_unescape_whitespace(result);
+            }
             if(result.length() > length) {
                 return - result.length();
             }
             strcpy(str, result.c_str());
             return result.length();
-        } else if (llama_is_unknown_token(token)) {
+        } else if (llama_is_unknown_token(model->vocab, token)) { // unknown token: emit a fixed 3-byte placeholder
             if(3 > length) {
                 return -3;
             }
             strcpy(str, "\xe2\x96\x85");
             return 3;
-        } else if (llama_is_control_token(token)) {
+        } else if (llama_is_control_token(model->vocab, token)) { // control tokens produce no text (falls through to return 0)
             ;
-        } else if (llama_is_byte_token(token)) {
+        } else if (llama_is_byte_token(model->vocab, token)) { // byte token: emit the single raw byte it stands for
             if(1 > length) {
                 return -1;
             }
-            str[0] = token - 3;
+            str[0] = (llama_vocab_type(model->vocab) == "spm") ? token - 3 : llama_byte_to_char(model->vocab, token); // spm: id -> byte is id - 3 (inverse of byte + 3), as before this patch
             str[1] = 0x00;
             return 1;
-        }    
+        }
     }
     return 0;
 }
diff --git a/models/ggml-vocab-aquila.bin b/models/ggml-vocab-aquila.bin
new file mode 100644
index 000000000..e06b39b5a
Binary files /dev/null and b/models/ggml-vocab-aquila.bin differ
diff --git a/models/ggml-vocab.bin b/models/ggml-vocab-llama.bin
similarity index 100%
rename from models/ggml-vocab.bin
rename to models/ggml-vocab-llama.bin
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index d54f0fdbb..79f16b0f8 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -1,4 +1,19 @@
-function(llama_add_test source)
+function(llama_build_executable source)  # build + install only; registers no CTest entry
+    get_filename_component(exe_name ${source} NAME_WE)  # target name = source file name without directory/extension
+    add_executable(${exe_name} ${source})
+    target_link_libraries(${exe_name} PRIVATE llama)
+    install(TARGETS ${exe_name} RUNTIME)
+endfunction()
+
+function(llama_test_executable name source)
+    get_filename_component(TEST_TARGET ${source} NAME_WE)
+    # Registers a CTest entry called ${name}; no target is created here.
+    # The executable target named after ${source} must be defined elsewhere
+    # (e.g. with llama_build_executable). Extra arguments are passed to the test.
+    add_test(NAME ${name} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN})
+endfunction()
+
+function(llama_build_and_test_executable source)
     get_filename_component(TEST_TARGET ${source} NAME_WE)
     add_executable(${TEST_TARGET} ${source})
     install(TARGETS ${TEST_TARGET} RUNTIME)
@@ -6,11 +21,14 @@ function(llama_add_test source)
     add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN})
 endfunction()
 
-# llama_add_test(test-double-float.c) # SLOW
-llama_add_test(test-quantize-fns.cpp)
-llama_add_test(test-quantize-perf.cpp)
-llama_add_test(test-sampling.cpp)
-llama_add_test(test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin)
-llama_add_test(test-tokenizer-1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin)
-llama_add_test(test-grad0.c) # SLOW
-# llama_add_test(test-opt.c) # SLOW
+# llama_build_and_test_executable(test-double-float.c) # SLOW
+llama_build_and_test_executable(test-quantize-fns.cpp)
+llama_build_and_test_executable(test-quantize-perf.cpp)
+llama_build_and_test_executable(test-sampling.cpp)
+llama_build_executable(test-tokenizer-0.cpp) # built once; its test entry is registered on the next line
+llama_test_executable(test-tokenizer-0.llama test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.bin)
+llama_build_executable(test-tokenizer-1.cpp) # built once; registered below as two tests, one per vocab file
+llama_test_executable(test-tokenizer-1.llama test-tokenizer-1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.bin)
+llama_test_executable(test-tokenizer-1.aquila test-tokenizer-1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.bin)
+llama_build_and_test_executable(test-grad0.c) # SLOW
+# llama_build_and_test_executable(test-opt.c) # SLOW
diff --git a/tests/test-tokenizer-1.cpp b/tests/test-tokenizer-1.cpp
index 3803dad8d..122e51684 100644
--- a/tests/test-tokenizer-1.cpp
+++ b/tests/test-tokenizer-1.cpp
@@ -9,6 +9,10 @@
 #include <map>
 #include <vector>
 
+static std::string vocab_type(llama_context* ctx) { // heuristic: exactly 32000 tokens -> "spm", any other count -> "bpe"
+    return std::string(llama_n_vocab(ctx) != 32000 ? "bpe" : "spm");
+}
+
 static std::string escape_whitespace(const std::string& text) {
     std::string result;
     bool escaping = false;
@@ -75,13 +79,6 @@ int main(int argc, char **argv) {
 
     const int n_vocab = llama_n_vocab(ctx);
 
-    if (n_vocab != 32000) {
-        fprintf(stderr, "%s : expected 32000 tokens, got %d\n", __func__, n_vocab);
-        llama_free_model(model);
-        llama_free(ctx);
-        return 2;
-    }
-
     for (int i = 0; i < n_vocab; ++i) {
         std::string forward = llama_token_to_str_bpe(ctx, i);
         std::vector<llama_token> tokens = llama_tokenize_bpe(ctx, forward, false);
@@ -90,16 +87,17 @@ int main(int argc, char **argv) {
                 std::string backward = llama_token_to_str(ctx, tokens[0]);
                 fprintf(stderr, "%s : error: token %d is string %s but bpe returns token %d %s\n", 
                     __func__, i, llama_token_to_str(ctx, i).c_str(), tokens[0], backward.c_str());
-                return 3;
+                return 2;
             }
         } else {
-            if (i <= 258) {
+            if ((vocab_type(ctx) == "spm" && i <= 258) || 
+                (vocab_type(ctx) == "bpe" && (i == 0 || i >= 100000))) {
                 fprintf(stderr, "%s : info: token %d is string %s and bpe returns tokens %s\n", 
                     __func__, i, llama_token_to_str(ctx, i).c_str(), unescape_whitespace(ctx, tokens).c_str());
             } else {
                 fprintf(stderr, "%s : error: token %d is string %s but bpe returns tokens %s\n", 
                     __func__, i, llama_token_to_str(ctx, i).c_str(), unescape_whitespace(ctx, tokens).c_str());
-                return 3;
+                return 2;
             }
         }
     }