diff --git a/convert.py b/convert.py
index 250659248..4748e262b 100644
--- a/convert.py
+++ b/convert.py
@@ -142,7 +142,7 @@ def find_n_mult(n_ff: int, n_embd: int) -> int:
 @dataclass
 class Params:
     n_vocab:   int
-    n_vocab_sp:int
+    n_vocab_base: int
     n_embd:    int
     n_mult:    int
     n_head:    int
@@ -170,7 +170,7 @@ class Params:
 
         return Params(
             n_vocab   = n_vocab,
-            n_vocab_sp= n_vocab,
+            n_vocab_base=n_vocab,
             n_embd    = n_embd,
             n_mult    = 256,
             n_head    = n_head,
@@ -193,7 +193,7 @@ class Params:
 
         return Params(
             n_vocab   = n_vocab,
-            n_vocab_sp= n_vocab,
+            n_vocab_base=n_vocab,
             n_embd    = n_embd,
             n_mult    = n_mult,
             n_head    = n_head,
@@ -218,7 +218,7 @@ class Params:
 
         return Params(
             n_vocab   = n_vocab,
-            n_vocab_sp= n_vocab
+            n_vocab_base=n_vocab,
             n_embd    = n_embd,
             n_mult    = n_mult,
             n_head    = n_head,
@@ -283,7 +283,7 @@ class SentencePieceVocab:
         else:
             tokenizer_config = {}
         for key, value in tokenizer_config.items():
-            if not isinstance(value, dict) or not isinstance(value, str):
+            if not isinstance(value, dict) and not isinstance(value, str):
                 continue
             token_id = TOKEN_NAME_TO_ID.get(key, -1)
             if token_id == -1:
@@ -296,15 +296,13 @@ class SentencePieceVocab:
         else:
             special_tokens = {}
         for key, value in special_tokens.items():
-            if not isinstance(value, dict) or not isinstance(value, str):
+            if not isinstance(value, dict) and not isinstance(value, str):
                 continue
             token_id = TOKEN_NAME_TO_ID.get(key, -1)
             if token_id == -1 or token_id in self.special_tokens_map:
                 continue
             self.special_tokens_map[token_id] = value["content"] if isinstance(value, dict) else value
 
-        self.vocab_special_size: int = len(self.added_tokens_list) + len(self.special_tokens_map)
-
     def sentencepiece_tokens(self) -> Iterable[Tuple[bytes, float]]:
         tokenizer = self.sentencepiece_tokenizer
         if self.vocabtype == "bpe":
@@ -361,7 +359,7 @@ class GGMLVocab:
         self.tokens = tokens
         self.special_tokens = []
         self.vocab_size = len(tokens)
-        self.vocab_special_size = 0
+        self.vocab_size_base = 0
 
     def all_tokens(self) -> Iterable[Tuple[bytes, float]]:
         return self.tokens
@@ -1120,17 +1118,21 @@ class OutputFile:
     def write_file_header(self, params: Params, file_type: GGMLFileType) -> None:
+        """Write the reversed ggjt magic, then the header fields packed as unsigned ints."""
         self.fout.write(b"ggjt"[::-1])  # magic
         values = [
-            4,  # file version
+            1,  # file version
             params.n_vocab,
-            params.n_vocab_sp,
             params.n_embd,
             params.n_mult,
             params.n_head,
             params.n_layer,
-            params.n_embd // params.n_head,  # rot (obsolete)
+            # Reuse the obsolete rot slot to store vocab_base. The 0xF top nibble tags
+            # the value so the loader can tell it apart from a plain rot value.
+            params.n_vocab_base | 0xF0000000,
             file_type.value,
         ]
-        self.fout.write(struct.pack("i" * len(values), *values))
+        # Unsigned "I" is required: the tagged value is >= 0xF0000000, which a signed
+        # "i" field cannot hold (struct.pack would raise struct.error).
+        self.fout.write(struct.pack("I" * len(values), *values))
 
     def write_tensor_header(self, name: str, shape: Sequence[int], data_type: DataType) -> None:
         sname = name.encode('utf-8')
@@ -1144,13 +1146,11 @@ class OutputFile:
             self.fout.write(struct.pack("i", len(text)))
             self.fout.write(text)
             self.fout.write(struct.pack("f", score))
-        for token_id in vocab.all_special_tokens():
-            self.fout.write(struct.pack("i", token_id))
 
     @staticmethod
     def write_vocab_only(fname_out: Path, vocab: Vocab) -> None:
         of = OutputFile(fname_out)
-        params = Params(n_vocab=vocab.vocab_size, n_vocab_sp=vocab.vocab_special_size, n_embd=0, n_mult=0,
+        params = Params(n_vocab=vocab.vocab_size, n_vocab_base=vocab.vocab_size_base, n_embd=0, n_mult=0,
                         n_head=1, n_layer=0)
         of = OutputFile(fname_out)
         of.write_file_header(params, file_type=GGMLFileType.AllF32)
@@ -1373,7 +1373,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
             vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
             vocab = load_vocab(vocab_dir, args.vocabtype)
         params = Params.load(model_plus)
-        params.n_vocab_sp = vocab.vocab_special_size
+        params.n_vocab_base = vocab.vocab_size_base
         model = model_plus.model
         model = do_necessary_conversions(model, params)
         output_type = pick_output_type(model, args.outtype)
diff --git a/llama.cpp b/llama.cpp
index af12931e0..8bbe51009 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -181,14 +181,13 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
 // default hparams (LLaMA 7B)
 struct llama_hparams {
     uint32_t n_vocab   = 32000;
-    uint32_t n_vocab_sp = 0;
+    uint32_t n_vocab_base = 32000;
     uint32_t n_ctx     = 512;   // this is provided as user input?
     uint32_t n_embd    = 4096;
     uint32_t n_mult    = 256;
     uint32_t n_head    = 32;
     uint32_t n_head_kv = 32;
     uint32_t n_layer   = 32;
-    uint32_t n_rot     = 64;
 
     // LLaMAv2
     // TODO: load from model data hparams
@@ -499,7 +498,6 @@ enum llama_file_version {
     LLAMA_FILE_VERSION_GGJT_V1, // added padding
     LLAMA_FILE_VERSION_GGJT_V2, // changed quantization format
     LLAMA_FILE_VERSION_GGJT_V3, // changed Q4 and Q8 quantization format
-    LLAMA_FILE_VERSION_GGJT_V4, // improved support for added/special tokens
 };
 
 struct llama_file_loader {
@@ -515,6 +513,7 @@ struct llama_file_loader {
         read_hparams();
         read_vocab();
         read_tensor_metadata(tensors_map);
+        set_vocab_sp();
     }
     void read_magic() {
         uint32_t magic = file.read_u32();
@@ -537,7 +536,6 @@ struct llama_file_loader {
                     case 1: file_version = LLAMA_FILE_VERSION_GGJT_V1; return;
                     case 2: file_version = LLAMA_FILE_VERSION_GGJT_V2; return;
                     case 3: file_version = LLAMA_FILE_VERSION_GGJT_V3; return;
-                    case 4: file_version = LLAMA_FILE_VERSION_GGJT_V4; return;
                 }
         }
 
@@ -546,18 +544,18 @@ struct llama_file_loader {
     }
     void read_hparams() {
         hparams.n_vocab = file.read_u32();
-        hparams.n_vocab_sp = file_version >= LLAMA_FILE_VERSION_GGJT_V4 ? file.read_u32() : 0;
         hparams.n_embd  = file.read_u32();
         hparams.n_mult  = file.read_u32();
         hparams.n_head  = file.read_u32();
         hparams.n_layer = file.read_u32();
-        hparams.n_rot   = file.read_u32();
+        const uint32_t rot_or_base = file.read_u32(); // former n_rot slot; writers in this change store (n_vocab_base | 0xF0000000) here
+        hparams.n_vocab_base = (rot_or_base & 0xF0000000) == 0 ? hparams.n_vocab : (rot_or_base & ~0xF0000000); // untagged value => older model, so fall back to n_vocab; otherwise strip the tag
         hparams.ftype   = (enum llama_ftype) file.read_u32();
 
         // LLaMAv2
         // TODO: read from header
         hparams.n_head_kv = hparams.n_head;
     }
     void read_vocab() {
         vocab.id_to_token.resize(hparams.n_vocab);
 
@@ -574,20 +572,6 @@ struct llama_file_loader {
             tok_score.tok = std::move(word);
             tok_score.score = score;
         }
-
-        vocab.special_token_to_id.reserve(hparams.n_vocab_sp);
-
-        for (uint32_t i = 0; i < hparams.n_vocab_sp; i++) {
-            llama_vocab::id token_id = file.read_u32();
-            const auto & word = vocab.id_to_token[token_id].tok;
-
-            vocab.special_token_trie.add(word);
-            vocab.special_token_to_id[word] = token_id;
-
-            if (vocab.max_special_token_length < word.size()) {
-                vocab.max_special_token_length = word.size();
-            }
-        }
     }
     void read_tensor_metadata(llama_load_tensors_map & tensors_map) {
         while (file.tell() < file.size) {
@@ -634,6 +618,24 @@ struct llama_file_loader {
             tensors_map.name_to_idx[name] = tensors_map.tensors.size() - 1;
         }
     }
+    // Registers ids 0..2 and every id in [n_vocab_base, n_vocab) as special tokens.
+    void set_vocab_sp() {
+        const uint32_t n_base   = hparams.n_vocab_base < hparams.n_vocab ? hparams.n_vocab_base : hparams.n_vocab; // clamp so the subtraction below cannot wrap
+        const uint32_t vocab_sp = 3 + (hparams.n_vocab - n_base);
+        vocab.special_token_to_id.reserve(vocab_sp);
+        for (uint32_t i = 0; i < vocab_sp; i++) {
+            const llama_vocab::id token_id = i < 3 ? i : n_base + (i - 3); // slots 0..2 are ids 0..2; slot i >= 3 is the (i - 3)-th id past the base vocab
+            if ((uint32_t) token_id >= hparams.n_vocab || vocab.id_to_token[token_id].tok.empty()) {
+                continue; // id outside the loaded vocab, or a token with no text
+            }
+            const auto & word = vocab.id_to_token[token_id].tok;
+            vocab.special_token_trie.add(word);
+            vocab.special_token_to_id[word] = token_id;
+            if (vocab.max_special_token_length < word.size()) {
+                vocab.max_special_token_length = word.size();
+            }
+        }
+    }
 };
 
 struct llama_file_saver {
@@ -653,12 +655,11 @@ struct llama_file_saver {
     void write_hparams(enum llama_ftype new_ftype) {
         const llama_hparams & hparams = any_file_loader->hparams;
         file.write_u32(hparams.n_vocab);
-        file.write_u32(hparams.n_vocab_sp);
         file.write_u32(hparams.n_embd);
         file.write_u32(hparams.n_mult);
         file.write_u32(hparams.n_head);
         file.write_u32(hparams.n_layer);
-        file.write_u32(hparams.n_rot);
+        file.write_u32(hparams.n_vocab_base | 0xF0000000); // written in the former n_rot slot; the 0xF top nibble tags the value so read_hparams can tell it from an older model's untagged value
         file.write_u32(new_ftype);
     }
     void write_vocab() {
@@ -672,9 +673,6 @@ struct llama_file_saver {
             file.write_raw(token_score.tok.data(), token_score.tok.size());
             file.write_raw(&token_score.score, sizeof(token_score.score));
         }
-        for (const auto & pair : any_file_loader->vocab.special_token_to_id) {
-            file.write_u32(pair.second);
-        }
     }
     void write_tensor(llama_load_tensor & tensor, enum ggml_type new_type, const void * new_data, size_t new_size) {
         switch (new_type) {
@@ -1001,8 +999,7 @@ static const char *llama_file_version_name(llama_file_version version) {
         case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)";
         case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (pre #1405)";
         case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (pre #1508)";
-        case LLAMA_FILE_VERSION_GGJT_V3: return "ggjt v3 (pre #1931)";
-        case LLAMA_FILE_VERSION_GGJT_V4: return "ggjt v4 (latest)";
+        case LLAMA_FILE_VERSION_GGJT_V3: return "ggjt v3 (latest)";
     }
 
     return "unknown";
@@ -1127,7 +1124,7 @@ static void llama_model_load_internal(
         fprintf(stderr, "%s: n_head     = %u\n",   __func__, hparams.n_head);
         fprintf(stderr, "%s: n_head_kv  = %u\n",   __func__, hparams.n_head_kv);
         fprintf(stderr, "%s: n_layer    = %u\n",   __func__, hparams.n_layer);
-        fprintf(stderr, "%s: n_rot      = %u\n",   __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim
+        fprintf(stderr, "%s: n_rot      = %u\n",   __func__, hparams.n_embd/hparams.n_head); // a.k.a. n_embd_head, n_head_dim
         fprintf(stderr, "%s: n_gqa      = %u\n",   __func__, hparams.n_gqa());
         fprintf(stderr, "%s: rnorm_eps  = %.1e\n", __func__, hparams.f_rms_norm_eps);
         fprintf(stderr, "%s: n_ff       = %u\n",   __func__, n_ff);
diff --git a/llama.h b/llama.h
index 40d0737a2..fa1977f2d 100644
--- a/llama.h
+++ b/llama.h
@@ -40,7 +40,7 @@
 #define LLAMA_FILE_MAGIC_GGML        0x67676d6cu // 'ggml'
 #define LLAMA_FILE_MAGIC_GGSN        0x6767736eu // 'ggsn'
 
-#define LLAMA_FILE_VERSION           4
+#define LLAMA_FILE_VERSION           3
 #define LLAMA_FILE_MAGIC             LLAMA_FILE_MAGIC_GGJT
 #define LLAMA_FILE_MAGIC_UNVERSIONED LLAMA_FILE_MAGIC_GGML
 #define LLAMA_SESSION_MAGIC          LLAMA_FILE_MAGIC_GGSN