refactor: Revise check_vocab_size for Enhanced Clarity and Correctness

- Resolved an unreachable branch by reorganizing the conditional structure.
- Moved the special-case check for `params.n_vocab == -1` to the top so that it raises a `ValueError` immediately.
- Flattened the conditional logic for improved clarity and predictability of the function's behavior.

These changes improve the readability and functional correctness of the `check_vocab_size` function without altering its intended functionality.
2024-01-09 13:30:35 -05:00 · 2024-01-09 13:30:35 -05:00 · 787860ada2
commit 787860ada2
parent dd1c1004f8
1 changed file with 26 additions and 27 deletions
--- a/convert.py
+++ b/convert.py
@@ -988,35 +988,34 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc


 def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> None:
-    if params.n_vocab != vocab.vocab_size:
-        if params.n_vocab == vocab.vocab_size:
-            print(
-                "Ignoring added_tokens.json since model matches vocab size without it."
-            )
-            return
-        if pad_vocab and params.n_vocab > vocab.vocab_size:
-            pad_count = params.n_vocab - vocab.vocab_size
-            print(
-                f"Padding vocab with {pad_count} token(s) - <dummy00001> through <dummy{pad_count:05}>"
-            )
-            for i in range(1, (params.n_vocab - vocab.vocab_size) + 1):
-                vocab.added_tokens_dict[f"<dummy{i:05}>"] = -1
-            vocab.vocab_size = params.n_vocab
-            return
-        msg = f"Vocab size mismatch (model has {params.n_vocab}, but {vocab.fname_tokenizer}"
-        msg += f" has {vocab.vocab_size})."
-        if vocab.vocab_size < params.n_vocab < vocab.vocab_size + 20:
-            msg += f"  Most likely you are missing added_tokens.json (should be in {vocab.fname_tokenizer.parent})."
-        if vocab.vocab_size < params.n_vocab:
-            msg += " Add the --pad-vocab option and try again."
+    # Handle special case where the model's vocab size is not set
+    if params.n_vocab == -1:
+        raise ValueError(
+            "The model's vocab size is set to -1 in params.json. Please update it manually."
+        )

-        # Check if params.n_vocab is -1 and issue a warning
-        if params.n_vocab == -1:
-            warnings.warn(
-                "WARNING: The model's vocab size is set to -1 in params.json. Please update it manually."
-            )
+    # Check for a vocab size mismatch
+    if params.n_vocab == vocab.vocab_size:
+        print("Ignoring added_tokens.json since model matches vocab size without it.")
+        return

-        raise Exception(msg)
+    if pad_vocab and params.n_vocab > vocab.vocab_size:
+        pad_count = params.n_vocab - vocab.vocab_size
+        print(
+            f"Padding vocab with {pad_count} token(s) - <dummy00001> through <dummy{pad_count:05}>"
+        )
+        for i in range(1, pad_count + 1):
+            vocab.added_tokens_dict[f"<dummy{i:05}>"] = -1
+        vocab.vocab_size = params.n_vocab
+        return
+
+    msg = f"Vocab size mismatch (model has {params.n_vocab}, but {vocab.fname_tokenizer} has {vocab.vocab_size})."
+    if vocab.vocab_size < params.n_vocab < vocab.vocab_size + 20:
+        msg += f"  Most likely you are missing added_tokens.json (should be in {vocab.fname_tokenizer.parent})."
+    if vocab.vocab_size < params.n_vocab:
+        msg += " Add the --pad-vocab option and try again."
+
+    raise Exception(msg)


 class OutputFile: