From ea9f35f082c3ff377055af462a4f5e94866240e7 Mon Sep 17 00:00:00 2001
From: wonjun Jang
Date: Sun, 15 Oct 2023 09:42:03 +0000
Subject: [PATCH] add bytes_to_unicode function

---
 convert.py | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/convert.py b/convert.py
index 34f380cd7..98ea6d9c8 100755
--- a/convert.py
+++ b/convert.py
@@ -301,6 +301,27 @@ class Params:
 #
 # vocab
 #
 
+def bytes_to_unicode():
+    # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
+    """
+    Returns a list of utf-8 bytes and a corresponding list of unicode strings.
+    The reversible bpe codes work on unicode strings.
+    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
+    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
+    This is a significant percentage of your normal, say, 32K bpe vocab.
+    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
+    The lookup tables also avoid mapping to whitespace/control characters that the bpe code barfs on.
+    """
+    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
+    cs = bs[:]
+    n = 0
+    for b in range(2**8):
+        if b not in bs:
+            bs.append(b)
+            cs.append(2**8+n)
+            n += 1
+    return dict(zip(bs, (chr(n) for n in cs)))
+
 class BpeVocab:
     def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
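
A quick way to sanity-check the added function is to build the inverse of the returned mapping and round-trip arbitrary UTF-8 bytes through it. The sketch below is illustrative only and not part of the patch: it assumes the patched convert.py is importable from the repository root, and the byte_decoder name and sample string are made up for the example.

    from convert import bytes_to_unicode  # assumes the patched convert.py is on the import path

    byte_encoder = bytes_to_unicode()                        # byte value (int) -> printable unicode char
    byte_decoder = {c: b for b, c in byte_encoder.items()}   # inverse mapping: char -> byte value

    assert len(byte_encoder) == 256      # every possible byte value gets a distinct character

    text = "héllo 🦙"
    encoded = "".join(byte_encoder[b] for b in text.encode("utf-8"))
    decoded = bytes(byte_decoder[c] for c in encoded).decode("utf-8")
    assert decoded == text               # the mapping is lossless, so the round trip recovers the input

The point of the table is that every byte maps to a printable, non-whitespace character, so BPE merges can operate on plain unicode strings while still being reversible back to raw bytes.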