From ea9f35f082c3ff377055af462a4f5e94866240e7 Mon Sep 17 00:00:00 2001
From: wonjun Jang
Date: Sun, 15 Oct 2023 09:42:03 +0000
Subject: [PATCH] add bytes_to_unicode function

---
 convert.py | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/convert.py b/convert.py
index 34f380cd7..98ea6d9c8 100755
--- a/convert.py
+++ b/convert.py
@@ -301,6 +301,27 @@ class Params:
 #
 # vocab
 #
 
+def bytes_to_unicode():
+    # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
+    """
+    Returns a list of utf-8 bytes and a corresponding list of unicode strings.
+    The reversible bpe codes work on unicode strings.
+    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
+    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
+    This is a significant percentage of your normal, say, 32K bpe vocab.
+    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
+    The lookup tables also avoid mapping to whitespace/control characters that the bpe code barfs on.
+    """
+    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
+    cs = bs[:]
+    n = 0
+    for b in range(2**8):
+        if b not in bs:
+            bs.append(b)
+            cs.append(2**8+n)
+            n += 1
+    return dict(zip(bs, (chr(n) for n in cs)))
+
 class BpeVocab:
     def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
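
A quick way to sanity-check the added function is to build the inverse of the returned mapping and round-trip arbitrary UTF-8 bytes through it. The sketch below is illustrative only and not part of the patch: it assumes the patched convert.py is importable from the repository root, and the byte_decoder name and sample string are made up for the example.

    from convert import bytes_to_unicode  # assumes the patched convert.py is on the import path

    byte_encoder = bytes_to_unicode()                        # byte value (int) -> printable unicode char
    byte_decoder = {c: b for b, c in byte_encoder.items()}   # inverse mapping: char -> byte value

    assert len(byte_encoder) == 256      # every possible byte value gets a distinct character

    text = "héllo 🦙"
    encoded = "".join(byte_encoder[b] for b in text.encode("utf-8"))
    decoded = bytes(byte_decoder[c] for c in encoded).decode("utf-8")
    assert decoded == text               # the mapping is lossless, so the round trip recovers the input

The point of the table is that every byte maps to a printable, non-whitespace character, so BPE merges can operate on plain unicode strings while still being reversible back to raw bytes.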