remove byte_encoder

wonjun Jang 2023-10-15 09:46:48 +00:00 committed by GitHub
parent 6ec856b3ef
commit 1f16e5f234


@@ -301,28 +301,6 @@ class Params:
 #
 # vocab
 #
-def bytes_to_unicode():
-    # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
-    """
-    Returns list of utf-8 byte and a corresponding list of unicode strings.
-    The reversible bpe codes work on unicode strings.
-    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
-    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
-    This is a significant percentage of your normal, say, 32K bpe vocab.
-    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
-    And avoids mapping to whitespace/control characters the bpe code barfs on.
-    """
-    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
-    cs = bs[:]
-    n = 0
-    for b in range(2**8):
-        if b not in bs:
-            bs.append(b)
-            cs.append(2**8+n)
-            n += 1
-    return dict(zip(bs, (chr(n) for n in cs)))
 class BpeVocab:
     def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
         self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read())
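
For reference, a minimal sketch (illustration only, not part of the commit) of what the removed bytes_to_unicode helper builds; the assertions below are my own and assume the GPT-2 mapping shown in the removed lines above:

    # Every byte 0-255 gets a printable unicode character, so BPE vocab entries
    # never contain raw whitespace/control bytes (e.g. space 0x20 becomes 'Ġ').
    byte_encoder = bytes_to_unicode()
    byte_decoder = {v: k for k, v in byte_encoder.items()}
    assert len(byte_encoder) == 256
    assert byte_encoder[ord(" ")] == "Ġ"
    assert byte_decoder["Ġ"] == ord(" ")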
@@ -463,8 +441,6 @@ class HFVocab:
     def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
         tokenizer = self.tokenizer
         reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
-        byte_encoder = bytes_to_unicode()
-        byte_decoder = {v: k for k, v in byte_encoder.items()}
         for i in range(tokenizer.vocab_size):
             text = reverse_vocab[i].encode("utf-8")
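
As a toy illustration (hypothetical vocab, not from the commit) of the inversion the surviving context lines perform, a token's text is UTF-8 encoded directly once the id-to-token map is built:

    # Hypothetical vocab: token -> id, inverted to id -> token as in hf_tokens().
    vocab = {"hello": 0, "Ġworld": 1}
    reverse_vocab = {tok_id: tok for tok, tok_id in vocab.items()}
    text = reverse_vocab[1].encode("utf-8")   # b'\xc4\xa0world'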