Minor + style

This commit is contained in:
jaime-m-p 2024-05-15 15:05:40 +02:00
parent 9bc5d83502
commit a9d8329d45
3 changed files with 22 additions and 17 deletions

View file

@ -92,7 +92,12 @@ for codepoint, norm in table_nfd:
# Generate 'unicode-data.cpp' # Generate 'unicode-data.cpp'
print("""\
def out(line=""):
print(line, end='\n') # noqa
out("""\
// generated with scripts/gen-unicode-data.py // generated with scripts/gen-unicode-data.py
#include "unicode-data.h" #include "unicode-data.h"
@ -103,27 +108,27 @@ print("""\
#include <unordered_set> #include <unordered_set>
""") """)
print("const std::vector<std::pair<uint32_t, uint16_t>> unicode_ranges_flags = { // start, flags // last=next_start-1") out("const std::vector<std::pair<uint32_t, uint16_t>> unicode_ranges_flags = { // start, flags // last=next_start-1")
for codepoint, flags in ranges_flags: for codepoint, flags in ranges_flags:
flags = int.from_bytes(bytes(flags), "little") flags = int.from_bytes(bytes(flags), "little")
print("{0x%06X, 0x%04X}," % (codepoint, flags)) out("{0x%06X, 0x%04X}," % (codepoint, flags))
print("};\n") out("};\n")
print("const std::unordered_set<uint32_t> unicode_set_whitespace = {") out("const std::unordered_set<uint32_t> unicode_set_whitespace = {")
print(", ".join("0x%06X" % cpt for cpt in table_whitespace)) out(", ".join("0x%06X" % cpt for cpt in table_whitespace))
print("};\n") out("};\n")
print("const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase = {") out("const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase = {")
for tuple in table_lowercase: for tuple in table_lowercase:
print("{0x%06X, 0x%06X}," % tuple) out("{0x%06X, 0x%06X}," % tuple)
print("};\n") out("};\n")
print("const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase = {") out("const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase = {")
for tuple in table_uppercase: for tuple in table_uppercase:
print("{0x%06X, 0x%06X}," % tuple) out("{0x%06X, 0x%06X}," % tuple)
print("};\n") out("};\n")
print("const std::vector<std::tuple<uint32_t, uint32_t, uint32_t>> unicode_ranges_nfd = { // start, last, nfd") out("const std::vector<std::tuple<uint32_t, uint32_t, uint32_t>> unicode_ranges_nfd = { // start, last, nfd")
for triple in ranges_nfd: for triple in ranges_nfd:
print("{0x%06X, 0x%06X, 0x%06X}," % triple) out("{0x%06X, 0x%06X, 0x%06X}," % triple)
print("};\n") out("};\n")

View file

@ -6,7 +6,6 @@
# python3 tests/test-tokenizer-random.py ./models/ggml-vocab-llama-bpe.gguf ./models/tokenizers/llama-bpe # python3 tests/test-tokenizer-random.py ./models/ggml-vocab-llama-bpe.gguf ./models/tokenizers/llama-bpe
# #
import os
import time import time
import logging import logging
import argparse import argparse

View file

@ -12,6 +12,7 @@
#include <unordered_set> #include <unordered_set>
#include <utility> #include <utility>
#include <vector> #include <vector>
#include <array>
#include <locale> #include <locale>
#include <codecvt> #include <codecvt>