Minor + style
This commit is contained in:
parent
9bc5d83502
commit
a9d8329d45
3 changed files with 22 additions and 17 deletions
|
@ -92,7 +92,12 @@ for codepoint, norm in table_nfd:
|
||||||
|
|
||||||
# Generate 'unicode-data.cpp'
|
# Generate 'unicode-data.cpp'
|
||||||
|
|
||||||
print("""\
|
|
||||||
|
def out(line=""):
|
||||||
|
    print(line, end='\n') # noqa
|
||||||
|
|
||||||
|
|
||||||
|
out("""\
|
||||||
// generated with scripts/gen-unicode-data.py
|
// generated with scripts/gen-unicode-data.py
|
||||||
|
|
||||||
#include "unicode-data.h"
|
#include "unicode-data.h"
|
||||||
|
@ -103,27 +108,27 @@ print("""\
|
||||||
#include <unordered_set>
|
#include <unordered_set>
|
||||||
""")
|
""")
|
||||||
|
|
||||||
print("const std::vector<std::pair<uint32_t, uint16_t>> unicode_ranges_flags = { // start, flags // last=next_start-1")
|
out("const std::vector<std::pair<uint32_t, uint16_t>> unicode_ranges_flags = { // start, flags // last=next_start-1")
|
||||||
for codepoint, flags in ranges_flags:
|
for codepoint, flags in ranges_flags:
|
||||||
    flags = int.from_bytes(bytes(flags), "little")
|
    flags = int.from_bytes(bytes(flags), "little")
|
||||||
    print("{0x%06X, 0x%04X}," % (codepoint, flags))
|
    out("{0x%06X, 0x%04X}," % (codepoint, flags))
|
||||||
print("};\n")
|
out("};\n")
|
||||||
|
|
||||||
print("const std::unordered_set<uint32_t> unicode_set_whitespace = {")
|
out("const std::unordered_set<uint32_t> unicode_set_whitespace = {")
|
||||||
print(", ".join("0x%06X" % cpt for cpt in table_whitespace))
|
out(", ".join("0x%06X" % cpt for cpt in table_whitespace))
|
||||||
print("};\n")
|
out("};\n")
|
||||||
|
|
||||||
print("const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase = {")
|
out("const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase = {")
|
||||||
for tuple in table_lowercase:
|
for tuple in table_lowercase:
|
||||||
    print("{0x%06X, 0x%06X}," % tuple)
|
    out("{0x%06X, 0x%06X}," % tuple)
|
||||||
print("};\n")
|
out("};\n")
|
||||||
|
|
||||||
print("const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase = {")
|
out("const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase = {")
|
||||||
for tuple in table_uppercase:
|
for tuple in table_uppercase:
|
||||||
    print("{0x%06X, 0x%06X}," % tuple)
|
    out("{0x%06X, 0x%06X}," % tuple)
|
||||||
print("};\n")
|
out("};\n")
|
||||||
|
|
||||||
print("const std::vector<std::tuple<uint32_t, uint32_t, uint32_t>> unicode_ranges_nfd = { // start, last, nfd")
|
out("const std::vector<std::tuple<uint32_t, uint32_t, uint32_t>> unicode_ranges_nfd = { // start, last, nfd")
|
||||||
for triple in ranges_nfd:
|
for triple in ranges_nfd:
|
||||||
    print("{0x%06X, 0x%06X, 0x%06X}," % triple)
|
    out("{0x%06X, 0x%06X, 0x%06X}," % triple)
|
||||||
print("};\n")
|
out("};\n")
|
||||||
|
|
|
@ -6,7 +6,6 @@
|
||||||
# python3 tests/test-tokenizer-random.py ./models/ggml-vocab-llama-bpe.gguf ./models/tokenizers/llama-bpe
|
# python3 tests/test-tokenizer-random.py ./models/ggml-vocab-llama-bpe.gguf ./models/tokenizers/llama-bpe
|
||||||
#
|
#
|
||||||
|
|
||||||
import os
|
|
||||||
import time
|
import time
|
||||||
import logging
|
import logging
|
||||||
import argparse
|
import argparse
|
||||||
|
|
|
@ -12,6 +12,7 @@
|
||||||
#include <unordered_set>
|
#include <unordered_set>
|
||||||
#include <utility>
|
#include <utility>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
#include <array>
|
||||||
#include <locale>
|
#include <locale>
|
||||||
#include <codecvt>
|
#include <codecvt>
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue