From a9d8329d45afbb513e0dc7cb6f0cbba19dbdcb19 Mon Sep 17 00:00:00 2001
From: jaime-m-p <>
Date: Wed, 15 May 2024 15:05:40 +0200
Subject: [PATCH] Minor + style

---
 scripts/gen-unicode-data.py    | 37 +++++++++++++++++++---------------
 tests/test-tokenizer-random.py |  1 -
 unicode.cpp                    |  1 +
 3 files changed, 22 insertions(+), 17 deletions(-)

diff --git a/scripts/gen-unicode-data.py b/scripts/gen-unicode-data.py
index 587bc5e02..06fab5372 100644
--- a/scripts/gen-unicode-data.py
+++ b/scripts/gen-unicode-data.py
@@ -92,7 +92,12 @@ for codepoint, norm in table_nfd:
 
 
 # Generate 'unicode-data.cpp'
-print("""\
+
+def out(line=""):
+    print(line, end='\n')  # noqa
+
+
+out("""\
 // generated with scripts/gen-unicode-data.py
 
 #include "unicode-data.h"
@@ -103,27 +108,27 @@ print("""\
 #include 
 """)
 
-print("const std::vector<std::pair<uint32_t, uint16_t>> unicode_ranges_flags = { // start, flags // last=next_start-1")
+out("const std::vector<std::pair<uint32_t, uint16_t>> unicode_ranges_flags = { // start, flags // last=next_start-1")
 for codepoint, flags in ranges_flags:
     flags = int.from_bytes(bytes(flags), "little")
-    print("{0x%06X, 0x%04X}," % (codepoint, flags))
-print("};\n")
+    out("{0x%06X, 0x%04X}," % (codepoint, flags))
+out("};\n")
 
-print("const std::unordered_set<uint32_t> unicode_set_whitespace = {")
-print(", ".join("0x%06X" % cpt for cpt in table_whitespace))
-print("};\n")
+out("const std::unordered_set<uint32_t> unicode_set_whitespace = {")
+out(", ".join("0x%06X" % cpt for cpt in table_whitespace))
+out("};\n")
 
-print("const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase = {")
+out("const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase = {")
 for tuple in table_lowercase:
-    print("{0x%06X, 0x%06X}," % tuple)
-print("};\n")
+    out("{0x%06X, 0x%06X}," % tuple)
+out("};\n")
 
-print("const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase = {")
+out("const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase = {")
 for tuple in table_uppercase:
-    print("{0x%06X, 0x%06X}," % tuple)
-print("};\n")
+    out("{0x%06X, 0x%06X}," % tuple)
+out("};\n")
 
-print("const std::vector<std::tuple<uint32_t, uint32_t, uint32_t>> unicode_ranges_nfd = { // start, last, nfd")
+out("const std::vector<std::tuple<uint32_t, uint32_t, uint32_t>> unicode_ranges_nfd = { // start, last, nfd")
 for triple in ranges_nfd:
-    print("{0x%06X, 0x%06X, 0x%06X}," % triple)
-print("};\n")
+    out("{0x%06X, 0x%06X, 0x%06X}," % triple)
+out("};\n")
diff --git a/tests/test-tokenizer-random.py b/tests/test-tokenizer-random.py
index 0ef15d327..d5a6f185f 100644
--- a/tests/test-tokenizer-random.py
+++ b/tests/test-tokenizer-random.py
@@ -6,7 +6,6 @@
 # python3 tests/test-tokenizer-random.py ./models/ggml-vocab-llama-bpe.gguf ./models/tokenizers/llama-bpe
 #
 
-import os
 import time
 import logging
 import argparse
diff --git a/unicode.cpp b/unicode.cpp
index 8e42ed4a4..26e234031 100644
--- a/unicode.cpp
+++ b/unicode.cpp
@@ -12,6 +12,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 