From c30056a700d7e54ebd4d6ad370e08070dc6b3ccc Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Fri, 3 May 2024 21:34:18 +0300
Subject: [PATCH] lint : fix

---
 scripts/gen-unicode-data.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/scripts/gen-unicode-data.py b/scripts/gen-unicode-data.py
index ee6de3f08..d49cbf2a0 100644
--- a/scripts/gen-unicode-data.py
+++ b/scripts/gen-unicode-data.py
@@ -1,5 +1,5 @@
 import regex
-import struct
+
 
 def cpt_to_utf8_str(cpt):
     if cpt <= 0xFF:
@@ -11,13 +11,15 @@ def cpt_to_utf8_str(cpt):
     else:
         return bytes([cpt & 0xFF, (cpt >> 8) & 0xFF, (cpt >> 16) & 0xFF, cpt >> 24])
 
+
 def is_match(codepoint, regex_expr):
     try:
         res = regex.match(regex_expr, cpt_to_utf8_str(codepoint).decode('utf-32'))
         return res is not None
-    except:
+    except Exception:
         return False
 
+
 def get_matches(regex_expr):
     unicode_ranges = []
     current_range = None
@@ -37,6 +39,7 @@ def get_matches(regex_expr):
 
     return unicode_ranges
 
+
 def print_cat(cat, ranges):
     print("const std::vector> unicode_ranges_{} = {{".format(cat))
     cnt = 0
@@ -53,6 +56,7 @@ def print_cat(cat, ranges):
     print("};")
     print("")
 
+
 print_cat("number", get_matches(r'\p{N}'))
 print_cat("letter", get_matches(r'\p{L}'))
 print_cat("whitespace", get_matches(r'\p{Z}'))
@@ -60,4 +64,3 @@ print_cat("accent_mark", get_matches(r'\p{M}'))
 print_cat("punctuation", get_matches(r'\p{P}'))
 print_cat("symbol", get_matches(r'\p{S}'))
 print_cat("control", get_matches(r'\p{C}'))
-