lint : fix

This commit is contained in:
Georgi Gerganov 2024-05-03 21:34:18 +03:00
parent d53240ccc2
commit c30056a700
No known key found for this signature in database
GPG key ID: BF970631944C16B7

View file

@@ -1,5 +1,5 @@
import regex import regex
import struct
def cpt_to_utf8_str(cpt): def cpt_to_utf8_str(cpt):
if cpt <= 0xFF: if cpt <= 0xFF:
@@ -11,13 +11,15 @@ def cpt_to_utf8_str(cpt):
else: else:
return bytes([cpt & 0xFF, (cpt >> 8) & 0xFF, (cpt >> 16) & 0xFF, cpt >> 24]) return bytes([cpt & 0xFF, (cpt >> 8) & 0xFF, (cpt >> 16) & 0xFF, cpt >> 24])
def is_match(codepoint, regex_expr):
    """Return True if the character for ``codepoint`` matches ``regex_expr``.

    The code point is turned into bytes by ``cpt_to_utf8_str``, decoded as
    UTF-32, and the resulting string is tested with ``regex.match``.

    Args:
        codepoint: Integer code point to test.
        regex_expr: Pattern passed unchanged to ``regex.match``.

    Returns:
        True when ``regex.match`` finds a match, False otherwise. False is
        also returned when conversion, decoding or matching raises any
        ``Exception``.
    """
    try:
        res = regex.match(regex_expr, cpt_to_utf8_str(codepoint).decode('utf-32'))
        return res is not None
    except Exception:
        # Best-effort: a code point that cannot be converted, decoded or
        # matched is reported as "no match" instead of aborting the caller.
        # `Exception` (not a bare `except`) lets KeyboardInterrupt and
        # SystemExit propagate.
        return False
def get_matches(regex_expr): def get_matches(regex_expr):
unicode_ranges = [] unicode_ranges = []
current_range = None current_range = None
@@ -37,6 +39,7 @@ def get_matches(regex_expr):
return unicode_ranges return unicode_ranges
def print_cat(cat, ranges): def print_cat(cat, ranges):
print("const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_{} = {{".format(cat)) print("const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_{} = {{".format(cat))
cnt = 0 cnt = 0
@@ -53,6 +56,7 @@ def print_cat(cat, ranges):
print("};") print("};")
print("") print("")
# Print one range table per Unicode property class; the name passed first
# becomes the suffix of the emitted `unicode_ranges_<name>` identifier.
print_cat("number", get_matches(r'\p{N}'))
print_cat("letter", get_matches(r'\p{L}'))
print_cat("whitespace", get_matches(r'\p{Z}'))
@@ -60,4 +64,3 @@ print_cat("accent_mark", get_matches(r'\p{M}'))
# Print the range tables for the punctuation, symbol and control property
# classes, in that order.
print_cat("punctuation", get_matches(r'\p{P}'))
print_cat("symbol", get_matches(r'\p{S}'))
print_cat("control", get_matches(r'\p{C}'))