Unicode codepoint flags for custom regexes (#7245)

* Replace CODEPOINT_TYPE_* with codepoint_flags
* Update and bugfix brute force random test
* Deterministic brute force random test
* Unicode normalization NFD
* Get rid of BOM
This commit is contained in:
jaime-m-p 2024-05-18 01:09:13 +02:00 committed by GitHub
parent 0fc1e820a9
commit b43272afa2
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 7299 additions and 2409 deletions

View file

@@ -1,17 +1,20 @@
#pragma once
#include <cstdint>
#include <map>
#include <utility>
#include <vector>
#include <unordered_map>
#include <unordered_set>
// Per-category Unicode tables, declared here and defined in another
// translation unit. Each table is a vector of (uint32_t, uint32_t) pairs.
// NOTE(review): presumably every pair is a codepoint range [first, last] that
// belongs to the category named by the variable -- confirm the bounds and
// their inclusivity against the defining source file.
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_number;
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_letter;
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_separator;
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_whitespace;
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_accent_mark;
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_punctuation;
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_symbol;
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_control;
// Ordered multimap from uint32_t to uint32_t; a key may appear more than once.
// NOTE(review): presumably codepoint -> NFD decomposition codepoint(s) -- confirm.
extern const std::multimap<uint32_t, uint32_t> unicode_map_nfd;
// Ordered map from char32_t to char32_t used for lowercasing.
// NOTE(review): the same name is declared again further down as
// std::unordered_map<uint32_t, uint32_t>. Two declarations of one variable with
// different types cannot coexist in a single translation unit, so this listing
// looks like a diff view that shows removed and added lines together --
// confirm which declaration is the current one before using this header.
extern const std::map<char32_t, char32_t> unicode_map_lowercase;
// One entry of the NFD range table (the element type of `unicode_ranges_nfd`).
//
// Plain aggregate of three 32-bit unsigned values; brace-initialize it in
// member order: {first, last, nfd}.
// NOTE(review): presumably every codepoint in [first, last] normalizes to the
// codepoint stored in `nfd` -- confirm inclusivity of `last` and the exact
// meaning of `nfd` against the code that consumes the table.
struct range_nfd {
    uint32_t first;  // start of the range
    uint32_t last;   // end of the range
    uint32_t nfd;    // NFD value associated with the range
};
// Size of the Unicode code space: 0x110000 == 0x10FFFF + 1, i.e. one past the
// largest codepoint U+10FFFF. Being `static` in a header, every translation
// unit that includes this file gets its own internal-linkage copy.
static const uint32_t MAX_CODEPOINTS = 0x110000;
// The tables below are only declared here; their definitions live in another
// translation unit.
//
// Vector of (uint32_t, uint16_t) pairs.
// NOTE(review): presumably (range start codepoint, codepoint flag bits) --
// confirm the pair semantics and any ordering requirement in the defining file.
extern const std::vector<std::pair<uint32_t, uint16_t>> unicode_ranges_flags;
// Hash set of uint32_t values; by its name, the codepoints treated as whitespace.
extern const std::unordered_set<uint32_t> unicode_set_whitespace;
// Hash maps from uint32_t to uint32_t for case conversion.
// NOTE(review): presumably codepoint -> lowercase / uppercase codepoint -- confirm.
extern const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase;
extern const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase;
// Vector of `range_nfd` entries (three uint32_t fields: first, last, nfd).
extern const std::vector<range_nfd> unicode_ranges_nfd;