mirror of
https://github.com/jart/cosmopolitan.git
synced 2025-02-12 09:17:53 +00:00
- 10.5% reduction of o//depend dependency graph - 8.8% reduction in latency of make command - Fix issue with temporary file cleanup There's a new -w option in compile.com that turns off the recent Landlock output path workaround for "good commands" which do not unlink() the output file like GNU tooling does. Our new GNU Make unveil sandboxing appears to have zero overhead in the grand scheme of things. Full builds are pretty fast since the only thing that's actually slowed us down is probably libcxx make -j16 MODE=rel RL: took 85,732,063µs wall time RL: ballooned to 323,612kb in size RL: needed 828,560,521µs cpu (11% kernel) RL: caused 39,080,670 page faults (99% memcpy) RL: 350,073 context switches (72% consensual) RL: performed 0 reads and 11,494,960 write i/o operations pledge() and unveil() no longer consider ENOSYS to be an error. These functions have also been added to Python's cosmo module. This change also removes some WIN32 APIs and System Five magnums which we're not using and it's doubtful anyone else would be too
1565 lines
46 KiB
C
1565 lines
46 KiB
C
/*
|
|
* Unicode utilities
|
|
*
|
|
* Copyright (c) 2017-2018 Fabrice Bellard
|
|
*
|
|
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
* of this software and associated documentation files (the "Software"), to deal
|
|
* in the Software without restriction, including without limitation the rights
|
|
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
* copies of the Software, and to permit persons to whom the Software is
|
|
* furnished to do so, subject to the following conditions:
|
|
*
|
|
* The above copyright notice and this permission notice shall be included in
|
|
* all copies or substantial portions of the Software.
|
|
*
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
* THE SOFTWARE.
|
|
*/
|
|
#include "libc/assert.h"
|
|
#include "libc/limits.h"
|
|
#include "libc/mem/mem.h"
|
|
#include "libc/runtime/runtime.h"
|
|
#include "libc/stdio/stdio.h"
|
|
#include "libc/str/str.h"
|
|
#include "third_party/quickjs/cutils.h"
|
|
#include "third_party/quickjs/libunicode.h"
|
|
|
|
asm(".ident\t\"\\n\\n\
|
|
QuickJS (MIT License)\\n\
|
|
Copyright (c) 2017-2021 Fabrice Bellard\\n\
|
|
Copyright (c) 2017-2021 Charlie Gordon\"");
|
|
asm(".include \"libc/disclaimer.inc\"");
|
|
|
|
/* clang-format off */
|
|
|
|
#include "third_party/quickjs/libunicode-table.inc"
|
|
|
|
enum {
|
|
RUN_TYPE_U,
|
|
RUN_TYPE_L,
|
|
RUN_TYPE_UF,
|
|
RUN_TYPE_LF,
|
|
RUN_TYPE_UL,
|
|
RUN_TYPE_LSU,
|
|
RUN_TYPE_U2L_399_EXT2,
|
|
RUN_TYPE_UF_D20,
|
|
RUN_TYPE_UF_D1_EXT,
|
|
RUN_TYPE_U_EXT,
|
|
RUN_TYPE_LF_EXT,
|
|
RUN_TYPE_U_EXT2,
|
|
RUN_TYPE_L_EXT2,
|
|
RUN_TYPE_U_EXT3,
|
|
};
|
|
|
|
/* conv_type:
|
|
0 = to upper
|
|
1 = to lower
|
|
2 = case folding (= to lower with modifications)
|
|
*/
|
|
int lre_case_conv(uint32_t *res, uint32_t c, int conv_type)
|
|
{
|
|
if (c < 128) {
|
|
if (conv_type) {
|
|
if (c >= 'A' && c <= 'Z') {
|
|
c = c - 'A' + 'a';
|
|
}
|
|
} else {
|
|
if (c >= 'a' && c <= 'z') {
|
|
c = c - 'a' + 'A';
|
|
}
|
|
}
|
|
} else {
|
|
uint32_t v, code, data, type, len, a, is_lower;
|
|
int idx, idx_min, idx_max;
|
|
|
|
is_lower = (conv_type != 0);
|
|
idx_min = 0;
|
|
idx_max = countof(case_conv_table1) - 1;
|
|
while (idx_min <= idx_max) {
|
|
idx = (unsigned)(idx_max + idx_min) / 2;
|
|
v = case_conv_table1[idx];
|
|
code = v >> (32 - 17);
|
|
len = (v >> (32 - 17 - 7)) & 0x7f;
|
|
if (c < code) {
|
|
idx_max = idx - 1;
|
|
} else if (c >= code + len) {
|
|
idx_min = idx + 1;
|
|
} else {
|
|
type = (v >> (32 - 17 - 7 - 4)) & 0xf;
|
|
data = ((v & 0xf) << 8) | case_conv_table2[idx];
|
|
switch(type) {
|
|
case RUN_TYPE_U:
|
|
case RUN_TYPE_L:
|
|
case RUN_TYPE_UF:
|
|
case RUN_TYPE_LF:
|
|
if (conv_type == (type & 1) ||
|
|
(type >= RUN_TYPE_UF && conv_type == 2)) {
|
|
c = c - code + (case_conv_table1[data] >> (32 - 17));
|
|
}
|
|
break;
|
|
case RUN_TYPE_UL:
|
|
a = c - code;
|
|
if ((a & 1) != (1 - is_lower))
|
|
break;
|
|
c = (a ^ 1) + code;
|
|
break;
|
|
case RUN_TYPE_LSU:
|
|
a = c - code;
|
|
if (a == 1) {
|
|
c += 2 * is_lower - 1;
|
|
} else if (a == (1 - is_lower) * 2) {
|
|
c += (2 * is_lower - 1) * 2;
|
|
}
|
|
break;
|
|
case RUN_TYPE_U2L_399_EXT2:
|
|
if (!is_lower) {
|
|
res[0] = c - code + case_conv_ext[data >> 6];
|
|
res[1] = 0x399;
|
|
return 2;
|
|
} else {
|
|
c = c - code + case_conv_ext[data & 0x3f];
|
|
}
|
|
break;
|
|
case RUN_TYPE_UF_D20:
|
|
if (conv_type == 1)
|
|
break;
|
|
c = data + (conv_type == 2) * 0x20;
|
|
break;
|
|
case RUN_TYPE_UF_D1_EXT:
|
|
if (conv_type == 1)
|
|
break;
|
|
c = case_conv_ext[data] + (conv_type == 2);
|
|
break;
|
|
case RUN_TYPE_U_EXT:
|
|
case RUN_TYPE_LF_EXT:
|
|
if (is_lower != (type - RUN_TYPE_U_EXT))
|
|
break;
|
|
c = case_conv_ext[data];
|
|
break;
|
|
case RUN_TYPE_U_EXT2:
|
|
case RUN_TYPE_L_EXT2:
|
|
if (conv_type != (type - RUN_TYPE_U_EXT2))
|
|
break;
|
|
res[0] = c - code + case_conv_ext[data >> 6];
|
|
res[1] = case_conv_ext[data & 0x3f];
|
|
return 2;
|
|
default:
|
|
case RUN_TYPE_U_EXT3:
|
|
if (conv_type != 0)
|
|
break;
|
|
res[0] = case_conv_ext[data >> 8];
|
|
res[1] = case_conv_ext[(data >> 4) & 0xf];
|
|
res[2] = case_conv_ext[data & 0xf];
|
|
return 3;
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
res[0] = c;
|
|
return 1;
|
|
}
|
|
|
|
static uint32_t get_le24(const uint8_t *ptr)
|
|
{
|
|
#if defined(__x86__) || defined(__x86_64__)
|
|
return *(uint16_t *)ptr | (ptr[2] << 16);
|
|
#else
|
|
return ptr[0] | (ptr[1] << 8) | (ptr[2] << 16);
|
|
#endif
|
|
}
|
|
|
|
#define UNICODE_INDEX_BLOCK_LEN 32
|
|
|
|
/* return -1 if not in table, otherwise the offset in the block */
|
|
static int get_index_pos(uint32_t *pcode, uint32_t c,
|
|
const uint8_t *index_table, int index_table_len)
|
|
{
|
|
uint32_t code, v;
|
|
int idx_min, idx_max, idx;
|
|
|
|
idx_min = 0;
|
|
v = get_le24(index_table);
|
|
code = v & ((1 << 21) - 1);
|
|
if (c < code) {
|
|
*pcode = 0;
|
|
return 0;
|
|
}
|
|
idx_max = index_table_len - 1;
|
|
code = get_le24(index_table + idx_max * 3);
|
|
if (c >= code)
|
|
return -1;
|
|
/* invariant: tab[idx_min] <= c < tab2[idx_max] */
|
|
while ((idx_max - idx_min) > 1) {
|
|
idx = (idx_max + idx_min) / 2;
|
|
v = get_le24(index_table + idx * 3);
|
|
code = v & ((1 << 21) - 1);
|
|
if (c < code) {
|
|
idx_max = idx;
|
|
} else {
|
|
idx_min = idx;
|
|
}
|
|
}
|
|
v = get_le24(index_table + idx_min * 3);
|
|
*pcode = v & ((1 << 21) - 1);
|
|
return (idx_min + 1) * UNICODE_INDEX_BLOCK_LEN + (v >> 21);
|
|
}
|
|
|
|
static BOOL lre_is_in_table(uint32_t c, const uint8_t *table,
|
|
const uint8_t *index_table, int index_table_len)
|
|
{
|
|
uint32_t code, b, bit;
|
|
int pos;
|
|
const uint8_t *p;
|
|
|
|
pos = get_index_pos(&code, c, index_table, index_table_len);
|
|
if (pos < 0)
|
|
return FALSE; /* outside the table */
|
|
p = table + pos;
|
|
bit = 0;
|
|
for(;;) {
|
|
b = *p++;
|
|
if (b < 64) {
|
|
code += (b >> 3) + 1;
|
|
if (c < code)
|
|
return bit;
|
|
bit ^= 1;
|
|
code += (b & 7) + 1;
|
|
} else if (b >= 0x80) {
|
|
code += b - 0x80 + 1;
|
|
} else if (b < 0x60) {
|
|
code += (((b - 0x40) << 8) | p[0]) + 1;
|
|
p++;
|
|
} else {
|
|
code += (((b - 0x60) << 16) | (p[0] << 8) | p[1]) + 1;
|
|
p += 2;
|
|
}
|
|
if (c < code)
|
|
return bit;
|
|
bit ^= 1;
|
|
}
|
|
}
|
|
|
|
BOOL lre_is_cased(uint32_t c)
|
|
{
|
|
uint32_t v, code, len;
|
|
int idx, idx_min, idx_max;
|
|
|
|
idx_min = 0;
|
|
idx_max = countof(case_conv_table1) - 1;
|
|
while (idx_min <= idx_max) {
|
|
idx = (unsigned)(idx_max + idx_min) / 2;
|
|
v = case_conv_table1[idx];
|
|
code = v >> (32 - 17);
|
|
len = (v >> (32 - 17 - 7)) & 0x7f;
|
|
if (c < code) {
|
|
idx_max = idx - 1;
|
|
} else if (c >= code + len) {
|
|
idx_min = idx + 1;
|
|
} else {
|
|
return TRUE;
|
|
}
|
|
}
|
|
return lre_is_in_table(c, unicode_prop_Cased1_table,
|
|
unicode_prop_Cased1_index,
|
|
sizeof(unicode_prop_Cased1_index) / 3);
|
|
}
|
|
|
|
BOOL lre_is_case_ignorable(uint32_t c)
|
|
{
|
|
return lre_is_in_table(c, unicode_prop_Case_Ignorable_table,
|
|
unicode_prop_Case_Ignorable_index,
|
|
sizeof(unicode_prop_Case_Ignorable_index) / 3);
|
|
}
|
|
|
|
/* character range */
|
|
|
|
static __maybe_unused void cr_dump(CharRange *cr)
|
|
{
|
|
int i;
|
|
for(i = 0; i < cr->len; i++)
|
|
printf("%d: 0x%04x\n", i, cr->points[i]);
|
|
}
|
|
|
|
static void *cr_default_realloc(void *opaque, void *ptr, size_t size)
|
|
{
|
|
return realloc(ptr, size);
|
|
}
|
|
|
|
void cr_init(CharRange *cr, void *mem_opaque, DynBufReallocFunc *realloc_func)
|
|
{
|
|
cr->len = cr->size = 0;
|
|
cr->points = NULL;
|
|
cr->mem_opaque = mem_opaque;
|
|
cr->realloc_func = realloc_func ? realloc_func : cr_default_realloc;
|
|
}
|
|
|
|
void cr_free(CharRange *cr)
|
|
{
|
|
cr->realloc_func(cr->mem_opaque, cr->points, 0);
|
|
}
|
|
|
|
int cr_realloc(CharRange *cr, int size)
|
|
{
|
|
int new_size;
|
|
uint32_t *new_buf;
|
|
|
|
if (size > cr->size) {
|
|
new_size = max_int(size, cr->size * 3 / 2);
|
|
new_buf = cr->realloc_func(cr->mem_opaque, cr->points,
|
|
new_size * sizeof(cr->points[0]));
|
|
if (!new_buf)
|
|
return -1;
|
|
cr->points = new_buf;
|
|
cr->size = new_size;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
int cr_copy(CharRange *cr, const CharRange *cr1)
|
|
{
|
|
if (cr_realloc(cr, cr1->len))
|
|
return -1;
|
|
memcpy(cr->points, cr1->points, sizeof(cr->points[0]) * cr1->len);
|
|
cr->len = cr1->len;
|
|
return 0;
|
|
}
|
|
|
|
/* merge consecutive intervals and remove empty intervals */
|
|
static void cr_compress(CharRange *cr)
|
|
{
|
|
int i, j, k, len;
|
|
uint32_t *pt;
|
|
|
|
pt = cr->points;
|
|
len = cr->len;
|
|
i = 0;
|
|
j = 0;
|
|
k = 0;
|
|
while ((i + 1) < len) {
|
|
if (pt[i] == pt[i + 1]) {
|
|
/* empty interval */
|
|
i += 2;
|
|
} else {
|
|
j = i;
|
|
while ((j + 3) < len && pt[j + 1] == pt[j + 2])
|
|
j += 2;
|
|
/* just copy */
|
|
pt[k] = pt[i];
|
|
pt[k + 1] = pt[j + 1];
|
|
k += 2;
|
|
i = j + 2;
|
|
}
|
|
}
|
|
cr->len = k;
|
|
}
|
|
|
|
/* union or intersection */
|
|
int cr_op(CharRange *cr, const uint32_t *a_pt, int a_len,
|
|
const uint32_t *b_pt, int b_len, int op)
|
|
{
|
|
int a_idx, b_idx, is_in;
|
|
uint32_t v;
|
|
|
|
a_idx = 0;
|
|
b_idx = 0;
|
|
for(;;) {
|
|
/* get one more point from a or b in increasing order */
|
|
if (a_idx < a_len && b_idx < b_len) {
|
|
if (a_pt[a_idx] < b_pt[b_idx]) {
|
|
goto a_add;
|
|
} else if (a_pt[a_idx] == b_pt[b_idx]) {
|
|
v = a_pt[a_idx];
|
|
a_idx++;
|
|
b_idx++;
|
|
} else {
|
|
goto b_add;
|
|
}
|
|
} else if (a_idx < a_len) {
|
|
a_add:
|
|
v = a_pt[a_idx++];
|
|
} else if (b_idx < b_len) {
|
|
b_add:
|
|
v = b_pt[b_idx++];
|
|
} else {
|
|
break;
|
|
}
|
|
/* add the point if the in/out status changes */
|
|
switch(op) {
|
|
case CR_OP_UNION:
|
|
is_in = (a_idx & 1) | (b_idx & 1);
|
|
break;
|
|
case CR_OP_INTER:
|
|
is_in = (a_idx & 1) & (b_idx & 1);
|
|
break;
|
|
case CR_OP_XOR:
|
|
is_in = (a_idx & 1) ^ (b_idx & 1);
|
|
break;
|
|
default:
|
|
abort();
|
|
}
|
|
if (is_in != (cr->len & 1)) {
|
|
if (cr_add_point(cr, v))
|
|
return -1;
|
|
}
|
|
}
|
|
cr_compress(cr);
|
|
return 0;
|
|
}
|
|
|
|
int cr_union1(CharRange *cr, const uint32_t *b_pt, int b_len)
|
|
{
|
|
CharRange a = *cr;
|
|
int ret;
|
|
cr->len = 0;
|
|
cr->size = 0;
|
|
cr->points = NULL;
|
|
ret = cr_op(cr, a.points, a.len, b_pt, b_len, CR_OP_UNION);
|
|
cr_free(&a);
|
|
return ret;
|
|
}
|
|
|
|
int cr_invert(CharRange *cr)
|
|
{
|
|
int len;
|
|
len = cr->len;
|
|
if (cr_realloc(cr, len + 2))
|
|
return -1;
|
|
memmove(cr->points + 1, cr->points, len * sizeof(cr->points[0]));
|
|
cr->points[0] = 0;
|
|
cr->points[len + 1] = UINT32_MAX;
|
|
cr->len = len + 2;
|
|
cr_compress(cr);
|
|
return 0;
|
|
}
|
|
|
|
#ifdef CONFIG_ALL_UNICODE
|
|
|
|
BOOL lre_is_id_start(uint32_t c)
|
|
{
|
|
return lre_is_in_table(c, unicode_prop_ID_Start_table,
|
|
unicode_prop_ID_Start_index,
|
|
sizeof(unicode_prop_ID_Start_index) / 3);
|
|
}
|
|
|
|
BOOL lre_is_id_continue(uint32_t c)
|
|
{
|
|
return lre_is_id_start(c) ||
|
|
lre_is_in_table(c, unicode_prop_ID_Continue1_table,
|
|
unicode_prop_ID_Continue1_index,
|
|
sizeof(unicode_prop_ID_Continue1_index) / 3);
|
|
}
|
|
|
|
#define UNICODE_DECOMP_LEN_MAX 18
|
|
|
|
typedef enum {
|
|
DECOMP_TYPE_C1, /* 16 bit char */
|
|
DECOMP_TYPE_L1, /* 16 bit char table */
|
|
DECOMP_TYPE_L2,
|
|
DECOMP_TYPE_L3,
|
|
DECOMP_TYPE_L4,
|
|
DECOMP_TYPE_L5, /* XXX: not used */
|
|
DECOMP_TYPE_L6, /* XXX: could remove */
|
|
DECOMP_TYPE_L7, /* XXX: could remove */
|
|
DECOMP_TYPE_LL1, /* 18 bit char table */
|
|
DECOMP_TYPE_LL2,
|
|
DECOMP_TYPE_S1, /* 8 bit char table */
|
|
DECOMP_TYPE_S2,
|
|
DECOMP_TYPE_S3,
|
|
DECOMP_TYPE_S4,
|
|
DECOMP_TYPE_S5,
|
|
DECOMP_TYPE_I1, /* increment 16 bit char value */
|
|
DECOMP_TYPE_I2_0,
|
|
DECOMP_TYPE_I2_1,
|
|
DECOMP_TYPE_I3_1,
|
|
DECOMP_TYPE_I3_2,
|
|
DECOMP_TYPE_I4_1,
|
|
DECOMP_TYPE_I4_2,
|
|
DECOMP_TYPE_B1, /* 16 bit base + 8 bit offset */
|
|
DECOMP_TYPE_B2,
|
|
DECOMP_TYPE_B3,
|
|
DECOMP_TYPE_B4,
|
|
DECOMP_TYPE_B5,
|
|
DECOMP_TYPE_B6,
|
|
DECOMP_TYPE_B7,
|
|
DECOMP_TYPE_B8,
|
|
DECOMP_TYPE_B18,
|
|
DECOMP_TYPE_LS2,
|
|
DECOMP_TYPE_PAT3,
|
|
DECOMP_TYPE_S2_UL,
|
|
DECOMP_TYPE_LS2_UL,
|
|
} DecompTypeEnum;
|
|
|
|
static uint32_t unicode_get_short_code(uint32_t c)
|
|
{
|
|
static const uint16_t unicode_short_table[2] = { 0x2044, 0x2215 };
|
|
|
|
if (c < 0x80)
|
|
return c;
|
|
else if (c < 0x80 + 0x50)
|
|
return c - 0x80 + 0x300;
|
|
else
|
|
return unicode_short_table[c - 0x80 - 0x50];
|
|
}
|
|
|
|
static uint32_t unicode_get_lower_simple(uint32_t c)
|
|
{
|
|
if (c < 0x100 || (c >= 0x410 && c <= 0x42f))
|
|
c += 0x20;
|
|
else
|
|
c++;
|
|
return c;
|
|
}
|
|
|
|
static uint16_t unicode_get16(const uint8_t *p)
|
|
{
|
|
return p[0] | (p[1] << 8);
|
|
}
|
|
|
|
static int unicode_decomp_entry(uint32_t *res, uint32_t c,
|
|
int idx, uint32_t code, uint32_t len,
|
|
uint32_t type)
|
|
{
|
|
uint32_t c1;
|
|
int l, i, p;
|
|
const uint8_t *d;
|
|
|
|
if (type == DECOMP_TYPE_C1) {
|
|
res[0] = unicode_decomp_table2[idx];
|
|
return 1;
|
|
} else {
|
|
d = unicode_decomp_data + unicode_decomp_table2[idx];
|
|
switch(type) {
|
|
case DECOMP_TYPE_L1:
|
|
case DECOMP_TYPE_L2:
|
|
case DECOMP_TYPE_L3:
|
|
case DECOMP_TYPE_L4:
|
|
case DECOMP_TYPE_L5:
|
|
case DECOMP_TYPE_L6:
|
|
case DECOMP_TYPE_L7:
|
|
l = type - DECOMP_TYPE_L1 + 1;
|
|
d += (c - code) * l * 2;
|
|
for(i = 0; i < l; i++) {
|
|
if ((res[i] = unicode_get16(d + 2 * i)) == 0)
|
|
return 0;
|
|
}
|
|
return l;
|
|
case DECOMP_TYPE_LL1:
|
|
case DECOMP_TYPE_LL2:
|
|
{
|
|
uint32_t k, p;
|
|
l = type - DECOMP_TYPE_LL1 + 1;
|
|
k = (c - code) * l;
|
|
p = len * l * 2;
|
|
for(i = 0; i < l; i++) {
|
|
c1 = unicode_get16(d + 2 * k) |
|
|
(((d[p + (k / 4)] >> ((k % 4) * 2)) & 3) << 16);
|
|
if (!c1)
|
|
return 0;
|
|
res[i] = c1;
|
|
k++;
|
|
}
|
|
}
|
|
return l;
|
|
case DECOMP_TYPE_S1:
|
|
case DECOMP_TYPE_S2:
|
|
case DECOMP_TYPE_S3:
|
|
case DECOMP_TYPE_S4:
|
|
case DECOMP_TYPE_S5:
|
|
l = type - DECOMP_TYPE_S1 + 1;
|
|
d += (c - code) * l;
|
|
for(i = 0; i < l; i++) {
|
|
if ((res[i] = unicode_get_short_code(d[i])) == 0)
|
|
return 0;
|
|
}
|
|
return l;
|
|
case DECOMP_TYPE_I1:
|
|
l = 1;
|
|
p = 0;
|
|
goto decomp_type_i;
|
|
case DECOMP_TYPE_I2_0:
|
|
case DECOMP_TYPE_I2_1:
|
|
case DECOMP_TYPE_I3_1:
|
|
case DECOMP_TYPE_I3_2:
|
|
case DECOMP_TYPE_I4_1:
|
|
case DECOMP_TYPE_I4_2:
|
|
l = 2 + ((type - DECOMP_TYPE_I2_0) >> 1);
|
|
p = ((type - DECOMP_TYPE_I2_0) & 1) + (l > 2);
|
|
decomp_type_i:
|
|
for(i = 0; i < l; i++) {
|
|
c1 = unicode_get16(d + 2 * i);
|
|
if (i == p)
|
|
c1 += c - code;
|
|
res[i] = c1;
|
|
}
|
|
return l;
|
|
case DECOMP_TYPE_B18:
|
|
l = 18;
|
|
goto decomp_type_b;
|
|
case DECOMP_TYPE_B1:
|
|
case DECOMP_TYPE_B2:
|
|
case DECOMP_TYPE_B3:
|
|
case DECOMP_TYPE_B4:
|
|
case DECOMP_TYPE_B5:
|
|
case DECOMP_TYPE_B6:
|
|
case DECOMP_TYPE_B7:
|
|
case DECOMP_TYPE_B8:
|
|
l = type - DECOMP_TYPE_B1 + 1;
|
|
decomp_type_b:
|
|
{
|
|
uint32_t c_min;
|
|
c_min = unicode_get16(d);
|
|
d += 2 + (c - code) * l;
|
|
for(i = 0; i < l; i++) {
|
|
c1 = d[i];
|
|
if (c1 == 0xff)
|
|
c1 = 0x20;
|
|
else
|
|
c1 += c_min;
|
|
res[i] = c1;
|
|
}
|
|
}
|
|
return l;
|
|
case DECOMP_TYPE_LS2:
|
|
d += (c - code) * 3;
|
|
if (!(res[0] = unicode_get16(d)))
|
|
return 0;
|
|
res[1] = unicode_get_short_code(d[2]);
|
|
return 2;
|
|
case DECOMP_TYPE_PAT3:
|
|
res[0] = unicode_get16(d);
|
|
res[2] = unicode_get16(d + 2);
|
|
d += 4 + (c - code) * 2;
|
|
res[1] = unicode_get16(d);
|
|
return 3;
|
|
case DECOMP_TYPE_S2_UL:
|
|
case DECOMP_TYPE_LS2_UL:
|
|
c1 = c - code;
|
|
if (type == DECOMP_TYPE_S2_UL) {
|
|
d += c1 & ~1;
|
|
c = unicode_get_short_code(*d);
|
|
d++;
|
|
} else {
|
|
d += (c1 >> 1) * 3;
|
|
c = unicode_get16(d);
|
|
d += 2;
|
|
}
|
|
if (c1 & 1)
|
|
c = unicode_get_lower_simple(c);
|
|
res[0] = c;
|
|
res[1] = unicode_get_short_code(*d);
|
|
return 2;
|
|
}
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
|
|
/* return the length of the decomposition (length <=
|
|
UNICODE_DECOMP_LEN_MAX) or 0 if no decomposition */
|
|
static int unicode_decomp_char(uint32_t *res, uint32_t c, BOOL is_compat1)
|
|
{
|
|
uint32_t v, type, is_compat, code, len;
|
|
int idx_min, idx_max, idx;
|
|
|
|
idx_min = 0;
|
|
idx_max = countof(unicode_decomp_table1) - 1;
|
|
while (idx_min <= idx_max) {
|
|
idx = (idx_max + idx_min) / 2;
|
|
v = unicode_decomp_table1[idx];
|
|
code = v >> (32 - 18);
|
|
len = (v >> (32 - 18 - 7)) & 0x7f;
|
|
// printf("idx=%d code=%05x len=%d\n", idx, code, len);
|
|
if (c < code) {
|
|
idx_max = idx - 1;
|
|
} else if (c >= code + len) {
|
|
idx_min = idx + 1;
|
|
} else {
|
|
is_compat = v & 1;
|
|
if (is_compat1 < is_compat)
|
|
break;
|
|
type = (v >> (32 - 18 - 7 - 6)) & 0x3f;
|
|
return unicode_decomp_entry(res, c, idx, code, len, type);
|
|
}
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
/* return 0 if no pair found */
|
|
static int unicode_compose_pair(uint32_t c0, uint32_t c1)
|
|
{
|
|
uint32_t code, len, type, v, idx1, d_idx, d_offset, ch;
|
|
int idx_min, idx_max, idx, d;
|
|
uint32_t pair[2];
|
|
|
|
idx_min = 0;
|
|
idx_max = countof(unicode_comp_table) - 1;
|
|
while (idx_min <= idx_max) {
|
|
idx = (idx_max + idx_min) / 2;
|
|
idx1 = unicode_comp_table[idx];
|
|
|
|
/* idx1 represent an entry of the decomposition table */
|
|
d_idx = idx1 >> 6;
|
|
d_offset = idx1 & 0x3f;
|
|
v = unicode_decomp_table1[d_idx];
|
|
code = v >> (32 - 18);
|
|
len = (v >> (32 - 18 - 7)) & 0x7f;
|
|
type = (v >> (32 - 18 - 7 - 6)) & 0x3f;
|
|
ch = code + d_offset;
|
|
unicode_decomp_entry(pair, ch, d_idx, code, len, type);
|
|
d = c0 - pair[0];
|
|
if (d == 0)
|
|
d = c1 - pair[1];
|
|
if (d < 0) {
|
|
idx_max = idx - 1;
|
|
} else if (d > 0) {
|
|
idx_min = idx + 1;
|
|
} else {
|
|
return ch;
|
|
}
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
/* return the combining class of character c (between 0 and 255) */
|
|
static int unicode_get_cc(uint32_t c)
|
|
{
|
|
uint32_t code, n, type, cc, c1, b;
|
|
int pos;
|
|
const uint8_t *p;
|
|
|
|
pos = get_index_pos(&code, c,
|
|
unicode_cc_index, sizeof(unicode_cc_index) / 3);
|
|
if (pos < 0)
|
|
return 0;
|
|
p = unicode_cc_table + pos;
|
|
for(;;) {
|
|
b = *p++;
|
|
type = b >> 6;
|
|
n = b & 0x3f;
|
|
if (n < 48) {
|
|
} else if (n < 56) {
|
|
n = (n - 48) << 8;
|
|
n |= *p++;
|
|
n += 48;
|
|
} else {
|
|
n = (n - 56) << 8;
|
|
n |= *p++ << 8;
|
|
n |= *p++;
|
|
n += 48 + (1 << 11);
|
|
}
|
|
if (type <= 1)
|
|
p++;
|
|
c1 = code + n + 1;
|
|
if (c < c1) {
|
|
switch(type) {
|
|
case 0:
|
|
cc = p[-1];
|
|
break;
|
|
case 1:
|
|
cc = p[-1] + c - code;
|
|
break;
|
|
case 2:
|
|
cc = 0;
|
|
break;
|
|
default:
|
|
case 3:
|
|
cc = 230;
|
|
break;
|
|
}
|
|
return cc;
|
|
}
|
|
code = c1;
|
|
}
|
|
}
|
|
|
|
static void sort_cc(int *buf, int len)
|
|
{
|
|
int i, j, k, cc, cc1, start, ch1;
|
|
|
|
for(i = 0; i < len; i++) {
|
|
cc = unicode_get_cc(buf[i]);
|
|
if (cc != 0) {
|
|
start = i;
|
|
j = i + 1;
|
|
while (j < len) {
|
|
ch1 = buf[j];
|
|
cc1 = unicode_get_cc(ch1);
|
|
if (cc1 == 0)
|
|
break;
|
|
k = j - 1;
|
|
while (k >= start) {
|
|
if (unicode_get_cc(buf[k]) <= cc1)
|
|
break;
|
|
buf[k + 1] = buf[k];
|
|
k--;
|
|
}
|
|
buf[k + 1] = ch1;
|
|
j++;
|
|
}
|
|
#if 0
|
|
printf("cc:");
|
|
for(k = start; k < j; k++) {
|
|
printf(" %3d", unicode_get_cc(buf[k]));
|
|
}
|
|
printf("\n");
|
|
#endif
|
|
i = j;
|
|
}
|
|
}
|
|
}
|
|
|
|
static void to_nfd_rec(DynBuf *dbuf,
|
|
const int *src, int src_len, int is_compat)
|
|
{
|
|
uint32_t c, v;
|
|
int i, l;
|
|
uint32_t res[UNICODE_DECOMP_LEN_MAX];
|
|
|
|
for(i = 0; i < src_len; i++) {
|
|
c = src[i];
|
|
if (c >= 0xac00 && c < 0xd7a4) {
|
|
/* Hangul decomposition */
|
|
c -= 0xac00;
|
|
dbuf_put_u32(dbuf, 0x1100 + c / 588);
|
|
dbuf_put_u32(dbuf, 0x1161 + (c % 588) / 28);
|
|
v = c % 28;
|
|
if (v != 0)
|
|
dbuf_put_u32(dbuf, 0x11a7 + v);
|
|
} else {
|
|
l = unicode_decomp_char(res, c, is_compat);
|
|
if (l) {
|
|
to_nfd_rec(dbuf, (int *)res, l, is_compat);
|
|
} else {
|
|
dbuf_put_u32(dbuf, c);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/* return 0 if not found */
|
|
static int compose_pair(uint32_t c0, uint32_t c1)
|
|
{
|
|
/* Hangul composition */
|
|
if (c0 >= 0x1100 && c0 < 0x1100 + 19 &&
|
|
c1 >= 0x1161 && c1 < 0x1161 + 21) {
|
|
return 0xac00 + (c0 - 0x1100) * 588 + (c1 - 0x1161) * 28;
|
|
} else if (c0 >= 0xac00 && c0 < 0xac00 + 11172 &&
|
|
(c0 - 0xac00) % 28 == 0 &&
|
|
c1 >= 0x11a7 && c1 < 0x11a7 + 28) {
|
|
return c0 + c1 - 0x11a7;
|
|
} else {
|
|
return unicode_compose_pair(c0, c1);
|
|
}
|
|
}
|
|
|
|
int unicode_normalize(uint32_t **pdst, const uint32_t *src, int src_len,
|
|
UnicodeNormalizationEnum n_type,
|
|
void *opaque, DynBufReallocFunc *realloc_func)
|
|
{
|
|
int *buf, buf_len, i, p, starter_pos, cc, last_cc, out_len;
|
|
BOOL is_compat;
|
|
DynBuf dbuf_s, *dbuf = &dbuf_s;
|
|
|
|
is_compat = n_type >> 1;
|
|
|
|
dbuf_init2(dbuf, opaque, realloc_func);
|
|
if (dbuf_realloc(dbuf, sizeof(int) * src_len))
|
|
goto fail;
|
|
|
|
/* common case: latin1 is unaffected by NFC */
|
|
if (n_type == UNICODE_NFC) {
|
|
for(i = 0; i < src_len; i++) {
|
|
if (src[i] >= 0x100)
|
|
goto not_latin1;
|
|
}
|
|
buf = (int *)dbuf->buf;
|
|
memcpy(buf, src, src_len * sizeof(int));
|
|
*pdst = (uint32_t *)buf;
|
|
return src_len;
|
|
not_latin1: ;
|
|
}
|
|
|
|
to_nfd_rec(dbuf, (const int *)src, src_len, is_compat);
|
|
if (dbuf_error(dbuf)) {
|
|
fail:
|
|
*pdst = NULL;
|
|
return -1;
|
|
}
|
|
buf = (int *)dbuf->buf;
|
|
buf_len = dbuf->size / sizeof(int);
|
|
|
|
sort_cc(buf, buf_len);
|
|
|
|
if (buf_len <= 1 || (n_type & 1) != 0) {
|
|
/* NFD / NFKD */
|
|
*pdst = (uint32_t *)buf;
|
|
return buf_len;
|
|
}
|
|
|
|
i = 1;
|
|
out_len = 1;
|
|
while (i < buf_len) {
|
|
/* find the starter character and test if it is blocked from
|
|
the character at 'i' */
|
|
last_cc = unicode_get_cc(buf[i]);
|
|
starter_pos = out_len - 1;
|
|
while (starter_pos >= 0) {
|
|
cc = unicode_get_cc(buf[starter_pos]);
|
|
if (cc == 0)
|
|
break;
|
|
if (cc >= last_cc)
|
|
goto next;
|
|
last_cc = 256;
|
|
starter_pos--;
|
|
}
|
|
if (starter_pos >= 0 &&
|
|
(p = compose_pair(buf[starter_pos], buf[i])) != 0) {
|
|
buf[starter_pos] = p;
|
|
i++;
|
|
} else {
|
|
next:
|
|
buf[out_len++] = buf[i++];
|
|
}
|
|
}
|
|
*pdst = (uint32_t *)buf;
|
|
return out_len;
|
|
}
|
|
|
|
/* char ranges for various unicode properties */
|
|
|
|
static int unicode_find_name(const char *name_table, const char *name)
|
|
{
|
|
const char *p, *r;
|
|
int pos;
|
|
size_t name_len, len;
|
|
|
|
p = name_table;
|
|
pos = 0;
|
|
name_len = strlen(name);
|
|
while (*p) {
|
|
for(;;) {
|
|
r = strchr(p, ',');
|
|
if (!r)
|
|
len = strlen(p);
|
|
else
|
|
len = r - p;
|
|
if (len == name_len && !memcmp(p, name, name_len))
|
|
return pos;
|
|
p += len + 1;
|
|
if (!r)
|
|
break;
|
|
}
|
|
pos++;
|
|
}
|
|
return -1;
|
|
}
|
|
|
|
/* 'cr' must be initialized and empty. Return 0 if OK, -1 if error, -2
|
|
if not found */
|
|
int unicode_script(CharRange *cr,
|
|
const char *script_name, BOOL is_ext)
|
|
{
|
|
int script_idx;
|
|
const uint8_t *p, *p_end;
|
|
uint32_t c, c1, b, n, v, v_len, i, type;
|
|
CharRange cr1_s, *cr1;
|
|
CharRange cr2_s, *cr2 = &cr2_s;
|
|
BOOL is_common;
|
|
|
|
script_idx = unicode_find_name(unicode_script_name_table, script_name);
|
|
if (script_idx < 0)
|
|
return -2;
|
|
/* Note: we remove the "Unknown" Script */
|
|
script_idx += UNICODE_SCRIPT_Unknown + 1;
|
|
|
|
is_common = (script_idx == UNICODE_SCRIPT_Common ||
|
|
script_idx == UNICODE_SCRIPT_Inherited);
|
|
if (is_ext) {
|
|
cr1 = &cr1_s;
|
|
cr_init(cr1, cr->mem_opaque, cr->realloc_func);
|
|
cr_init(cr2, cr->mem_opaque, cr->realloc_func);
|
|
} else {
|
|
cr1 = cr;
|
|
}
|
|
|
|
p = unicode_script_table;
|
|
p_end = unicode_script_table + countof(unicode_script_table);
|
|
c = 0;
|
|
while (p < p_end) {
|
|
b = *p++;
|
|
type = b >> 7;
|
|
n = b & 0x7f;
|
|
if (n < 96) {
|
|
} else if (n < 112) {
|
|
n = (n - 96) << 8;
|
|
n |= *p++;
|
|
n += 96;
|
|
} else {
|
|
n = (n - 112) << 16;
|
|
n |= *p++ << 8;
|
|
n |= *p++;
|
|
n += 96 + (1 << 12);
|
|
}
|
|
if (type == 0)
|
|
v = 0;
|
|
else
|
|
v = *p++;
|
|
c1 = c + n + 1;
|
|
if (v == script_idx) {
|
|
if (cr_add_interval(cr1, c, c1))
|
|
goto fail;
|
|
}
|
|
c = c1;
|
|
}
|
|
|
|
if (is_ext) {
|
|
/* add the script extensions */
|
|
p = unicode_script_ext_table;
|
|
p_end = unicode_script_ext_table + countof(unicode_script_ext_table);
|
|
c = 0;
|
|
while (p < p_end) {
|
|
b = *p++;
|
|
if (b < 128) {
|
|
n = b;
|
|
} else if (b < 128 + 64) {
|
|
n = (b - 128) << 8;
|
|
n |= *p++;
|
|
n += 128;
|
|
} else {
|
|
n = (b - 128 - 64) << 16;
|
|
n |= *p++ << 8;
|
|
n |= *p++;
|
|
n += 128 + (1 << 14);
|
|
}
|
|
c1 = c + n + 1;
|
|
v_len = *p++;
|
|
if (is_common) {
|
|
if (v_len != 0) {
|
|
if (cr_add_interval(cr2, c, c1))
|
|
goto fail;
|
|
}
|
|
} else {
|
|
for(i = 0; i < v_len; i++) {
|
|
if (p[i] == script_idx) {
|
|
if (cr_add_interval(cr2, c, c1))
|
|
goto fail;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
p += v_len;
|
|
c = c1;
|
|
}
|
|
if (is_common) {
|
|
/* remove all the characters with script extensions */
|
|
if (cr_invert(cr2))
|
|
goto fail;
|
|
if (cr_op(cr, cr1->points, cr1->len, cr2->points, cr2->len,
|
|
CR_OP_INTER))
|
|
goto fail;
|
|
} else {
|
|
if (cr_op(cr, cr1->points, cr1->len, cr2->points, cr2->len,
|
|
CR_OP_UNION))
|
|
goto fail;
|
|
}
|
|
cr_free(cr1);
|
|
cr_free(cr2);
|
|
}
|
|
return 0;
|
|
fail:
|
|
if (is_ext) {
|
|
cr_free(cr1);
|
|
cr_free(cr2);
|
|
}
|
|
goto fail;
|
|
}
|
|
|
|
#define M(id) (1U << UNICODE_GC_ ## id)
|
|
|
|
static int unicode_general_category1(CharRange *cr, uint32_t gc_mask)
|
|
{
|
|
const uint8_t *p, *p_end;
|
|
uint32_t c, c0, b, n, v;
|
|
|
|
p = unicode_gc_table;
|
|
p_end = unicode_gc_table + countof(unicode_gc_table);
|
|
c = 0;
|
|
while (p < p_end) {
|
|
b = *p++;
|
|
n = b >> 5;
|
|
v = b & 0x1f;
|
|
if (n == 7) {
|
|
n = *p++;
|
|
if (n < 128) {
|
|
n += 7;
|
|
} else if (n < 128 + 64) {
|
|
n = (n - 128) << 8;
|
|
n |= *p++;
|
|
n += 7 + 128;
|
|
} else {
|
|
n = (n - 128 - 64) << 16;
|
|
n |= *p++ << 8;
|
|
n |= *p++;
|
|
n += 7 + 128 + (1 << 14);
|
|
}
|
|
}
|
|
c0 = c;
|
|
c += n + 1;
|
|
if (v == 31) {
|
|
/* run of Lu / Ll */
|
|
b = gc_mask & (M(Lu) | M(Ll));
|
|
if (b != 0) {
|
|
if (b == (M(Lu) | M(Ll))) {
|
|
goto add_range;
|
|
} else {
|
|
c0 += ((gc_mask & M(Ll)) != 0);
|
|
for(; c0 < c; c0 += 2) {
|
|
if (cr_add_interval(cr, c0, c0 + 1))
|
|
return -1;
|
|
}
|
|
}
|
|
}
|
|
} else if ((gc_mask >> v) & 1) {
|
|
add_range:
|
|
if (cr_add_interval(cr, c0, c))
|
|
return -1;
|
|
}
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
static int unicode_prop1(CharRange *cr, int prop_idx)
|
|
{
|
|
const uint8_t *p, *p_end;
|
|
uint32_t c, c0, b, bit;
|
|
|
|
p = unicode_prop_table[prop_idx];
|
|
p_end = p + unicode_prop_len_table[prop_idx];
|
|
c = 0;
|
|
bit = 0;
|
|
while (p < p_end) {
|
|
c0 = c;
|
|
b = *p++;
|
|
if (b < 64) {
|
|
c += (b >> 3) + 1;
|
|
if (bit) {
|
|
if (cr_add_interval(cr, c0, c))
|
|
return -1;
|
|
}
|
|
bit ^= 1;
|
|
c0 = c;
|
|
c += (b & 7) + 1;
|
|
} else if (b >= 0x80) {
|
|
c += b - 0x80 + 1;
|
|
} else if (b < 0x60) {
|
|
c += (((b - 0x40) << 8) | p[0]) + 1;
|
|
p++;
|
|
} else {
|
|
c += (((b - 0x60) << 16) | (p[0] << 8) | p[1]) + 1;
|
|
p += 2;
|
|
}
|
|
if (bit) {
|
|
if (cr_add_interval(cr, c0, c))
|
|
return -1;
|
|
}
|
|
bit ^= 1;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
#define CASE_U (1 << 0)
|
|
#define CASE_L (1 << 1)
|
|
#define CASE_F (1 << 2)
|
|
|
|
/* use the case conversion table to generate range of characters.
|
|
CASE_U: set char if modified by uppercasing,
|
|
CASE_L: set char if modified by lowercasing,
|
|
CASE_F: set char if modified by case folding,
|
|
*/
|
|
static int unicode_case1(CharRange *cr, int case_mask)
|
|
{
|
|
#define MR(x) (1 << RUN_TYPE_ ## x)
|
|
const uint32_t tab_run_mask[3] = {
|
|
MR(U) | MR(UF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(UF_D20) |
|
|
MR(UF_D1_EXT) | MR(U_EXT) | MR(U_EXT2) | MR(U_EXT3),
|
|
|
|
MR(L) | MR(LF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(LF_EXT) | MR(L_EXT2),
|
|
|
|
MR(UF) | MR(LF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(LF_EXT) | MR(UF_D20) | MR(UF_D1_EXT) | MR(LF_EXT),
|
|
};
|
|
#undef MR
|
|
uint32_t mask, v, code, type, len, i, idx;
|
|
|
|
if (case_mask == 0)
|
|
return 0;
|
|
mask = 0;
|
|
for(i = 0; i < 3; i++) {
|
|
if ((case_mask >> i) & 1)
|
|
mask |= tab_run_mask[i];
|
|
}
|
|
for(idx = 0; idx < countof(case_conv_table1); idx++) {
|
|
v = case_conv_table1[idx];
|
|
type = (v >> (32 - 17 - 7 - 4)) & 0xf;
|
|
code = v >> (32 - 17);
|
|
len = (v >> (32 - 17 - 7)) & 0x7f;
|
|
if ((mask >> type) & 1) {
|
|
// printf("%d: type=%d %04x %04x\n", idx, type, code, code + len - 1);
|
|
switch(type) {
|
|
case RUN_TYPE_UL:
|
|
if ((case_mask & CASE_U) && (case_mask & (CASE_L | CASE_F)))
|
|
goto def_case;
|
|
code += ((case_mask & CASE_U) != 0);
|
|
for(i = 0; i < len; i += 2) {
|
|
if (cr_add_interval(cr, code + i, code + i + 1))
|
|
return -1;
|
|
}
|
|
break;
|
|
case RUN_TYPE_LSU:
|
|
if ((case_mask & CASE_U) && (case_mask & (CASE_L | CASE_F)))
|
|
goto def_case;
|
|
if (!(case_mask & CASE_U)) {
|
|
if (cr_add_interval(cr, code, code + 1))
|
|
return -1;
|
|
}
|
|
if (cr_add_interval(cr, code + 1, code + 2))
|
|
return -1;
|
|
if (case_mask & CASE_U) {
|
|
if (cr_add_interval(cr, code + 2, code + 3))
|
|
return -1;
|
|
}
|
|
break;
|
|
default:
|
|
def_case:
|
|
if (cr_add_interval(cr, code, code + len))
|
|
return -1;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
typedef enum {
|
|
POP_GC,
|
|
POP_PROP,
|
|
POP_CASE,
|
|
POP_UNION,
|
|
POP_INTER,
|
|
POP_XOR,
|
|
POP_INVERT,
|
|
POP_END,
|
|
} PropOPEnum;
|
|
|
|
#define POP_STACK_LEN_MAX 4
|
|
|
|
static int unicode_prop_ops(CharRange *cr, ...)
|
|
{
|
|
va_list ap;
|
|
CharRange stack[POP_STACK_LEN_MAX];
|
|
int stack_len, op, ret, i;
|
|
uint32_t a;
|
|
|
|
va_start(ap, cr);
|
|
stack_len = 0;
|
|
for(;;) {
|
|
op = va_arg(ap, int);
|
|
switch(op) {
|
|
case POP_GC:
|
|
assert(stack_len < POP_STACK_LEN_MAX);
|
|
a = va_arg(ap, int);
|
|
cr_init(&stack[stack_len++], cr->mem_opaque, cr->realloc_func);
|
|
if (unicode_general_category1(&stack[stack_len - 1], a))
|
|
goto fail;
|
|
break;
|
|
case POP_PROP:
|
|
assert(stack_len < POP_STACK_LEN_MAX);
|
|
a = va_arg(ap, int);
|
|
cr_init(&stack[stack_len++], cr->mem_opaque, cr->realloc_func);
|
|
if (unicode_prop1(&stack[stack_len - 1], a))
|
|
goto fail;
|
|
break;
|
|
case POP_CASE:
|
|
assert(stack_len < POP_STACK_LEN_MAX);
|
|
a = va_arg(ap, int);
|
|
cr_init(&stack[stack_len++], cr->mem_opaque, cr->realloc_func);
|
|
if (unicode_case1(&stack[stack_len - 1], a))
|
|
goto fail;
|
|
break;
|
|
case POP_UNION:
|
|
case POP_INTER:
|
|
case POP_XOR:
|
|
{
|
|
CharRange *cr1, *cr2, *cr3;
|
|
assert(stack_len >= 2);
|
|
assert(stack_len < POP_STACK_LEN_MAX);
|
|
cr1 = &stack[stack_len - 2];
|
|
cr2 = &stack[stack_len - 1];
|
|
cr3 = &stack[stack_len++];
|
|
cr_init(cr3, cr->mem_opaque, cr->realloc_func);
|
|
if (cr_op(cr3, cr1->points, cr1->len,
|
|
cr2->points, cr2->len, op - POP_UNION + CR_OP_UNION))
|
|
goto fail;
|
|
cr_free(cr1);
|
|
cr_free(cr2);
|
|
*cr1 = *cr3;
|
|
stack_len -= 2;
|
|
}
|
|
break;
|
|
case POP_INVERT:
|
|
assert(stack_len >= 1);
|
|
if (cr_invert(&stack[stack_len - 1]))
|
|
goto fail;
|
|
break;
|
|
case POP_END:
|
|
goto done;
|
|
default:
|
|
abort();
|
|
}
|
|
}
|
|
done:
|
|
assert(stack_len == 1);
|
|
ret = cr_copy(cr, &stack[0]);
|
|
cr_free(&stack[0]);
|
|
return ret;
|
|
fail:
|
|
for(i = 0; i < stack_len; i++)
|
|
cr_free(&stack[i]);
|
|
return -1;
|
|
}
|
|
|
|
static const uint32_t unicode_gc_mask_table[] = {
|
|
M(Lu) | M(Ll) | M(Lt), /* LC */
|
|
M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo), /* L */
|
|
M(Mn) | M(Mc) | M(Me), /* M */
|
|
M(Nd) | M(Nl) | M(No), /* N */
|
|
M(Sm) | M(Sc) | M(Sk) | M(So), /* S */
|
|
M(Pc) | M(Pd) | M(Ps) | M(Pe) | M(Pi) | M(Pf) | M(Po), /* P */
|
|
M(Zs) | M(Zl) | M(Zp), /* Z */
|
|
M(Cc) | M(Cf) | M(Cs) | M(Co) | M(Cn), /* C */
|
|
};
|
|
|
|
/* 'cr' must be initialized and empty. Return 0 if OK, -1 if error, -2
|
|
if not found */
|
|
int unicode_general_category(CharRange *cr, const char *gc_name)
|
|
{
|
|
int gc_idx;
|
|
uint32_t gc_mask;
|
|
|
|
gc_idx = unicode_find_name(unicode_gc_name_table, gc_name);
|
|
if (gc_idx < 0)
|
|
return -2;
|
|
if (gc_idx <= UNICODE_GC_Co) {
|
|
gc_mask = (uint64_t)1 << gc_idx;
|
|
} else {
|
|
gc_mask = unicode_gc_mask_table[gc_idx - UNICODE_GC_LC];
|
|
}
|
|
return unicode_general_category1(cr, gc_mask);
|
|
}
|
|
|
|
|
|
/* 'cr' must be initialized and empty. Return 0 if OK, -1 if error, -2
|
|
if not found */
|
|
int unicode_prop(CharRange *cr, const char *prop_name)
|
|
{
|
|
int prop_idx, ret;
|
|
|
|
prop_idx = unicode_find_name(unicode_prop_name_table, prop_name);
|
|
if (prop_idx < 0)
|
|
return -2;
|
|
prop_idx += UNICODE_PROP_ASCII_Hex_Digit;
|
|
|
|
ret = 0;
|
|
switch(prop_idx) {
|
|
case UNICODE_PROP_ASCII:
|
|
if (cr_add_interval(cr, 0x00, 0x7f + 1))
|
|
return -1;
|
|
break;
|
|
case UNICODE_PROP_Any:
|
|
if (cr_add_interval(cr, 0x00000, 0x10ffff + 1))
|
|
return -1;
|
|
break;
|
|
case UNICODE_PROP_Assigned:
|
|
ret = unicode_prop_ops(cr,
|
|
POP_GC, M(Cn),
|
|
POP_INVERT,
|
|
POP_END);
|
|
break;
|
|
case UNICODE_PROP_Math:
|
|
ret = unicode_prop_ops(cr,
|
|
POP_GC, M(Sm),
|
|
POP_PROP, UNICODE_PROP_Other_Math,
|
|
POP_UNION,
|
|
POP_END);
|
|
break;
|
|
case UNICODE_PROP_Lowercase:
|
|
ret = unicode_prop_ops(cr,
|
|
POP_GC, M(Ll),
|
|
POP_PROP, UNICODE_PROP_Other_Lowercase,
|
|
POP_UNION,
|
|
POP_END);
|
|
break;
|
|
case UNICODE_PROP_Uppercase:
|
|
ret = unicode_prop_ops(cr,
|
|
POP_GC, M(Lu),
|
|
POP_PROP, UNICODE_PROP_Other_Uppercase,
|
|
POP_UNION,
|
|
POP_END);
|
|
break;
|
|
case UNICODE_PROP_Cased:
|
|
ret = unicode_prop_ops(cr,
|
|
POP_GC, M(Lu) | M(Ll) | M(Lt),
|
|
POP_PROP, UNICODE_PROP_Other_Uppercase,
|
|
POP_UNION,
|
|
POP_PROP, UNICODE_PROP_Other_Lowercase,
|
|
POP_UNION,
|
|
POP_END);
|
|
break;
|
|
case UNICODE_PROP_Alphabetic:
|
|
ret = unicode_prop_ops(cr,
|
|
POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl),
|
|
POP_PROP, UNICODE_PROP_Other_Uppercase,
|
|
POP_UNION,
|
|
POP_PROP, UNICODE_PROP_Other_Lowercase,
|
|
POP_UNION,
|
|
POP_PROP, UNICODE_PROP_Other_Alphabetic,
|
|
POP_UNION,
|
|
POP_END);
|
|
break;
|
|
case UNICODE_PROP_Grapheme_Base:
|
|
ret = unicode_prop_ops(cr,
|
|
POP_GC, M(Cc) | M(Cf) | M(Cs) | M(Co) | M(Cn) | M(Zl) | M(Zp) | M(Me) | M(Mn),
|
|
POP_PROP, UNICODE_PROP_Other_Grapheme_Extend,
|
|
POP_UNION,
|
|
POP_INVERT,
|
|
POP_END);
|
|
break;
|
|
case UNICODE_PROP_Grapheme_Extend:
|
|
ret = unicode_prop_ops(cr,
|
|
POP_GC, M(Me) | M(Mn),
|
|
POP_PROP, UNICODE_PROP_Other_Grapheme_Extend,
|
|
POP_UNION,
|
|
POP_END);
|
|
break;
|
|
case UNICODE_PROP_XID_Start:
|
|
ret = unicode_prop_ops(cr,
|
|
POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl),
|
|
POP_PROP, UNICODE_PROP_Other_ID_Start,
|
|
POP_UNION,
|
|
POP_PROP, UNICODE_PROP_Pattern_Syntax,
|
|
POP_PROP, UNICODE_PROP_Pattern_White_Space,
|
|
POP_UNION,
|
|
POP_PROP, UNICODE_PROP_XID_Start1,
|
|
POP_UNION,
|
|
POP_INVERT,
|
|
POP_INTER,
|
|
POP_END);
|
|
break;
|
|
case UNICODE_PROP_XID_Continue:
|
|
ret = unicode_prop_ops(cr,
|
|
POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl) |
|
|
M(Mn) | M(Mc) | M(Nd) | M(Pc),
|
|
POP_PROP, UNICODE_PROP_Other_ID_Start,
|
|
POP_UNION,
|
|
POP_PROP, UNICODE_PROP_Other_ID_Continue,
|
|
POP_UNION,
|
|
POP_PROP, UNICODE_PROP_Pattern_Syntax,
|
|
POP_PROP, UNICODE_PROP_Pattern_White_Space,
|
|
POP_UNION,
|
|
POP_PROP, UNICODE_PROP_XID_Continue1,
|
|
POP_UNION,
|
|
POP_INVERT,
|
|
POP_INTER,
|
|
POP_END);
|
|
break;
|
|
case UNICODE_PROP_Changes_When_Uppercased:
|
|
ret = unicode_case1(cr, CASE_U);
|
|
break;
|
|
case UNICODE_PROP_Changes_When_Lowercased:
|
|
ret = unicode_case1(cr, CASE_L);
|
|
break;
|
|
case UNICODE_PROP_Changes_When_Casemapped:
|
|
ret = unicode_case1(cr, CASE_U | CASE_L | CASE_F);
|
|
break;
|
|
case UNICODE_PROP_Changes_When_Titlecased:
|
|
ret = unicode_prop_ops(cr,
|
|
POP_CASE, CASE_U,
|
|
POP_PROP, UNICODE_PROP_Changes_When_Titlecased1,
|
|
POP_XOR,
|
|
POP_END);
|
|
break;
|
|
case UNICODE_PROP_Changes_When_Casefolded:
|
|
ret = unicode_prop_ops(cr,
|
|
POP_CASE, CASE_F,
|
|
POP_PROP, UNICODE_PROP_Changes_When_Casefolded1,
|
|
POP_XOR,
|
|
POP_END);
|
|
break;
|
|
case UNICODE_PROP_Changes_When_NFKC_Casefolded:
|
|
ret = unicode_prop_ops(cr,
|
|
POP_CASE, CASE_F,
|
|
POP_PROP, UNICODE_PROP_Changes_When_NFKC_Casefolded1,
|
|
POP_XOR,
|
|
POP_END);
|
|
break;
|
|
#if 0
|
|
case UNICODE_PROP_ID_Start:
|
|
ret = unicode_prop_ops(cr,
|
|
POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl),
|
|
POP_PROP, UNICODE_PROP_Other_ID_Start,
|
|
POP_UNION,
|
|
POP_PROP, UNICODE_PROP_Pattern_Syntax,
|
|
POP_PROP, UNICODE_PROP_Pattern_White_Space,
|
|
POP_UNION,
|
|
POP_INVERT,
|
|
POP_INTER,
|
|
POP_END);
|
|
break;
|
|
case UNICODE_PROP_ID_Continue:
|
|
ret = unicode_prop_ops(cr,
|
|
POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl) |
|
|
M(Mn) | M(Mc) | M(Nd) | M(Pc),
|
|
POP_PROP, UNICODE_PROP_Other_ID_Start,
|
|
POP_UNION,
|
|
POP_PROP, UNICODE_PROP_Other_ID_Continue,
|
|
POP_UNION,
|
|
POP_PROP, UNICODE_PROP_Pattern_Syntax,
|
|
POP_PROP, UNICODE_PROP_Pattern_White_Space,
|
|
POP_UNION,
|
|
POP_INVERT,
|
|
POP_INTER,
|
|
POP_END);
|
|
break;
|
|
case UNICODE_PROP_Case_Ignorable:
|
|
ret = unicode_prop_ops(cr,
|
|
POP_GC, M(Mn) | M(Cf) | M(Lm) | M(Sk),
|
|
POP_PROP, UNICODE_PROP_Case_Ignorable1,
|
|
POP_XOR,
|
|
POP_END);
|
|
break;
|
|
#else
|
|
/* we use the existing tables */
|
|
case UNICODE_PROP_ID_Continue:
|
|
ret = unicode_prop_ops(cr,
|
|
POP_PROP, UNICODE_PROP_ID_Start,
|
|
POP_PROP, UNICODE_PROP_ID_Continue1,
|
|
POP_XOR,
|
|
POP_END);
|
|
break;
|
|
#endif
|
|
default:
|
|
if (prop_idx >= countof(unicode_prop_table))
|
|
return -2;
|
|
ret = unicode_prop1(cr, prop_idx);
|
|
break;
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
#endif /* CONFIG_ALL_UNICODE */
|