mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
synced 2024-10-31 16:38:12 +00:00
unicode: Add utf8-data module
utf8data.h contains a large database table which is an auto-generated decoding trie for the Unicode normalization functions. Allow building it into a separate module. Based on a patch from Shreeya Patel <shreeya.patel@collabora.com>. Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.com>
This commit is contained in:
parent
6ca99ce756
commit
2b3d047870
9 changed files with 126 additions and 91 deletions
|
@ -8,7 +8,16 @@ config UNICODE
|
|||
Say Y here to enable UTF-8 NFD normalization and NFD+CF casefolding
|
||||
support.
|
||||
|
||||
config UNICODE_UTF8_DATA
|
||||
tristate "UTF-8 normalization and casefolding tables"
|
||||
depends on UNICODE
|
||||
default UNICODE
|
||||
help
|
||||
This contains a large table of case foldings, which can be loaded as
|
||||
a separate module if you say M here. To be on the safe side, stick
|
||||
to the default of Y. Saying N here makes no sense; if you do not want
|
||||
utf8 casefolding support, disable CONFIG_UNICODE instead.
|
||||
|
||||
config UNICODE_NORMALIZATION_SELFTEST
|
||||
tristate "Test UTF-8 normalization support"
|
||||
depends on UNICODE
|
||||
default n
|
||||
depends on UNICODE_UTF8_DATA
|
||||
|
|
|
@ -2,14 +2,15 @@
|
|||
|
||||
obj-$(CONFIG_UNICODE) += unicode.o
|
||||
obj-$(CONFIG_UNICODE_NORMALIZATION_SELFTEST) += utf8-selftest.o
|
||||
obj-$(CONFIG_UNICODE_UTF8_DATA) += utf8data.o
|
||||
|
||||
unicode-y := utf8-norm.o utf8-core.o
|
||||
|
||||
$(obj)/utf8-norm.o: $(obj)/utf8data.h
|
||||
$(obj)/utf8-data.o: $(obj)/utf8data.c
|
||||
|
||||
# In the normal build, the checked-in utf8data.h is just shipped.
|
||||
# In the normal build, the checked-in utf8data.c is just shipped.
|
||||
#
|
||||
# To generate utf8data.h from UCD, put *.txt files in this directory
|
||||
# To generate utf8data.c from UCD, put *.txt files in this directory
|
||||
# and pass REGENERATE_UTF8DATA=1 from the command line.
|
||||
ifdef REGENERATE_UTF8DATA
|
||||
|
||||
|
@ -24,15 +25,15 @@ quiet_cmd_utf8data = GEN $@
|
|||
-t $(srctree)/$(src)/NormalizationTest.txt \
|
||||
-o $@
|
||||
|
||||
$(obj)/utf8data.h: $(obj)/mkutf8data $(filter %.txt, $(cmd_utf8data)) FORCE
|
||||
$(obj)/utf8data.c: $(obj)/mkutf8data $(filter %.txt, $(cmd_utf8data)) FORCE
|
||||
$(call if_changed,utf8data)
|
||||
|
||||
else
|
||||
|
||||
$(obj)/utf8data.h: $(src)/utf8data.h_shipped FORCE
|
||||
$(obj)/utf8data.c: $(src)/utf8data.c_shipped FORCE
|
||||
$(call if_changed,shipped)
|
||||
|
||||
endif
|
||||
|
||||
targets += utf8data.h
|
||||
targets += utf8data.c
|
||||
hostprogs += mkutf8data
|
||||
|
|
|
@ -3287,12 +3287,10 @@ static void write_file(void)
|
|||
open_fail(utf8_name, errno);
|
||||
|
||||
fprintf(file, "/* This file is generated code, do not edit. */\n");
|
||||
fprintf(file, "#ifndef __INCLUDED_FROM_UTF8NORM_C__\n");
|
||||
fprintf(file, "#error Only nls_utf8-norm.c should include this file.\n");
|
||||
fprintf(file, "#endif\n");
|
||||
fprintf(file, "\n");
|
||||
fprintf(file, "static const unsigned int utf8vers = %#x;\n",
|
||||
unicode_maxage);
|
||||
fprintf(file, "#include <linux/module.h>\n");
|
||||
fprintf(file, "#include <linux/kernel.h>\n");
|
||||
fprintf(file, "#include \"utf8n.h\"\n");
|
||||
fprintf(file, "\n");
|
||||
fprintf(file, "static const unsigned int utf8agetab[] = {\n");
|
||||
for (i = 0; i != ages_count; i++)
|
||||
|
@ -3339,6 +3337,22 @@ static void write_file(void)
|
|||
fprintf(file, "\n");
|
||||
}
|
||||
fprintf(file, "};\n");
|
||||
fprintf(file, "\n");
|
||||
fprintf(file, "struct utf8data_table utf8_data_table = {\n");
|
||||
fprintf(file, "\t.utf8agetab = utf8agetab,\n");
|
||||
fprintf(file, "\t.utf8agetab_size = ARRAY_SIZE(utf8agetab),\n");
|
||||
fprintf(file, "\n");
|
||||
fprintf(file, "\t.utf8nfdicfdata = utf8nfdicfdata,\n");
|
||||
fprintf(file, "\t.utf8nfdicfdata_size = ARRAY_SIZE(utf8nfdicfdata),\n");
|
||||
fprintf(file, "\n");
|
||||
fprintf(file, "\t.utf8nfdidata = utf8nfdidata,\n");
|
||||
fprintf(file, "\t.utf8nfdidata_size = ARRAY_SIZE(utf8nfdidata),\n");
|
||||
fprintf(file, "\n");
|
||||
fprintf(file, "\t.utf8data = utf8data,\n");
|
||||
fprintf(file, "};\n");
|
||||
fprintf(file, "EXPORT_SYMBOL_GPL(utf8_data_table);");
|
||||
fprintf(file, "\n");
|
||||
fprintf(file, "MODULE_LICENSE(\"GPL v2\");\n");
|
||||
fclose(file);
|
||||
}
|
||||
|
||||
|
|
|
@ -160,25 +160,45 @@ int utf8_normalize(const struct unicode_map *um, const struct qstr *str,
|
|||
}
|
||||
EXPORT_SYMBOL(utf8_normalize);
|
||||
|
||||
static const struct utf8data *find_table_version(const struct utf8data *table,
|
||||
size_t nr_entries, unsigned int version)
|
||||
{
|
||||
size_t i = nr_entries - 1;
|
||||
|
||||
while (version < table[i].maxage)
|
||||
i--;
|
||||
if (version > table[i].maxage)
|
||||
return NULL;
|
||||
return &table[i];
|
||||
}
|
||||
|
||||
struct unicode_map *utf8_load(unsigned int version)
|
||||
{
|
||||
struct unicode_map *um;
|
||||
|
||||
if (!utf8version_is_supported(version))
|
||||
return ERR_PTR(-EINVAL);
|
||||
|
||||
um = kzalloc(sizeof(struct unicode_map), GFP_KERNEL);
|
||||
if (!um)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
um->version = version;
|
||||
um->ntab[UTF8_NFDI] = utf8nfdi(version);
|
||||
|
||||
um->tables = symbol_request(utf8_data_table);
|
||||
if (!um->tables)
|
||||
goto out_free_um;
|
||||
|
||||
if (!utf8version_is_supported(um, version))
|
||||
goto out_symbol_put;
|
||||
um->ntab[UTF8_NFDI] = find_table_version(um->tables->utf8nfdidata,
|
||||
um->tables->utf8nfdidata_size, um->version);
|
||||
if (!um->ntab[UTF8_NFDI])
|
||||
goto out_free_um;
|
||||
um->ntab[UTF8_NFDICF] = utf8nfdicf(version);
|
||||
goto out_symbol_put;
|
||||
um->ntab[UTF8_NFDICF] = find_table_version(um->tables->utf8nfdicfdata,
|
||||
um->tables->utf8nfdicfdata_size, um->version);
|
||||
if (!um->ntab[UTF8_NFDICF])
|
||||
goto out_free_um;
|
||||
goto out_symbol_put;
|
||||
return um;
|
||||
|
||||
out_symbol_put:
|
||||
symbol_put(um->tables);
|
||||
out_free_um:
|
||||
kfree(um);
|
||||
return ERR_PTR(-EINVAL);
|
||||
|
@ -187,7 +207,10 @@ EXPORT_SYMBOL(utf8_load);
|
|||
|
||||
void utf8_unload(struct unicode_map *um)
|
||||
{
|
||||
if (um) {
|
||||
symbol_put(utf8_data_table);
|
||||
kfree(um);
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL(utf8_unload);
|
||||
|
||||
|
|
|
@ -6,21 +6,12 @@
|
|||
|
||||
#include "utf8n.h"
|
||||
|
||||
struct utf8data {
|
||||
unsigned int maxage;
|
||||
unsigned int offset;
|
||||
};
|
||||
|
||||
#define __INCLUDED_FROM_UTF8NORM_C__
|
||||
#include "utf8data.h"
|
||||
#undef __INCLUDED_FROM_UTF8NORM_C__
|
||||
|
||||
int utf8version_is_supported(unsigned int version)
|
||||
int utf8version_is_supported(const struct unicode_map *um, unsigned int version)
|
||||
{
|
||||
int i = ARRAY_SIZE(utf8agetab) - 1;
|
||||
int i = um->tables->utf8agetab_size - 1;
|
||||
|
||||
while (i >= 0 && utf8agetab[i] != 0) {
|
||||
if (version == utf8agetab[i])
|
||||
while (i >= 0 && um->tables->utf8agetab[i] != 0) {
|
||||
if (version == um->tables->utf8agetab[i])
|
||||
return 1;
|
||||
i--;
|
||||
}
|
||||
|
@ -161,7 +152,7 @@ typedef const unsigned char utf8trie_t;
|
|||
* underlying datatype: unsigned char.
|
||||
*
|
||||
* leaf[0]: The unicode version, stored as a generation number that is
|
||||
* an index into utf8agetab[]. With this we can filter code
|
||||
* an index into ->utf8agetab[]. With this we can filter code
|
||||
* points based on the unicode version in which they were
|
||||
* defined. The CCC of a non-defined code point is 0.
|
||||
* leaf[1]: Canonical Combining Class. During normalization, we need
|
||||
|
@ -313,7 +304,7 @@ static utf8leaf_t *utf8nlookup(const struct unicode_map *um,
|
|||
enum utf8_normalization n, unsigned char *hangul, const char *s,
|
||||
size_t len)
|
||||
{
|
||||
utf8trie_t *trie = utf8data + um->ntab[n]->offset;
|
||||
utf8trie_t *trie = um->tables->utf8data + um->ntab[n]->offset;
|
||||
int offlen;
|
||||
int offset;
|
||||
int mask;
|
||||
|
@ -404,7 +395,8 @@ ssize_t utf8nlen(const struct unicode_map *um, enum utf8_normalization n,
|
|||
leaf = utf8nlookup(um, n, hangul, s, len);
|
||||
if (!leaf)
|
||||
return -1;
|
||||
if (utf8agetab[LEAF_GEN(leaf)] > um->ntab[n]->maxage)
|
||||
if (um->tables->utf8agetab[LEAF_GEN(leaf)] >
|
||||
um->ntab[n]->maxage)
|
||||
ret += utf8clen(s);
|
||||
else if (LEAF_CCC(leaf) == DECOMPOSE)
|
||||
ret += strlen(LEAF_STR(leaf));
|
||||
|
@ -520,7 +512,7 @@ int utf8byte(struct utf8cursor *u8c)
|
|||
|
||||
ccc = LEAF_CCC(leaf);
|
||||
/* Characters that are too new have CCC 0. */
|
||||
if (utf8agetab[LEAF_GEN(leaf)] >
|
||||
if (u8c->um->tables->utf8agetab[LEAF_GEN(leaf)] >
|
||||
u8c->um->ntab[u8c->n]->maxage) {
|
||||
ccc = STOPPER;
|
||||
} else if (ccc == DECOMPOSE) {
|
||||
|
@ -597,25 +589,3 @@ int utf8byte(struct utf8cursor *u8c)
|
|||
}
|
||||
}
|
||||
EXPORT_SYMBOL(utf8byte);
|
||||
|
||||
const struct utf8data *utf8nfdi(unsigned int maxage)
|
||||
{
|
||||
int i = ARRAY_SIZE(utf8nfdidata) - 1;
|
||||
|
||||
while (maxage < utf8nfdidata[i].maxage)
|
||||
i--;
|
||||
if (maxage > utf8nfdidata[i].maxage)
|
||||
return NULL;
|
||||
return &utf8nfdidata[i];
|
||||
}
|
||||
|
||||
const struct utf8data *utf8nfdicf(unsigned int maxage)
|
||||
{
|
||||
int i = ARRAY_SIZE(utf8nfdicfdata) - 1;
|
||||
|
||||
while (maxage < utf8nfdicfdata[i].maxage)
|
||||
i--;
|
||||
if (maxage > utf8nfdicfdata[i].maxage)
|
||||
return NULL;
|
||||
return &utf8nfdicfdata[i];
|
||||
}
|
||||
|
|
|
@ -255,21 +255,21 @@ static void check_utf8_comparisons(struct unicode_map *table)
|
|||
}
|
||||
}
|
||||
|
||||
static void check_supported_versions(void)
|
||||
static void check_supported_versions(struct unicode_map *um)
|
||||
{
|
||||
/* Unicode 7.0.0 should be supported. */
|
||||
test(utf8version_is_supported(UNICODE_AGE(7, 0, 0)));
|
||||
test(utf8version_is_supported(um, UNICODE_AGE(7, 0, 0)));
|
||||
|
||||
/* Unicode 9.0.0 should be supported. */
|
||||
test(utf8version_is_supported(UNICODE_AGE(9, 0, 0)));
|
||||
test(utf8version_is_supported(um, UNICODE_AGE(9, 0, 0)));
|
||||
|
||||
/* Unicode 1x.0.0 (the latest version) should be supported. */
|
||||
test(utf8version_is_supported(UTF8_LATEST));
|
||||
test(utf8version_is_supported(um, UTF8_LATEST));
|
||||
|
||||
/* Next versions don't exist. */
|
||||
test(!utf8version_is_supported(UNICODE_AGE(13, 0, 0)));
|
||||
test(!utf8version_is_supported(UNICODE_AGE(0, 0, 0)));
|
||||
test(!utf8version_is_supported(UNICODE_AGE(-1, -1, -1)));
|
||||
test(!utf8version_is_supported(um, UNICODE_AGE(13, 0, 0)));
|
||||
test(!utf8version_is_supported(um, UNICODE_AGE(0, 0, 0)));
|
||||
test(!utf8version_is_supported(um, UNICODE_AGE(-1, -1, -1)));
|
||||
}
|
||||
|
||||
static int __init init_test_ucd(void)
|
||||
|
@ -285,7 +285,7 @@ static int __init init_test_ucd(void)
|
|||
return PTR_ERR(um);
|
||||
}
|
||||
|
||||
check_supported_versions();
|
||||
check_supported_versions(um);
|
||||
check_utf8_nfdi(um);
|
||||
check_utf8_nfdicf(um);
|
||||
check_utf8_comparisons(um);
|
||||
|
|
|
@ -1,9 +1,8 @@
|
|||
/* This file is generated code, do not edit. */
|
||||
#ifndef __INCLUDED_FROM_UTF8NORM_C__
|
||||
#error Only nls_utf8-norm.c should include this file.
|
||||
#endif
|
||||
|
||||
static const unsigned int utf8vers = 0xc0100;
|
||||
#include <linux/module.h>
|
||||
#include <linux/kernel.h>
|
||||
#include "utf8n.h"
|
||||
|
||||
static const unsigned int utf8agetab[] = {
|
||||
0,
|
||||
|
@ -4107,3 +4106,18 @@ static const unsigned char utf8data[64256] = {
|
|||
0x52,0x04,0x00,0x00,0x11,0x04,0x00,0x00,0x02,0x00,0xcf,0x86,0xcf,0x06,0x02,0x00,
|
||||
0x81,0x80,0xcf,0x86,0x85,0x84,0xcf,0x86,0xcf,0x06,0x02,0x00,0x00,0x00,0x00,0x00
|
||||
};
|
||||
|
||||
struct utf8data_table utf8_data_table = {
|
||||
.utf8agetab = utf8agetab,
|
||||
.utf8agetab_size = ARRAY_SIZE(utf8agetab),
|
||||
|
||||
.utf8nfdicfdata = utf8nfdicfdata,
|
||||
.utf8nfdicfdata_size = ARRAY_SIZE(utf8nfdicfdata),
|
||||
|
||||
.utf8nfdidata = utf8nfdidata,
|
||||
.utf8nfdidata_size = ARRAY_SIZE(utf8nfdidata),
|
||||
|
||||
.utf8data = utf8data,
|
||||
};
|
||||
EXPORT_SYMBOL_GPL(utf8_data_table);
|
||||
MODULE_LICENSE("GPL v2");
|
|
@ -13,25 +13,7 @@
|
|||
#include <linux/module.h>
|
||||
#include <linux/unicode.h>
|
||||
|
||||
int utf8version_is_supported(unsigned int version);
|
||||
|
||||
/*
|
||||
* Look for the correct const struct utf8data for a unicode version.
|
||||
* Returns NULL if the version requested is too new.
|
||||
*
|
||||
* Two normalization forms are supported: nfdi and nfdicf.
|
||||
*
|
||||
* nfdi:
|
||||
* - Apply unicode normalization form NFD.
|
||||
* - Remove any Default_Ignorable_Code_Point.
|
||||
*
|
||||
* nfdicf:
|
||||
* - Apply unicode normalization form NFD.
|
||||
* - Remove any Default_Ignorable_Code_Point.
|
||||
* - Apply a full casefold (C + F).
|
||||
*/
|
||||
extern const struct utf8data *utf8nfdi(unsigned int maxage);
|
||||
extern const struct utf8data *utf8nfdicf(unsigned int maxage);
|
||||
int utf8version_is_supported(const struct unicode_map *um, unsigned int version);
|
||||
|
||||
/*
|
||||
* Determine the length of the normalized form of the string,
|
||||
|
@ -78,4 +60,24 @@ int utf8ncursor(struct utf8cursor *u8c, const struct unicode_map *um,
|
|||
*/
|
||||
extern int utf8byte(struct utf8cursor *u8c);
|
||||
|
||||
struct utf8data {
|
||||
unsigned int maxage;
|
||||
unsigned int offset;
|
||||
};
|
||||
|
||||
struct utf8data_table {
|
||||
const unsigned int *utf8agetab;
|
||||
int utf8agetab_size;
|
||||
|
||||
const struct utf8data *utf8nfdicfdata;
|
||||
int utf8nfdicfdata_size;
|
||||
|
||||
const struct utf8data *utf8nfdidata;
|
||||
int utf8nfdidata_size;
|
||||
|
||||
const unsigned char *utf8data;
|
||||
};
|
||||
|
||||
extern struct utf8data_table utf8_data_table;
|
||||
|
||||
#endif /* UTF8NORM_H */
|
||||
|
|
|
@ -6,6 +6,7 @@
|
|||
#include <linux/dcache.h>
|
||||
|
||||
struct utf8data;
|
||||
struct utf8data_table;
|
||||
|
||||
#define UNICODE_MAJ_SHIFT 16
|
||||
#define UNICODE_MIN_SHIFT 8
|
||||
|
@ -49,6 +50,7 @@ enum utf8_normalization {
|
|||
struct unicode_map {
|
||||
unsigned int version;
|
||||
const struct utf8data *ntab[UTF8_NMAX];
|
||||
const struct utf8data_table *tables;
|
||||
};
|
||||
|
||||
int utf8_validate(const struct unicode_map *um, const struct qstr *str);
|
||||
|
|
Loading…
Reference in a new issue