From 03e4ef0293b15b6623cdbab36d09aa13bc123f55 Mon Sep 17 00:00:00 2001 From: Vladimir 'phcoder' Serbinenko Date: Wed, 24 Mar 2010 00:35:38 +0100 Subject: [PATCH] Import data for arabic joining computation --- autogen.sh | 2 +- include/grub/unicode.h | 31 +++- unicode/ArabicShaping.txt | 373 ++++++++++++++++++++++++++++++++++++++ util/import_unicode.py | 48 ++++- 4 files changed, 445 insertions(+), 9 deletions(-) create mode 100644 unicode/ArabicShaping.txt diff --git a/autogen.sh b/autogen.sh index 7cbee5cca..f9d55318f 100755 --- a/autogen.sh +++ b/autogen.sh @@ -13,7 +13,7 @@ echo timestamp > stamp-h.in python util/import_gcry.py lib/libgcrypt/ . -python util/import_unicode.py unicode/UnicodeData.txt unicode/BidiMirroring.txt unidata.c +python util/import_unicode.py unicode/UnicodeData.txt unicode/BidiMirroring.txt unicode/ArabicShaping.txt unidata.c for rmk in conf/*.rmk ${GRUB_CONTRIB}/*/conf/*.rmk; do if test -e $rmk ; then diff --git a/include/grub/unicode.h b/include/grub/unicode.h index 457bcd93b..28c752b9e 100644 --- a/include/grub/unicode.h +++ b/include/grub/unicode.h @@ -36,6 +36,7 @@ struct grub_unicode_compact_range grub_uint8_t bidi_type:5; grub_uint8_t comb_type; grub_uint8_t bidi_mirror:1; + grub_uint8_t join_type:3; } __attribute__ ((packed)); enum grub_bidi_type @@ -61,6 +62,16 @@ enum grub_bidi_type GRUB_BIDI_TYPE_ON }; +enum grub_join_type + { + GRUB_JOIN_TYPE_NONJOINING = 0, + GRUB_JOIN_TYPE_LEFT = 1, + GRUB_JOIN_TYPE_RIGHT = 2, + GRUB_JOIN_TYPE_DUAL = 3, + GRUB_JOIN_TYPE_CAUSING = 4, + GRUB_JOIN_TYPE_TRANSPARENT = 5 + }; + enum grub_comb_type { GRUB_UNICODE_COMB_NONE = 0, @@ -103,7 +114,7 @@ struct grub_unicode_glyph { grub_uint32_t base; grub_uint16_t variant:9; - grub_uint8_t attributes:1; + grub_uint8_t attributes:5; grub_size_t ncomb; struct grub_unicode_combining { grub_uint32_t code; @@ -115,12 +126,30 @@ struct grub_unicode_glyph }; #define GRUB_UNICODE_GLYPH_ATTRIBUTE_MIRROR 0x1 +#define GRUB_UNICODE_GLYPH_ATTRIBUTES_JOIN_LEFT_TO_RIGHT_SHIFT 1 +#define GRUB_UNICODE_GLYPH_ATTRIBUTE_LEFT_JOINED 0x2 +#define GRUB_UNICODE_GLYPH_ATTRIBUTE_RIGHT_JOINED \ + (GRUB_UNICODE_GLYPH_ATTRIBUTE_LEFT_JOINED \ + << GRUB_UNICODE_GLYPH_ATTRIBUTES_JOIN_LEFT_TO_RIGHT_SHIFT) +/* Set iff the corresponding joining flags come from ZWJ or ZWNJ. */ +#define GRUB_UNICODE_GLYPH_ATTRIBUTE_LEFT_JOINED_EXPLICIT 0x8 +#define GRUB_UNICODE_GLYPH_ATTRIBUTE_RIGHT_JOINED_EXPLICIT \ + (GRUB_UNICODE_GLYPH_ATTRIBUTE_LEFT_JOINED_EXPLICIT \ + << GRUB_UNICODE_GLYPH_ATTRIBUTES_JOIN_LEFT_TO_RIGHT_SHIFT) +#define GRUB_UNICODE_GLYPH_ATTRIBUTES_JOIN \ + (GRUB_UNICODE_GLYPH_ATTRIBUTE_LEFT_JOINED \ + | GRUB_UNICODE_GLYPH_ATTRIBUTE_RIGHT_JOINED \ + | GRUB_UNICODE_GLYPH_ATTRIBUTE_LEFT_JOINED_EXPLICIT \ + | GRUB_UNICODE_GLYPH_ATTRIBUTE_RIGHT_JOINED_EXPLICIT) + #define GRUB_UNICODE_COMBINING_GRAPHEME_JOINER 0x34f #define GRUB_UNICODE_VARIATION_SELECTOR_1 0xfe00 #define GRUB_UNICODE_VARIATION_SELECTOR_16 0xfe0f #define GRUB_UNICODE_VARIATION_SELECTOR_17 0xe0100 #define GRUB_UNICODE_VARIATION_SELECTOR_256 0xe01ef #define GRUB_UNICODE_HEBREW_WAW 0x05d5 +#define GRUB_UNICODE_ZWNJ 0x200c +#define GRUB_UNICODE_ZWJ 0x200d extern struct grub_unicode_compact_range grub_unicode_compact[]; extern struct grub_unicode_bidi_pair grub_unicode_bidi_pairs[]; diff --git a/unicode/ArabicShaping.txt b/unicode/ArabicShaping.txt new file mode 100644 index 000000000..b851d3839 --- /dev/null +++ b/unicode/ArabicShaping.txt @@ -0,0 +1,373 @@ +# ArabicShaping-5.2.0.txt +# Date: 2009-08-17, 11:11:00 PDT [KW] +# +# This file is a normative contributory data file in the +# Unicode Character Database. +# +# Copyright (c) 1991-2009 Unicode, Inc. +# For terms of use, see http://www.unicode.org/terms_of_use.html +# +# This file defines the shaping classes for Arabic, Syriac, and N'Ko +# positional shaping, repeating in machine readable form the +# information exemplified in Tables 8-3, 8-7, 8-8, 8-11, 8-12, +# 8-13, and 13-5 of The Unicode Standard, Version 5.2. +# +# See sections 8.2, 8.3, and 13.5 of The Unicode Standard, Version 5.2 +# for more information. +# +# Each line contains four fields, separated by a semicolon. +# +# Field 0: the code point, in 4-digit hexadecimal +# form, of an Arabic, Syriac, or N'Ko character. +# +# Field 1: gives a short schematic name for that character, +# abbreviated from the normative Unicode character name. +# +# Field 2: defines the joining type (property name: Joining_Type) +# R Right_Joining +# L Left_Joining +# D Dual_Joining +# C Join_Causing +# U Non_Joining +# T Transparent +# See Section 8.2, Arabic for more information on these types. +# +# Field 3: defines the joining group (property name: Joining_Group) +# +# The values of the joining group are based schematically on character +# names. Where a schematic character name consists of two or more parts separated +# by spaces, the formal Joining_Group property value, as specified in +# PropertyValueAliases.txt, consists of the same name parts joined by +# underscores. Hence, the entry: +# +# 0629; TEH MARBUTA; R; TEH MARBUTA +# +# corresponds to [Joining_Group = Teh_Marbuta]. +# +# Note: For historical reasons, the property value [Joining_Group = Hamza_On_Heh_Goal] +# is anachronistically named. It used to apply to both of the following characters +# in earlier versions of the standard: +# +# U+06C2 ARABIC LETTER HEH GOAL WITH HAMZA ABOVE +# U+06C3 ARABIC LETTER TEH MARBUTA GOAL +# +# However, it currently applies only to U+06C3, and *not* to U+06C2. +# To avoid destabilizing existing Joining_Group property aliases, the +# value Hamza_On_Heh_Goal has not been changed, despite the fact that it +# no longer applies to Hamza On Heh Goal, but only to Teh Marbuta Goal. +# +# When other cursive scripts are added to the Unicode Standard in +# the future, the joining group value of all its letters will default +# to jg=No_Joining_Group in this data file. Other, more specific +# joining group values will be defined only if an explicit proposal +# to define those values exactly has been approved by the UTC. This +# is the convention exemplified by the N'Ko script. Only the Arabic +# and Syriac scripts currently have explicit joining group values defined. +# +# Note: Code points that are not explicitly listed in this file are +# either of joining type T or U: +# +# - Those that not explicitly listed that are of General Category Mn, Me, or Cf +# have joining type T. +# - All others not explicitly listed have joining type U. +# +# For an explicit listing of characters of joining type T, see +# the derived property file DerivedJoiningType.txt. +# +# There are currently no characters of joining type L defined in Unicode. +# +# ############################################################# + +# Unicode; Schematic Name; Joining Type; Joining Group + +# Arabic characters + +0600; ARABIC NUMBER SIGN; U; No_Joining_Group +0601; ARABIC SIGN SANAH; U; No_Joining_Group +0602; ARABIC FOOTNOTE MARKER; U; No_Joining_Group +0603; ARABIC SIGN SAFHA; U; No_Joining_Group +0608; ARABIC RAY; U; No_Joining_Group +060B; AFGHANI SIGN; U; No_Joining_Group +0621; HAMZA; U; No_Joining_Group +0622; MADDA ON ALEF; R; ALEF +0623; HAMZA ON ALEF; R; ALEF +0624; HAMZA ON WAW; R; WAW +0625; HAMZA UNDER ALEF; R; ALEF +0626; HAMZA ON YEH; D; YEH +0627; ALEF; R; ALEF +0628; BEH; D; BEH +0629; TEH MARBUTA; R; TEH MARBUTA +062A; TEH; D; BEH +062B; THEH; D; BEH +062C; JEEM; D; HAH +062D; HAH; D; HAH +062E; KHAH; D; HAH +062F; DAL; R; DAL +0630; THAL; R; DAL +0631; REH; R; REH +0632; ZAIN; R; REH +0633; SEEN; D; SEEN +0634; SHEEN; D; SEEN +0635; SAD; D; SAD +0636; DAD; D; SAD +0637; TAH; D; TAH +0638; ZAH; D; TAH +0639; AIN; D; AIN +063A; GHAIN; D; AIN +063B; KEHEH WITH 2 DOTS ABOVE; D; GAF +063C; KEHEH WITH 3 DOTS BELOW; D; GAF +063D; FARSI YEH WITH INVERTED V; D; FARSI YEH +063E; FARSI YEH WITH 2 DOTS ABOVE; D; FARSI YEH +063F; FARSI YEH WITH 3 DOTS ABOVE; D; FARSI YEH +0640; TATWEEL; C; No_Joining_Group +0641; FEH; D; FEH +0642; QAF; D; QAF +0643; KAF; D; KAF +0644; LAM; D; LAM +0645; MEEM; D; MEEM +0646; NOON; D; NOON +0647; HEH; D; HEH +0648; WAW; R; WAW +0649; ALEF MAKSURA; D; YEH +064A; YEH; D; YEH +066E; DOTLESS BEH; D; BEH +066F; DOTLESS QAF; D; QAF +0671; HAMZAT WASL ON ALEF; R; ALEF +0672; WAVY HAMZA ON ALEF; R; ALEF +0673; WAVY HAMZA UNDER ALEF; R; ALEF +0674; HIGH HAMZA; U; No_Joining_Group +0675; HIGH HAMZA ALEF; R; ALEF +0676; HIGH HAMZA WAW; R; WAW +0677; HIGH HAMZA WAW WITH DAMMA; R; WAW +0678; HIGH HAMZA YEH; D; YEH +0679; TEH WITH SMALL TAH; D; BEH +067A; TEH WITH 2 DOTS VERTICAL ABOVE; D; BEH +067B; BEH WITH 2 DOTS VERTICAL BELOW; D; BEH +067C; TEH WITH RING; D; BEH +067D; TEH WITH 3 DOTS ABOVE DOWNWARD; D; BEH +067E; TEH WITH 3 DOTS BELOW; D; BEH +067F; TEH WITH 4 DOTS ABOVE; D; BEH +0680; BEH WITH 4 DOTS BELOW; D; BEH +0681; HAMZA ON HAH; D; HAH +0682; HAH WITH 2 DOTS VERTICAL ABOVE; D; HAH +0683; HAH WITH MIDDLE 2 DOTS; D; HAH +0684; HAH WITH MIDDLE 2 DOTS VERTICAL; D; HAH +0685; HAH WITH 3 DOTS ABOVE; D; HAH +0686; HAH WITH MIDDLE 3 DOTS DOWNWARD; D; HAH +0687; HAH WITH MIDDLE 4 DOTS; D; HAH +0688; DAL WITH SMALL TAH; R; DAL +0689; DAL WITH RING; R; DAL +068A; DAL WITH DOT BELOW; R; DAL +068B; DAL WITH DOT BELOW AND SMALL TAH; R; DAL +068C; DAL WITH 2 DOTS ABOVE; R; DAL +068D; DAL WITH 2 DOTS BELOW; R; DAL +068E; DAL WITH 3 DOTS ABOVE; R; DAL +068F; DAL WITH 3 DOTS ABOVE DOWNWARD; R; DAL +0690; DAL WITH 4 DOTS ABOVE; R; DAL +0691; REH WITH SMALL TAH; R; REH +0692; REH WITH SMALL V; R; REH +0693; REH WITH RING; R; REH +0694; REH WITH DOT BELOW; R; REH +0695; REH WITH SMALL V BELOW; R; REH +0696; REH WITH DOT BELOW AND DOT ABOVE; R; REH +0697; REH WITH 2 DOTS ABOVE; R; REH +0698; REH WITH 3 DOTS ABOVE; R; REH +0699; REH WITH 4 DOTS ABOVE; R; REH +069A; SEEN WITH DOT BELOW AND DOT ABOVE; D; SEEN +069B; SEEN WITH 3 DOTS BELOW; D; SEEN +069C; SEEN WITH 3 DOTS BELOW AND 3 DOTS ABOVE; D; SEEN +069D; SAD WITH 2 DOTS BELOW; D; SAD +069E; SAD WITH 3 DOTS ABOVE; D; SAD +069F; TAH WITH 3 DOTS ABOVE; D; TAH +06A0; AIN WITH 3 DOTS ABOVE; D; AIN +06A1; DOTLESS FEH; D; FEH +06A2; FEH WITH DOT MOVED BELOW; D; FEH +06A3; FEH WITH DOT BELOW; D; FEH +06A4; FEH WITH 3 DOTS ABOVE; D; FEH +06A5; FEH WITH 3 DOTS BELOW; D; FEH +06A6; FEH WITH 4 DOTS ABOVE; D; FEH +06A7; QAF WITH DOT ABOVE; D; QAF +06A8; QAF WITH 3 DOTS ABOVE; D; QAF +06A9; KEHEH; D; GAF +06AA; SWASH KAF; D; SWASH KAF +06AB; KAF WITH RING; D; GAF +06AC; KAF WITH DOT ABOVE; D; KAF +06AD; KAF WITH 3 DOTS ABOVE; D; KAF +06AE; KAF WITH 3 DOTS BELOW; D; KAF +06AF; GAF; D; GAF +06B0; GAF WITH RING; D; GAF +06B1; GAF WITH 2 DOTS ABOVE; D; GAF +06B2; GAF WITH 2 DOTS BELOW; D; GAF +06B3; GAF WITH 2 DOTS VERTICAL BELOW; D; GAF +06B4; GAF WITH 3 DOTS ABOVE; D; GAF +06B5; LAM WITH SMALL V; D; LAM +06B6; LAM WITH DOT ABOVE; D; LAM +06B7; LAM WITH 3 DOTS ABOVE; D; LAM +06B8; LAM WITH 3 DOTS BELOW; D; LAM +06B9; NOON WITH DOT BELOW; D; NOON +06BA; DOTLESS NOON; D; NOON +06BB; DOTLESS NOON WITH SMALL TAH; D; NOON +06BC; NOON WITH RING; D; NOON +06BD; NYA; D; NYA +06BE; KNOTTED HEH; D; KNOTTED HEH +06BF; HAH WITH MIDDLE 3 DOTS DOWNWARD AND DOT ABOVE; D; HAH +06C0; HAMZA ON HEH; R; TEH MARBUTA +06C1; HEH GOAL; D; HEH GOAL +06C2; HAMZA ON HEH GOAL; D; HEH GOAL +06C3; TEH MARBUTA GOAL; R; HAMZA ON HEH GOAL +06C4; WAW WITH RING; R; WAW +06C5; WAW WITH BAR; R; WAW +06C6; WAW WITH SMALL V; R; WAW +06C7; WAW WITH DAMMA; R; WAW +06C8; WAW WITH ALEF ABOVE; R; WAW +06C9; WAW WITH INVERTED SMALL V; R; WAW +06CA; WAW WITH 2 DOTS ABOVE; R; WAW +06CB; WAW WITH 3 DOTS ABOVE; R; WAW +06CC; FARSI YEH; D; FARSI YEH +06CD; YEH WITH TAIL; R; YEH WITH TAIL +06CE; FARSI YEH WITH SMALL V; D; FARSI YEH +06CF; WAW WITH DOT ABOVE; R; WAW +06D0; YEH WITH 2 DOTS VERTICAL BELOW; D; YEH +06D1; YEH WITH 3 DOTS BELOW; D; YEH +06D2; YEH BARREE; R; YEH BARREE +06D3; HAMZA ON YEH BARREE; R; YEH BARREE +06D5; AE; R; TEH MARBUTA +06DD; ARABIC END OF AYAH; U; No_Joining_Group +06EE; DAL WITH INVERTED V; R; DAL +06EF; REH WITH INVERTED V; R; REH +06FA; SEEN WITH DOT BELOW AND 3 DOTS ABOVE; D; SEEN +06FB; DAD WITH DOT BELOW; D; SAD +06FC; GHAIN WITH DOT BELOW; D; AIN +06FF; HEH WITH INVERTED V; D; KNOTTED HEH + +# Syriac characters + +0710; ALAPH; R; ALAPH +0712; BETH; D; BETH +0713; GAMAL; D; GAMAL +0714; GAMAL GARSHUNI; D; GAMAL +0715; DALATH; R; DALATH RISH +0716; DOTLESS DALATH RISH; R; DALATH RISH +0717; HE; R; HE +0718; WAW; R; SYRIAC WAW +0719; ZAIN; R; ZAIN +071A; HETH; D; HETH +071B; TETH; D; TETH +071C; TETH GARSHUNI; D; TETH +071D; YUDH; D; YUDH +071E; YUDH HE; R; YUDH HE +071F; KAPH; D; KAPH +0720; LAMADH; D; LAMADH +0721; MIM; D; MIM +0722; NUN; D; NUN +0723; SEMKATH; D; SEMKATH +0724; FINAL SEMKATH; D; FINAL SEMKATH +0725; E; D; E +0726; PE; D; PE +0727; REVERSED PE; D; REVERSED PE +0728; SADHE; R; SADHE +0729; QAPH; D; QAPH +072A; RISH; R; DALATH RISH +072B; SHIN; D; SHIN +072C; TAW; R; TAW +072D; PERSIAN BHETH; D; BETH +072E; PERSIAN GHAMAL; D; GAMAL +072F; PERSIAN DHALATH; R; DALATH RISH +074D; SOGDIAN ZHAIN; R; ZHAIN +074E; SOGDIAN KHAPH; D; KHAPH +074F; SOGDIAN FE; D; FE + +# Arabic supplement characters + +0750; BEH WITH 3 DOTS HORIZONTALLY BELOW; D; BEH +0751; BEH WITH DOT BELOW AND 3 DOTS ABOVE; D; BEH +0752; BEH WITH 3 DOTS POINTING UPWARDS BELOW; D; BEH +0753; BEH WITH 3 DOTS POINTING UPWARDS BELOW AND 2 DOTS ABOVE; D; BEH +0754; BEH WITH 2 DOTS BELOW AND DOT ABOVE; D; BEH +0755; BEH WITH INVERTED SMALL V BELOW; D; BEH +0756; BEH WITH SMALL V; D; BEH +0757; HAH WITH 2 DOTS ABOVE; D; HAH +0758; HAH WITH 3 DOTS POINTING UPWARDS BELOW; D; HAH +0759; DAL WITH 2 DOTS VERTICALLY BELOW AND SMALL TAH; R; DAL +075A; DAL WITH INVERTED SMALL V BELOW; R; DAL +075B; REH WITH STROKE; R; REH +075C; SEEN WITH 4 DOTS ABOVE; D; SEEN +075D; AIN WITH 2 DOTS ABOVE; D; AIN +075E; AIN WITH 3 DOTS POINTING DOWNWARDS ABOVE; D; AIN +075F; AIN WITH 2 DOTS VERTICALLY ABOVE; D; AIN +0760; FEH WITH 2 DOTS BELOW; D; FEH +0761; FEH WITH 3 DOTS POINTING UPWARDS BELOW; D; FEH +0762; KEHEH WITH DOT ABOVE; D; GAF +0763; KEHEH WITH 3 DOTS ABOVE; D; GAF +0764; KEHEH WITH 3 DOTS POINTING UPWARDS BELOW; D; GAF +0765; MEEM WITH DOT ABOVE; D; MEEM +0766; MEEM WITH DOT BELOW; D; MEEM +0767; NOON WITH 2 DOTS BELOW; D; NOON +0768; NOON WITH SMALL TAH; D; NOON +0769; NOON WITH SMALL V; D; NOON +076A; LAM WITH BAR; D; LAM +076B; REH WITH 2 DOTS VERTICALLY ABOVE; R; REH +076C; REH WITH HAMZA ABOVE; R; REH +076D; SEEN WITH 2 DOTS VERTICALLY ABOVE; D; SEEN +076E; HAH WITH SMALL TAH BELOW; D; HAH +076F; HAH WITH SMALL TAH AND 2 DOTS; D; HAH +0770; SEEN WITH SMALL TAH AND 2 DOTS; D; SEEN +0771; REH WITH SMALL TAH AND 2 DOTS; R; REH +0772; HAH WITH SMALL TAH ABOVE; D; HAH +0773; ALEF WITH DIGIT TWO ABOVE; R; ALEF +0774; ALEF WITH DIGIT THREE ABOVE; R; ALEF +0775; FARSI YEH WITH DIGIT TWO ABOVE; D; FARSI YEH +0776; FARSI YEH WITH DIGIT THREE ABOVE; D; FARSI YEH +0777; YEH WITH DIGIT FOUR BELOW; D; YEH +0778; WAW WITH DIGIT TWO ABOVE; R; WAW +0779; WAW WITH DIGIT THREE ABOVE; R; WAW +077A; YEH BARREE WITH DIGIT TWO ABOVE; D; BURUSHASKI YEH BARREE +077B; YEH BARREE WITH DIGIT THREE ABOVE; D; BURUSHASKI YEH BARREE +077C; HAH WITH DIGIT FOUR BELOW; D; HAH +077D; SEEN WITH DIGIT FOUR ABOVE; D; SEEN +077E; SEEN WITH INVERTED V; D; SEEN +077F; KAF WITH 2 DOTS ABOVE; D; KAF + +# N'Ko Characters + +07CA; NKO A; D; No_Joining_Group +07CB; NKO EE; D; No_Joining_Group +07CC; NKO I; D; No_Joining_Group +07CD; NKO E; D; No_Joining_Group +07CE; NKO U; D; No_Joining_Group +07CF; NKO OO; D; No_Joining_Group +07D0; NKO O; D; No_Joining_Group +07D1; NKO DAGBASINNA; D; No_Joining_Group +07D2; NKO N; D; No_Joining_Group +07D3; NKO BA; D; No_Joining_Group +07D4; NKO PA; D; No_Joining_Group +07D5; NKO TA; D; No_Joining_Group +07D6; NKO JA; D; No_Joining_Group +07D7; NKO CHA; D; No_Joining_Group +07D8; NKO DA; D; No_Joining_Group +07D9; NKO RA; D; No_Joining_Group +07DA; NKO RRA; D; No_Joining_Group +07DB; NKO SA; D; No_Joining_Group +07DC; NKO GBA; D; No_Joining_Group +07DD; NKO FA; D; No_Joining_Group +07DE; NKO KA; D; No_Joining_Group +07DF; NKO LA; D; No_Joining_Group +07E0; NKO NA WOLOSO; D; No_Joining_Group +07E1; NKO MA; D; No_Joining_Group +07E2; NKO NYA; D; No_Joining_Group +07E3; NKO NA; D; No_Joining_Group +07E4; NKO HA; D; No_Joining_Group +07E5; NKO WA; D; No_Joining_Group +07E6; NKO YA; D; No_Joining_Group +07E7; NKO NYA WOLOSO; D; No_Joining_Group +07E8; NKO JONA JA; D; No_Joining_Group +07E9; NKO JONA CHA; D; No_Joining_Group +07EA; NKO JONA RA; D; No_Joining_Group +07FA; NKO LAJANYALAN; C; No_Joining_Group + +# Other + +200C; ZERO WIDTH NON-JOINER; U; No_Joining_Group +200D; ZERO WIDTH JOINER; C; No_Joining_Group + +# EOF diff --git a/util/import_unicode.py b/util/import_unicode.py index e0e53b655..653d7bae2 100644 --- a/util/import_unicode.py +++ b/util/import_unicode.py @@ -24,8 +24,33 @@ import datetime if len (sys.argv) < 3: print ("Usage: %s SOURCE DESTINATION" % sys.argv[0]) exit (0) +infile = open (sys.argv[3], "r") +joining = {} +for line in infile: + line = re.sub ("#.*$", "", line) + line = line.replace ("\n", "") + line = line.replace (" ", "") + if len (line) == 0 or line[0] == '\n': + continue + sp = line.split (";") + curcode = int (sp[0], 16) + if sp[2] == "U": + joining[curcode] = "GRUB_JOIN_TYPE_NONJOINING" + elif sp[2] == "L": + joining[curcode] = "GRUB_JOIN_TYPE_LEFT" + elif sp[2] == "R": + joining[curcode] = "GRUB_JOIN_TYPE_RIGHT" + elif sp[2] == "D": + joining[curcode] = "GRUB_JOIN_TYPE_DUAL" + elif sp[2] == "C": + joining[curcode] = "GRUB_JOIN_TYPE_CAUSING" + else: + print ("Unknown joining type '%s'" % sp[2]) + exit (1) +infile.close () + infile = open (sys.argv[1], "r") -outfile = open (sys.argv[3], "w") +outfile = open (sys.argv[4], "w") outfile.write ("#include \n") outfile.write ("\n") outfile.write ("struct grub_unicode_compact_range grub_unicode_compact[] = {\n") @@ -74,23 +99,32 @@ for line in infile: curcombtype != 240 and curcombtype != 253 and \ curcombtype != 254 and curcombtype != 255): print ("WARNING: Unknown combining type %d" % curcombtype) + if curcode in joining: + curjoin = joining[curcode] + elif sp[2] == "Me" or sp[2] == "Mn" or sp[2] == "Cf": + curjoin = "GRUB_JOIN_TYPE_TRANSPARENT" + else: + curjoin = "GRUB_JOIN_TYPE_NONJOINING" if lastcode + 1 != curcode or curbiditype != lastbiditype \ - or curcombtype != lastcombtype or curmirrortype != lastmirrortype: + or curcombtype != lastcombtype or curmirrortype != lastmirrortype \ + or curjoin != lastjoin: if begincode != -2 and (lastbiditype != "L" or lastcombtype != 0 or \ lastmirrortype): - outfile.write (("{0x%x, 0x%x, GRUB_BIDI_TYPE_%s, %d, %d},\n" \ + outfile.write (("{0x%x, 0x%x, GRUB_BIDI_TYPE_%s, %d, %d, %s},\n" \ % (begincode, lastcode, lastbiditype, \ - lastcombtype, lastmirrortype))) + lastcombtype, lastmirrortype, \ + lastjoin))) begincode = curcode lastcode = curcode + lastjoin = curjoin lastbiditype = curbiditype lastcombtype = curcombtype lastmirrortype = curmirrortype if lastbiditype != "L" or lastcombtype != 0 or lastmirrortype: - outfile.write (("{0x%x, 0x%x, GRUB_BIDI_TYPE_%s, %d, %d},\n" \ + outfile.write (("{0x%x, 0x%x, GRUB_BIDI_TYPE_%s, %d, %d, %s},\n" \ % (begincode, lastcode, lastbiditype, lastcombtype, \ - lastmirrortype))) -outfile.write ("{0, 0, 0, 0, 0},\n") + lastmirrortype, lastjoin))) +outfile.write ("{0, 0, 0, 0, 0, 0},\n") outfile.write ("};\n")