Enable UTF8 in gnulib regexp.

* config.h.in (RE_ENABLE_I18N) [!GRUB_UTIL]: New define.
	* grub-core/lib/posix_wrap/ctype.h (islower): Use grub_islower.
	(isupper): Use grub_isupper.
	(isascii): New inline function.
	* grub-core/lib/posix_wrap/wchar.h: Replace dummy with real contents.
	* grub-core/lib/posix_wrap/wctype.h: Likewise.
	* grub-core/normal/charset.c (grub_utf8_process): New function.
	(grub_utf8_to_utf16): Use grub_utf8_process.
	(grub_encode_utf8_character): New function.
	(grub_ucs4_to_utf8): Use grub_encode_utf8_character.
	* include/grub/charset.h (grub_utf8_process): New declaration.
	(grub_encode_utf8_character): Likewise.
	* include/grub/misc.h (grub_islower): New inline function.
	(grub_isupper): Likewise.
	(grub_strchrsub): Moved down to fix the definitions.
This commit is contained in:
Vladimir 'phcoder' Serbinenko 2011-12-13 00:50:49 +01:00
parent 0af2346fdb
commit c5fc563aff
8 changed files with 380 additions and 106 deletions

View file

@ -60,6 +60,51 @@
#include "widthspec.h"
#endif
int
grub_utf8_process (grub_uint8_t c, grub_uint32_t *code, int *count)
{
if (*count)
{
if ((c & GRUB_UINT8_2_LEADINGBITS) != GRUB_UINT8_1_LEADINGBIT)
{
/* invalid */
return 0;
}
else
{
*code <<= 6;
*code |= (c & GRUB_UINT8_6_TRAILINGBITS);
(*count)--;
return 1;
}
}
if ((c & GRUB_UINT8_1_LEADINGBIT) == 0)
{
*code = c;
return 1;
}
if ((c & GRUB_UINT8_3_LEADINGBITS) == GRUB_UINT8_2_LEADINGBITS)
{
*count = 1;
*code = c & GRUB_UINT8_5_TRAILINGBITS;
return 1;
}
if ((c & GRUB_UINT8_4_LEADINGBITS) == GRUB_UINT8_3_LEADINGBITS)
{
*count = 2;
*code = c & GRUB_UINT8_4_TRAILINGBITS;
return 1;
}
if ((c & GRUB_UINT8_5_LEADINGBITS) == GRUB_UINT8_4_LEADINGBITS)
{
*count = 3;
*code = c & GRUB_UINT8_3_TRAILINGBITS;
return 1;
}
return 0;
}
grub_ssize_t
grub_utf8_to_utf16 (grub_uint16_t *dest, grub_size_t destsize,
const grub_uint8_t *src, grub_size_t srcsize,
@ -74,64 +119,27 @@ grub_utf8_to_utf16 (grub_uint16_t *dest, grub_size_t destsize,
while (srcsize && destsize)
{
grub_uint32_t c = *src++;
grub_uint8_t c = *src++;
if (srcsize != (grub_size_t)-1)
srcsize--;
if (count)
if (!grub_utf8_process (c, &code, &count))
return -1;
if (count != 0)
continue;
if (code == 0)
break;
if (destsize < 2 && code >= GRUB_UCS2_LIMIT)
break;
if (code >= GRUB_UCS2_LIMIT)
{
if ((c & GRUB_UINT8_2_LEADINGBITS) != GRUB_UINT8_1_LEADINGBIT)
{
/* invalid */
return -1;
}
else
{
code <<= 6;
code |= (c & GRUB_UINT8_6_TRAILINGBITS);
count--;
}
*p++ = GRUB_UTF16_UPPER_SURROGATE (code);
*p++ = GRUB_UTF16_LOWER_SURROGATE (code);
destsize -= 2;
}
else
{
if (c == 0)
break;
if ((c & GRUB_UINT8_1_LEADINGBIT) == 0)
code = c;
else if ((c & GRUB_UINT8_3_LEADINGBITS) == GRUB_UINT8_2_LEADINGBITS)
{
count = 1;
code = c & GRUB_UINT8_5_TRAILINGBITS;
}
else if ((c & GRUB_UINT8_4_LEADINGBITS) == GRUB_UINT8_3_LEADINGBITS)
{
count = 2;
code = c & GRUB_UINT8_4_TRAILINGBITS;
}
else if ((c & GRUB_UINT8_5_LEADINGBITS) == GRUB_UINT8_4_LEADINGBITS)
{
count = 3;
code = c & GRUB_UINT8_3_TRAILINGBITS;
}
else
return -1;
}
if (count == 0)
{
if (destsize < 2 && code >= GRUB_UCS2_LIMIT)
break;
if (code >= GRUB_UCS2_LIMIT)
{
*p++ = GRUB_UTF16_UPPER_SURROGATE (code);
*p++ = GRUB_UTF16_LOWER_SURROGATE (code);
destsize -= 2;
}
else
{
*p++ = code;
destsize--;
}
*p++ = code;
destsize--;
}
}
@ -140,6 +148,53 @@ grub_utf8_to_utf16 (grub_uint16_t *dest, grub_size_t destsize,
return p - dest;
}
/* Returns -2 if not enough space, -1 on invalid character. */
grub_ssize_t
grub_encode_utf8_character (grub_uint8_t *dest, grub_uint8_t *destend,
grub_uint32_t code)
{
if (dest >= destend)
return -2;
if (code <= 0x007F)
{
*dest++ = code;
return 1;
}
if (code <= 0x07FF)
{
if (dest + 1 >= destend)
return -2;
*dest++ = (code >> 6) | 0xC0;
*dest++ = (code & 0x3F) | 0x80;
return 2;
}
if ((code >= 0xDC00 && code <= 0xDFFF)
|| (code >= 0xD800 && code <= 0xDBFF))
{
/* No surrogates in UCS-4... */
return -1;
}
if (code < 0x10000)
{
if (dest + 2 >= destend)
return -2;
*dest++ = (code >> 12) | 0xE0;
*dest++ = ((code >> 6) & 0x3F) | 0x80;
*dest++ = (code & 0x3F) | 0x80;
return 3;
}
{
if (dest + 3 >= destend)
return -2;
*dest++ = (code >> 18) | 0xF0;
*dest++ = ((code >> 12) & 0x3F) | 0x80;
*dest++ = ((code >> 6) & 0x3F) | 0x80;
*dest++ = (code & 0x3F) | 0x80;
return 4;
}
}
/* Convert UCS-4 to UTF-8. */
void
grub_ucs4_to_utf8 (grub_uint32_t *src, grub_size_t size,
@ -151,39 +206,17 @@ grub_ucs4_to_utf8 (grub_uint32_t *src, grub_size_t size,
while (size-- && dest < destend)
{
grub_uint32_t code = *src++;
if (code <= 0x007F)
*dest++ = code;
else if (code <= 0x07FF)
grub_ssize_t s;
s = grub_encode_utf8_character (dest, destend,
code);
if (s == -2)
break;
if (s == -1)
{
if (dest + 1 >= destend)
break;
*dest++ = (code >> 6) | 0xC0;
*dest++ = (code & 0x3F) | 0x80;
}
else if ((code >= 0xDC00 && code <= 0xDFFF)
|| (code >= 0xD800 && code <= 0xDBFF))
{
/* No surrogates in UCS-4... */
*dest++ = '?';
continue;
}
else if (code < 0x10000)
{
if (dest + 2 >= destend)
break;
*dest++ = (code >> 12) | 0xE0;
*dest++ = ((code >> 6) & 0x3F) | 0x80;
*dest++ = (code & 0x3F) | 0x80;
}
else
{
if (dest + 3 >= destend)
break;
*dest++ = (code >> 18) | 0xF0;
*dest++ = ((code >> 12) & 0x3F) | 0x80;
*dest++ = ((code >> 6) & 0x3F) | 0x80;
*dest++ = (code & 0x3F) | 0x80;
}
dest += s;
}
*dest = 0;
}