d2fcfb0cef
(GRUB_UTF16_LOWER_SURROGATE): Likewise. (grub_utf16_to_utf8): Likewise.
307 lines
8 KiB
C
307 lines
8 KiB
C
/*
|
|
* GRUB -- GRand Unified Bootloader
|
|
* Copyright (C) 1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009 Free Software Foundation, Inc.
|
|
*
|
|
* GRUB is free software: you can redistribute it and/or modify
|
|
* it under the terms of the GNU General Public License as published by
|
|
* the Free Software Foundation, either version 3 of the License, or
|
|
* (at your option) any later version.
|
|
*
|
|
* GRUB is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with GRUB. If not, see <http://www.gnu.org/licenses/>.
|
|
*/
|
|
|
|
#ifndef GRUB_CHARSET_HEADER
|
|
#define GRUB_CHARSET_HEADER 1
|
|
|
|
#include <grub/types.h>
|
|
|
|
#define GRUB_UINT8_1_LEADINGBIT 0x80
|
|
#define GRUB_UINT8_2_LEADINGBITS 0xc0
|
|
#define GRUB_UINT8_3_LEADINGBITS 0xe0
|
|
#define GRUB_UINT8_4_LEADINGBITS 0xf0
|
|
#define GRUB_UINT8_5_LEADINGBITS 0xf8
|
|
#define GRUB_UINT8_6_LEADINGBITS 0xfc
|
|
#define GRUB_UINT8_7_LEADINGBITS 0xfe
|
|
|
|
#define GRUB_UINT8_1_TRAILINGBIT 0x01
|
|
#define GRUB_UINT8_2_TRAILINGBITS 0x03
|
|
#define GRUB_UINT8_3_TRAILINGBITS 0x07
|
|
#define GRUB_UINT8_4_TRAILINGBITS 0x0f
|
|
#define GRUB_UINT8_5_TRAILINGBITS 0x1f
|
|
#define GRUB_UINT8_6_TRAILINGBITS 0x3f
|
|
|
|
#define GRUB_MAX_UTF8_PER_UTF16 4
|
|
/* You need at least one UTF-8 byte to have one UTF-16 word.
|
|
You need at least three UTF-8 bytes to have 2 UTF-16 words (surrogate pairs).
|
|
*/
|
|
#define GRUB_MAX_UTF16_PER_UTF8 1
|
|
|
|
#define GRUB_UCS2_LIMIT 0x10000
|
|
#define GRUB_UTF16_UPPER_SURROGATE(code) \
|
|
(0xD800 | ((((code) - GRUB_UCS2_LIMIT) >> 10) & 0x3ff))
|
|
#define GRUB_UTF16_LOWER_SURROGATE(code) \
|
|
(0xDC00 | (((code) - GRUB_UCS2_LIMIT) & 0x3ff))
|
|
|
|
/* Process one character from UTF8 sequence.
|
|
At beginning set *code = 0, *count = 0. Returns 0 on failure and
|
|
1 on success. *count holds the number of trailing bytes. */
|
|
static inline int
|
|
grub_utf8_process (grub_uint8_t c, grub_uint32_t *code, int *count)
|
|
{
|
|
if (*count)
|
|
{
|
|
if ((c & GRUB_UINT8_2_LEADINGBITS) != GRUB_UINT8_1_LEADINGBIT)
|
|
{
|
|
*count = 0;
|
|
/* invalid */
|
|
return 0;
|
|
}
|
|
else
|
|
{
|
|
*code <<= 6;
|
|
*code |= (c & GRUB_UINT8_6_TRAILINGBITS);
|
|
(*count)--;
|
|
return 1;
|
|
}
|
|
}
|
|
|
|
if ((c & GRUB_UINT8_1_LEADINGBIT) == 0)
|
|
{
|
|
*code = c;
|
|
return 1;
|
|
}
|
|
if ((c & GRUB_UINT8_3_LEADINGBITS) == GRUB_UINT8_2_LEADINGBITS)
|
|
{
|
|
*count = 1;
|
|
*code = c & GRUB_UINT8_5_TRAILINGBITS;
|
|
return 1;
|
|
}
|
|
if ((c & GRUB_UINT8_4_LEADINGBITS) == GRUB_UINT8_3_LEADINGBITS)
|
|
{
|
|
*count = 2;
|
|
*code = c & GRUB_UINT8_4_TRAILINGBITS;
|
|
return 1;
|
|
}
|
|
if ((c & GRUB_UINT8_5_LEADINGBITS) == GRUB_UINT8_4_LEADINGBITS)
|
|
{
|
|
*count = 3;
|
|
*code = c & GRUB_UINT8_3_TRAILINGBITS;
|
|
return 1;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
|
|
/* Convert a (possibly null-terminated) UTF-8 string of at most SRCSIZE
|
|
bytes (if SRCSIZE is -1, it is ignored) in length to a UTF-16 string.
|
|
Return the number of characters converted. DEST must be able to hold
|
|
at least DESTSIZE characters. If an invalid sequence is found, return -1.
|
|
If SRCEND is not NULL, then *SRCEND is set to the next byte after the
|
|
last byte used in SRC. */
|
|
static inline grub_size_t
|
|
grub_utf8_to_utf16 (grub_uint16_t *dest, grub_size_t destsize,
|
|
const grub_uint8_t *src, grub_size_t srcsize,
|
|
const grub_uint8_t **srcend)
|
|
{
|
|
grub_uint16_t *p = dest;
|
|
int count = 0;
|
|
grub_uint32_t code = 0;
|
|
|
|
if (srcend)
|
|
*srcend = src;
|
|
|
|
while (srcsize && destsize)
|
|
{
|
|
int was_count = count;
|
|
if (srcsize != (grub_size_t)-1)
|
|
srcsize--;
|
|
if (!grub_utf8_process (*src++, &code, &count))
|
|
{
|
|
code = '?';
|
|
count = 0;
|
|
/* Character c may be valid, don't eat it. */
|
|
if (was_count)
|
|
src--;
|
|
}
|
|
if (count != 0)
|
|
continue;
|
|
if (code == 0)
|
|
break;
|
|
if (destsize < 2 && code >= GRUB_UCS2_LIMIT)
|
|
break;
|
|
if (code >= GRUB_UCS2_LIMIT)
|
|
{
|
|
*p++ = GRUB_UTF16_UPPER_SURROGATE (code);
|
|
*p++ = GRUB_UTF16_LOWER_SURROGATE (code);
|
|
destsize -= 2;
|
|
}
|
|
else
|
|
{
|
|
*p++ = code;
|
|
destsize--;
|
|
}
|
|
}
|
|
|
|
if (srcend)
|
|
*srcend = src;
|
|
return p - dest;
|
|
}
|
|
|
|
/* Determine the last position where the UTF-8 string [beg, end) can
|
|
be safely cut. */
|
|
static inline grub_size_t
|
|
grub_getend (const char *beg, const char *end)
|
|
{
|
|
const char *ptr;
|
|
for (ptr = end - 1; ptr >= beg; ptr--)
|
|
if ((*ptr & GRUB_UINT8_2_LEADINGBITS) != GRUB_UINT8_1_LEADINGBIT)
|
|
break;
|
|
if (ptr < beg)
|
|
return 0;
|
|
if ((*ptr & GRUB_UINT8_1_LEADINGBIT) == 0)
|
|
return ptr + 1 - beg;
|
|
if ((*ptr & GRUB_UINT8_3_LEADINGBITS) == GRUB_UINT8_2_LEADINGBITS
|
|
&& ptr + 2 <= end)
|
|
return ptr + 2 - beg;
|
|
if ((*ptr & GRUB_UINT8_4_LEADINGBITS) == GRUB_UINT8_3_LEADINGBITS
|
|
&& ptr + 3 <= end)
|
|
return ptr + 3 - beg;
|
|
if ((*ptr & GRUB_UINT8_5_LEADINGBITS) == GRUB_UINT8_4_LEADINGBITS
|
|
&& ptr + 4 <= end)
|
|
return ptr + 4 - beg;
|
|
/* Invalid character or incomplete. Cut before it. */
|
|
return ptr - beg;
|
|
}
|
|
|
|
/* Convert UTF-16 to UTF-8. */
|
|
static inline grub_uint8_t *
|
|
grub_utf16_to_utf8 (grub_uint8_t *dest, const grub_uint16_t *src,
|
|
grub_size_t size)
|
|
{
|
|
grub_uint32_t code_high = 0;
|
|
|
|
while (size--)
|
|
{
|
|
grub_uint32_t code = *src++;
|
|
|
|
if (code_high)
|
|
{
|
|
if (code >= 0xDC00 && code <= 0xDFFF)
|
|
{
|
|
/* Surrogate pair. */
|
|
code = ((code_high - 0xD800) << 10) + (code - 0xDC00) + 0x10000;
|
|
|
|
*dest++ = (code >> 18) | 0xF0;
|
|
*dest++ = ((code >> 12) & 0x3F) | 0x80;
|
|
*dest++ = ((code >> 6) & 0x3F) | 0x80;
|
|
*dest++ = (code & 0x3F) | 0x80;
|
|
}
|
|
else
|
|
{
|
|
/* Error... */
|
|
*dest++ = '?';
|
|
/* *src may be valid. Don't eat it. */
|
|
src--;
|
|
}
|
|
|
|
code_high = 0;
|
|
}
|
|
else
|
|
{
|
|
if (code <= 0x007F)
|
|
*dest++ = code;
|
|
else if (code <= 0x07FF)
|
|
{
|
|
*dest++ = (code >> 6) | 0xC0;
|
|
*dest++ = (code & 0x3F) | 0x80;
|
|
}
|
|
else if (code >= 0xD800 && code <= 0xDBFF)
|
|
{
|
|
code_high = code;
|
|
continue;
|
|
}
|
|
else if (code >= 0xDC00 && code <= 0xDFFF)
|
|
{
|
|
/* Error... */
|
|
*dest++ = '?';
|
|
}
|
|
else if (code < 0x10000)
|
|
{
|
|
*dest++ = (code >> 12) | 0xE0;
|
|
*dest++ = ((code >> 6) & 0x3F) | 0x80;
|
|
*dest++ = (code & 0x3F) | 0x80;
|
|
}
|
|
else
|
|
{
|
|
*dest++ = (code >> 18) | 0xF0;
|
|
*dest++ = ((code >> 12) & 0x3F) | 0x80;
|
|
*dest++ = ((code >> 6) & 0x3F) | 0x80;
|
|
*dest++ = (code & 0x3F) | 0x80;
|
|
}
|
|
}
|
|
}
|
|
|
|
return dest;
|
|
}
|
|
|
|
#define GRUB_MAX_UTF8_PER_LATIN1 2
|
|
|
|
/* Convert Latin1 to UTF-8. */
|
|
static inline grub_uint8_t *
|
|
grub_latin1_to_utf8 (grub_uint8_t *dest, const grub_uint8_t *src,
|
|
grub_size_t size)
|
|
{
|
|
while (size--)
|
|
{
|
|
if (!(*src & 0x80))
|
|
*dest++ = *src;
|
|
else
|
|
{
|
|
*dest++ = (*src >> 6) | 0xC0;
|
|
*dest++ = (*src & 0x3F) | 0x80;
|
|
}
|
|
src++;
|
|
}
|
|
|
|
return dest;
|
|
}
|
|
|
|
/* Convert UCS-4 to UTF-8. */
|
|
char *grub_ucs4_to_utf8_alloc (const grub_uint32_t *src, grub_size_t size);
|
|
|
|
int
|
|
grub_is_valid_utf8 (const grub_uint8_t *src, grub_size_t srcsize);
|
|
|
|
grub_ssize_t grub_utf8_to_ucs4_alloc (const char *msg,
|
|
grub_uint32_t **unicode_msg,
|
|
grub_uint32_t **last_position);
|
|
|
|
/* Returns the number of bytes the string src would occupy is converted
|
|
to UTF-8, excluding \0. */
|
|
grub_size_t
|
|
grub_get_num_of_utf8_bytes (const grub_uint32_t *src, grub_size_t size);
|
|
|
|
/* Converts UCS-4 to UTF-8. Returns the number of bytes effectively written
|
|
excluding the trailing \0. */
|
|
grub_size_t
|
|
grub_ucs4_to_utf8 (const grub_uint32_t *src, grub_size_t size,
|
|
grub_uint8_t *dest, grub_size_t destsize);
|
|
grub_size_t grub_utf8_to_ucs4 (grub_uint32_t *dest, grub_size_t destsize,
|
|
const grub_uint8_t *src, grub_size_t srcsize,
|
|
const grub_uint8_t **srcend);
|
|
/* Returns -2 if not enough space, -1 on invalid character. */
|
|
grub_ssize_t
|
|
grub_encode_utf8_character (grub_uint8_t *dest, grub_uint8_t *destend,
|
|
grub_uint32_t code);
|
|
|
|
const grub_uint32_t *
|
|
grub_unicode_get_comb_start (const grub_uint32_t *str,
|
|
const grub_uint32_t *cur);
|
|
|
|
#endif
|