/* The contents of this file are subject to the THDL Open Community License Version 1.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License on the THDL web site (http://www.thdl.org/). Software distributed under the License is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for the specific terms governing rights and limitations under the License. The Initial Developer of this software is the Tibetan and Himalayan Digital Library (THDL). Portions created by the THDL are Copyright 2001 THDL. All Rights Reserved. Contributor(s): ______________________________________. */ package org.thdl.tib.text.tshegbar; /**
* This non-instantiable class contains utility routines for dealing with
* Tibetan Unicode codepoints and strings of such codepoints.
*
* @author David Chandler
*/
public class UnicodeUtils implements UnicodeConstants {
/** Do not use this: the constructor is private because this class is
    not instantiable. */
private UnicodeUtils() { super(); }
/** Returns true iff x is a Unicode codepoint that represents a
    full-form (non-subjoined) consonant, or a two-consonant stack, that
    has a Unicode code point of its own.  Concretely, true is returned
    exactly when x lies in the range U+0F40 through U+0F6A and is not
    U+0F48, which is reserved in Unicode 3.2 but not in use.
    <p>
    Returns true for the usual suspects (like U+0F40), for the
    consonants used to write Sanskrit (presumably letters such as
    U+0F4A -- TODO confirm the example against the Unicode chart), and
    for the simple two-consonant stacks in Unicode (like U+0F43).
    Returns false for, among other things, subjoined consonants like
    U+0F90.  Note that U+0F71, which an earlier version of this comment
    cited as a Sanskrit consonant, lies outside the accepted range, so
    false is returned for it.
    @param x the codepoint to classify
    @return true iff x is in U+0F40..U+0F6A and is not U+0F48 */
public static boolean isNonSubjoinedConsonant(char x) {
    // U+0F48 sits inside the consonant range but is reserved in
    // Unicode 3.2 and not in use, so reject it before the range test.
    if (x == '\u0F48') {
        return false;
    }
    // Everything else from U+0F40 through U+0F6A (inclusive) qualifies.
    return '\u0F40' <= x && x <= '\u0F6A';
}
/**
 * Tells whether x is a subjoined consonant, or a subjoined
 * two-consonant stack, that has a Unicode code point of its own.
 * <p>
 * The answer is true exactly for the codepoints U+0F90 through U+0FBC,
 * with the exception of U+0F98 (reserved in Unicode 3.2 but not in
 * use).  That covers the usual suspects (like U+0F90), the Sanskrit
 * consonants (like U+0F9C), and the simple two-consonant stacks in
 * Unicode (like U+0FAC).  Non-subjoined consonants such as U+0F40 are,
 * among other things, rejected.
 *
 * @param x the codepoint to classify
 * @return true iff x is in U+0F90..U+0FBC and is not U+0F98
 */
public static boolean isSubjoinedConsonant(char x) {
    // Outside the subjoined block altogether?
    if (x < '\u0F90' || x > '\u0FBC') {
        return false;
    }
    // Inside the block; only the reserved slot is excluded.
    return x != '\u0F98';
}
/**
 * Tells whether x is the preferred representation of a Tibetan or
 * Sanskrit consonant, i.e. a full-form consonant that cannot be broken
 * down any further.
 * <p>
 * True is returned for the codepoints U+0F40 through U+0F68, minus the
 * reserved U+0F48 and minus the two-component consonants U+0F43,
 * U+0F4D, U+0F52, U+0F57 and U+0F5C.  False is returned for, among
 * other things, subjoined consonants like U+0F90, two-component
 * consonants like U+0F43, and fixed-form consonants like U+0F6A.
 * <p>
 * The new consonants (for transcribing Chinese, the original author
 * believed) U+0F55 U+0F39 (which EWTS calls "fa"), U+0F56 U+0F39
 * ("va"), and U+0F5F U+0F39 ("Dza") are two-codepoint sequences and so
 * cannot be recognized by this single-codepoint test, but you should
 * be aware of them also.
 *
 * @param x the codepoint to classify
 * @return true iff x is an indivisible, full-form consonant
 */
public static boolean isPreferredFormOfConsonant(char x) {
    // Only U+0F40..U+0F68 can qualify at all.
    if (x < '\u0F40' || x > '\u0F68') {
        return false;
    }
    switch (x) {
    case '\u0F48': // reserved in Unicode 3.2, but not in use
    case '\u0F43': // the remaining exclusions are two-component consonants
    case '\u0F4D':
    case '\u0F52':
    case '\u0F57':
    case '\u0F5C':
        return false;
    default:
        return true;
    }
}
/**
 * Tells whether unicodeCP is a codepoint from the Unicode range
 * U+0F00-U+0FFF (both ends inclusive).
 *
 * @param unicodeCP the codepoint to test
 * @return true iff unicodeCP lies within U+0F00..U+0FFF
 * @see #isEntirelyTibetanUnicode(String)
 */
public static boolean isInTibetanRange(char unicodeCP) {
    if (unicodeCP < '\u0F00') {
        return false; // below the range
    }
    return unicodeCP <= '\u0FFF';
}
/**
 * Tells whether unicodeString consists only of codepoints from the
 * Unicode range U+0F00-U+0FFF.  An empty string trivially satisfies
 * this, so true is returned for it.
 * <p>
 * Note that these codepoints are typically not enough to represent a
 * Tibetan text; you may need ZWSP (zero-width space) and various
 * whitespace from other ranges.
 *
 * @param unicodeString the string to examine
 * @return true iff every char of unicodeString is
 *         {@link #isInTibetanRange(char) in the Tibetan range}
 */
public static boolean isEntirelyTibetanUnicode(String unicodeString) {
    final int length = unicodeString.length();
    int index = 0;
    // Advance past the leading run of Tibetan-range codepoints.
    while (index < length && isInTibetanRange(unicodeString.charAt(index))) {
        index++;
    }
    // The string qualifies iff that run reached the very end.
    return index == length;
}
/** Puts the Tibetan codepoints in tibetanUnicode, a sequence of
Unicode codepoints, into either Normalization Form KD (NFKD),
D (NFD), or THDL (NFTHDL), depending on the value of normForm.
NFD and NFKD are specified by Unicode 3.2; NFTHDL is needed
for {@link org.thdl.tib.text.tshegbar#UnicodeGraphemeCluster}
because NFKD normalizes U+0F0C
and neither NFD
nor NFKD breaks down U+0F00
into its constituent
codepoints. NFTHDL uses a maximum of codepoints, and it never
uses codepoints whose use has been {@link #isDiscouraged(char)
discouraged}.
The Tibetan passages of the returned string are in the
chosen normalized form, but codepoints outside of the {@link
#isInTibetanRange(char) range}
U+0F00
-U+0FFF
are not necessarily
put into normalized form.
Recall that normalized forms are not necessarily closed under string concatenation, but are closed under substringing.
Note well that only well-formed input guarantees well-formed output.
@param tibetanUnicode the codepoints to be decomposed @param normForm NORM_NFKD, NORM_NFTHDL, or NORM_NFD */ public static void toMostlyDecomposedUnicode(StringBuffer tibetanUnicode, byte normForm) { if (normForm != NORM_NFD && normForm != NORM_NFKD && normForm != NORM_NFTHDL) throw new IllegalArgumentException("normForm must be NORM_NFD, NORM_NFTHDL, or NORM_NFKD for decomposition to work"); int offset = 0; while (offset < tibetanUnicode.length()) { String s = toNormalizedForm(tibetanUnicode.charAt(offset), normForm); if (null == s) { ++offset; } else { // modify tibetanUnicode and update offset. tibetanUnicode.deleteCharAt(offset); tibetanUnicode.insert(offset, s); } } } /** Like {@link #toMostlyDecomposedUnicode(StringBuffer, byte)}, but does not modify its input. Instead, it returns the NFKD- or NFD-normalized version of tibetanUnicode. */ public static String toMostlyDecomposedUnicode(String tibetanUnicode, byte normForm) { StringBuffer sb = new StringBuffer(tibetanUnicode); toMostlyDecomposedUnicode(sb, normForm); return sb.toString(); } /** There are 19 codepoints in the Tibetan range of Unicode 3.2 which can be decomposed into longer strings of codepoints in the Tibetan range of Unicode. Often one wants to manipulate decomposed codepoint strings. Also, HTML and XML are W3C standards that require certain normalization forms. This routine returns a chosen normalized form for such codepoints, and returns null for codepoints that are already normalized or are not in the Tibetan range of Unicode. 
@param tibetanUnicodeCP the codepoint to normalize @param normalizationForm NORM_NFTHDL, NORM_NFKD, or NORM_NFD if you expect something nontrivial to happen @return null if tibetanUnicodeCP is already in the chosen normalized form, or a string of two or three codepoints otherwise */ public static String toNormalizedForm(char tibetanUnicodeCP, byte normalizationForm) { if (normalizationForm == NORM_NFKD || normalizationForm == NORM_NFD || normalizationForm == NORM_NFTHDL) { // Where not specified, the NFKD and NFTHDL forms are // identical to the NFD form. switch (tibetanUnicodeCP) { case '\u0F00': return ((normalizationForm == NORM_NFTHDL) ? "\u0F68\u0F7C\u0F7E" : null); case '\u0F0C': return ((normalizationForm == NORM_NFKD) ? "\u0F0B" : null); case '\u0F43': return "\u0F42\u0FB7"; case '\u0F4D': return "\u0F4C\u0FB7"; case '\u0F52': return "\u0F51\u0FB7"; case '\u0F57': return "\u0F56\u0FB7"; case '\u0F5C': return "\u0F5B\u0FB7"; case '\u0F69': return "\u0F40\u0FB5"; case '\u0F73': return "\u0F71\u0F72"; case '\u0F75': return "\u0F71\u0F74"; case '\u0F76': return "\u0FB2\u0F80"; case '\u0F77': { // I do not understand why NFD does not decompose this // codepoint, hence NORM_NFTHDL does: if (normalizationForm == NORM_NFKD || normalizationForm == NORM_NFTHDL) return "\u0FB2\u0F71\u0F80"; else return null; } case '\u0F78': return "\u0FB3\u0F80"; case '\u0F79': { // I do not understand why NFD does not decompose this // codepoint, hence NORM_NFTHDL does: if (normalizationForm == NORM_NFKD || normalizationForm == NORM_NFTHDL) return "\u0FB3\u0F71\u0F80"; else return null; } case '\u0F81': return "\u0F71\u0F80"; case '\u0F93': return "\u0F92\u0FB7"; case '\u0F9D': return "\u0F9C\u0FB7"; case '\u0FA2': return "\u0FA1\u0FB7"; case '\u0FA7': return "\u0FA6\u0FB7"; case '\u0FAC': return "\u0FAB\u0FB7"; case '\u0FB9': return "\u0F90\u0FB5"; default: return null; } } return null; } /** Returns true iff tibetanUnicodeCP {@link #isInTibetanRange(char) is a Tibetan codepoint} 
and if the Unicode 3.2 standard discourages the use of tibetanUnicodeCP. */ public static boolean isDiscouraged(char tibetanUnicodeCP) { return ('\u0F73' == tibetanUnicodeCP || '\u0F75' == tibetanUnicodeCP || '\u0F77' == tibetanUnicodeCP || '\u0F79' == tibetanUnicodeCP || '\u0F81' == tibetanUnicodeCP); /* DLC FIXME -- I was using 3.0 p.437-440, check 3.2. */ } /** Returns true iff ch corresponds to the Tibetan letter ra. Several Unicode codepoints correspond to the Tibetan letter ra (in its subscribed form or otherwise). Oftentimes,\u0F62
is thought of as the nominal
representation. Returns false for some codepoints that
contain ra but are not merely ra, such as \u0F77
*/
public static boolean isRa(char ch) {
    // The four codepoints that are "merely ra"; composites that only
    // contain ra (e.g. U+0F77) fall through to the default branch.
    switch (ch) {
    case '\u0F62':
    case '\u0F6A':
    case '\u0FB2':
    case '\u0FBC':
        return true;
    default:
        return false;
    }
}
/**
 * Tells whether ch corresponds to the Tibetan letter wa.  Several
 * Unicode codepoints correspond to the Tibetan letter wa; the ones
 * accepted here are U+0F5D, U+0FAD and U+0FBA.  Oftentimes, U+0F5D is
 * thought of as the nominal representation.
 *
 * @param ch the codepoint to test
 * @return true iff ch is one of the three wa codepoints listed above
 */
public static boolean isWa(char ch) {
    if (ch == '\u0F5D') {
        return true; // the nominal form
    }
    if (ch == '\u0FAD') {
        return true;
    }
    return ch == '\u0FBA';
}
/**
 * Tells whether ch corresponds to the Tibetan letter ya.  Several
 * Unicode codepoints correspond to the Tibetan letter ya; the ones
 * accepted here are U+0F61, U+0FB1 and U+0FBB.  Oftentimes, U+0F61 is
 * thought of as the nominal representation.
 *
 * @param ch the codepoint to test
 * @return true iff ch is one of the three ya codepoints listed above
 */
public static boolean isYa(char ch) {
    final boolean nominalForm = (ch == '\u0F61');
    final boolean otherForm = (ch == '\u0FB1' || ch == '\u0FBB');
    return nominalForm || otherForm;
}
/**
 * Tells whether there exists at least one codepoint cp in
 * unicodeString such that cp {@link #isRa(char) is ra} or contains ra
 * (like U+0F77).
 * <p>
 * This method is not implemented as fast as it could be.  It first
 * puts the input into NORM_NFKD via
 * {@link #toMostlyDecomposedUnicode(String, byte)} and then looks for
 * a plain ra, reusing the canonicalization code in order to maximize
 * reuse and minimize the possibility of coder error.
 *
 * @param unicodeString the string to search
 * @return true iff the decomposed form of unicodeString has a ra in it
 */
public static boolean containsRa(String unicodeString) {
    final String decomposed
        = toMostlyDecomposedUnicode(unicodeString, NORM_NFKD);
    final int length = decomposed.length();
    boolean sawRa = false;
    // Stop scanning as soon as the first ra turns up.
    for (int pos = 0; !sawRa && pos < length; pos++) {
        sawRa = isRa(decomposed.charAt(pos));
    }
    return sawRa;
}
/**
 * Inefficient shortcut: wraps the single codepoint in a one-character
 * string and defers to {@link #containsRa(String)}.
 *
 * @param unicodeCP the codepoint to test
 * @return whatever {@link #containsRa(String)} reports for the
 *         one-character string made of unicodeCP
 * @see #containsRa(String)
 */
public static boolean containsRa(char unicodeCP) {
    final String single = String.valueOf(unicodeCP);
    return containsRa(single);
}
/**
 * Returns a human-readable, ASCII form of the Unicode codepoint cp:
 * a backslash, the letter u, and then the value of cp written as
 * exactly four lowercase hexadecimal digits (zero-padded on the
 * left).
 *
 * @param cp the codepoint to render
 * @return the six-character escape-style rendering of cp
 */
public static String unicodeCodepointToString(char cp) {
    final String hexDigits = Integer.toHexString(cp);
    final StringBuffer text = new StringBuffer(6);
    text.append("\\u");
    // A char yields one to four hex digits; left-pad with zeros to four.
    for (int missing = 4 - hexDigits.length(); missing > 0; missing--) {
        text.append('0');
    }
    text.append(hexDigits);
    return text.toString();
}
/**
 * Returns a human-readable, ASCII form of the string s, built by
 * concatenating the {@link #unicodeCodepointToString(char)} rendering
 * of each char of s, in order.  An empty s yields an empty string.
 *
 * @param s the string to render
 * @return the concatenated per-char renderings
 */
public static String unicodeStringToString(String s) {
    final int count = s.length();
    // Each char is expected to render as six characters; presize for that.
    final StringBuffer rendered = new StringBuffer(count * 6);
    int pos = 0;
    while (pos < count) {
        rendered.append(unicodeCodepointToString(s.charAt(pos)));
        pos++;
    }
    return rendered.toString();
}
/**
 * Tells whether cp is a Unicode 3.2 Tibetan consonant, subjoined or
 * not: that is, whether cp lies in U+0F40..U+0F6A or in
 * U+0F90..U+0FBC and is neither U+0F48 nor U+0F98.
 * <p>
 * This counts precomposed consonant stacks like U+0FA7 as consonants.
 * If you don't wish to treat such as consonants, then put the input
 * into NORM_NFD, NORM_NFKD, or NORM_NFTHDL first.  If it changes under
 * such a normalization, it is a precomposed consonant.
 *
 * @param cp the codepoint to classify
 * @return true iff cp is a full-form or subjoined consonant codepoint
 */
public static boolean isTibetanConsonant(char cp) {
    // The one excluded slot inside each of the two consonant ranges.
    if (cp == '\u0F48' || cp == '\u0F98') {
        return false;
    }
    final boolean fullForm = (cp >= '\u0F40' && cp <= '\u0F6A');
    final boolean subjoined = (cp >= '\u0F90' && cp <= '\u0FBC');
    return fullForm || subjoined;
}
}