More and better tests; fixed some bugs in LegalTshegBar.
This commit is contained in:
parent
35a9869aac
commit
2b81020b0e
4 changed files with 317 additions and 68 deletions
|
@ -30,11 +30,11 @@ public class UnicodeUtils implements UnicodeConstants {
|
|||
/** Returns true iff x is a Unicode codepoint that represents a
|
||||
consonant or two-consonant stack that has a Unicode code
|
||||
point. Returns true only for the usual suspects (like
|
||||
<code>\u0F40</code>) and for Sanskrit consonants (like
|
||||
<code>\u0F71</code>) and the simple two-consonant stacks in
|
||||
Unicode (like <code>\u0F43</code>). Returns false for, among
|
||||
<code>U+0F40</code>) and for Sanskrit consonants (like
|
||||
<code>U+0F71</code>) and the simple two-consonant stacks in
|
||||
Unicode (like <code>U+0F43</code>). Returns false for, among
|
||||
other things, subjoined consonants like
|
||||
<code>\u0F90</code>. */
|
||||
<code>U+0F90</code>. */
|
||||
public static boolean isNonSubjoinedConsonant(char x) {
|
||||
return ((x != '\u0F48' /* reserved in Unicode 3.2, but not in use */)
|
||||
&& (x >= '\u0F40' && x <= '\u0F6A'));
|
||||
|
@ -43,11 +43,11 @@ public class UnicodeUtils implements UnicodeConstants {
|
|||
/** Returns true iff x is a Unicode codepoint that represents a
|
||||
subjoined consonant or subjoined two-consonant stack that has
|
||||
a Unicode code point. Returns true only for the usual
|
||||
suspects (like <code>\u0F90</code>) and for Sanskrit
|
||||
consonants (like <code>\u0F9C</code>) and the simple
|
||||
two-consonant stacks in Unicode (like <code>\u0FAC</code>).
|
||||
suspects (like <code>U+0F90</code>) and for Sanskrit
|
||||
consonants (like <code>U+0F9C</code>) and the simple
|
||||
two-consonant stacks in Unicode (like <code>U+0FAC</code>).
|
||||
Returns false for, among other things, non-subjoined
|
||||
consonants like <code>\u0F40</code>. */
|
||||
consonants like <code>U+0F40</code>. */
|
||||
public static boolean isSubjoinedConsonant(char x) {
|
||||
return ((x != '\u0F98' /* reserved in Unicode 3.2, but not in use */)
|
||||
&& (x >= '\u0F90' && x <= '\u0FBC'));
|
||||
|
@ -56,13 +56,13 @@ public class UnicodeUtils implements UnicodeConstants {
|
|||
/** Returns true iff x is the preferred representation of a
|
||||
Tibetan or Sanskrit consonant and cannot be broken down any
|
||||
further. Returns false for, among other things, subjoined
|
||||
consonants like <code>\u0F90</code>, two-component consonants
|
||||
like <code>\u0F43</code>, and fixed-form consonants like
|
||||
'\u0F6A'. The new consonants (for transcribing Chinese, I
|
||||
believe) "\u0F55\u0F39" (which EWTS calls "fa"),
|
||||
"\u0F56\u0F39" ("va"), and "\u0F5F\u0F39" ("Dza") are
|
||||
two-codepoint sequences, but you should be aware of them
|
||||
also. */
|
||||
consonants like <code>U+0F90</code>, two-component consonants
|
||||
like <code>U+0F43</code>, and fixed-form consonants like
|
||||
<code>U+0F6A</code>. The new consonants (for transcribing
|
||||
Chinese, I believe) "\u0F55\u0F39" (which EWTS calls
|
||||
"fa"), "\u0F56\u0F39" ("va"), and
|
||||
"\u0F5F\u0F39" ("Dza") are two-codepoint sequences,
|
||||
but you should be aware of them also. */
|
||||
public static boolean isPreferredFormOfConsonant(char x) {
|
||||
return ((x != '\u0F48' /* reserved in Unicode 3.2, but not in use */)
|
||||
&& (x >= '\u0F40' && x <= '\u0F68')
|
||||
|
@ -97,7 +97,7 @@ public class UnicodeUtils implements UnicodeConstants {
|
|||
Unicode codepoints, into either Normalization Form KD (NFKD),
|
||||
D (NFD), or THDL (NFTHDL), depending on the value of normForm.
|
||||
NFD and NFKD are specified by Unicode 3.2; NFTHDL is needed
|
||||
for {@link org.thdl.tib.text.tshegbar#UnicodeGraphemeCluster}
|
||||
for {@link org.thdl.tib.text.tshegbar.UnicodeGraphemeCluster}
|
||||
because NFKD normalizes <code>U+0F0C</code> and neither NFD
|
||||
nor NFKD breaks down <code>U+0F00</code> into its constituent
|
||||
codepoints. NFTHDL uses a maximum of codepoints, and it never
|
||||
|
@ -247,7 +247,7 @@ public class UnicodeUtils implements UnicodeConstants {
|
|||
|
||||
/** Returns true iff ch corresponds to the Tibetan letter wa.
|
||||
Several Unicode codepoints correspond to the Tibetan letter
|
||||
wa. Oftentimes, <code>\u0F5D</code> is thought of as the
|
||||
wa. Oftentimes, <code>U+0F5D</code> is thought of as the
|
||||
nominal representation. */
|
||||
public static boolean isWa(char ch) {
|
||||
return ('\u0F5D' == ch
|
||||
|
@ -257,7 +257,7 @@ public class UnicodeUtils implements UnicodeConstants {
|
|||
|
||||
/** Returns true iff ch corresponds to the Tibetan letter ya.
|
||||
Several Unicode codepoints correspond to the Tibetan letter
|
||||
ya. Oftentimes, <code>\u0F61</code> is thought of as the
|
||||
ya. Oftentimes, <code>U+0F61</code> is thought of as the
|
||||
nominal representation. */
|
||||
public static boolean isYa(char ch) {
|
||||
return ('\u0F61' == ch
|
||||
|
@ -267,7 +267,7 @@ public class UnicodeUtils implements UnicodeConstants {
|
|||
|
||||
/** Returns true iff there exists at least one codepoint cp in
|
||||
unicodeString such that cp {@link #isRa(char) is ra} or contains
|
||||
ra (like <code>\u0F77</code>). This method is not implemented
|
||||
ra (like <code>U+0F77</code>). This method is not implemented
|
||||
as fast as it could be. It calls on the canonicalization code
|
||||
in order to maximize reuse and minimize the possibility of
|
||||
coder error. */
|
||||
|
@ -298,6 +298,9 @@ public class UnicodeUtils implements UnicodeConstants {
|
|||
return "\\u" + Integer.toHexString((int)cp);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a human-readable, ASCII form of the String s of Unicode
|
||||
* codepoints. */
|
||||
public static String unicodeStringToString(String s) {
|
||||
StringBuffer sb = new StringBuffer(s.length() * 6);
|
||||
for (int i = 0; i < s.length(); i++) {
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue