diff --git a/source/org/thdl/tib/text/tshegbar/LegalTshegBar.java b/source/org/thdl/tib/text/tshegbar/LegalTshegBar.java index 35950ac..13a0e72 100644 --- a/source/org/thdl/tib/text/tshegbar/LegalTshegBar.java +++ b/source/org/thdl/tib/text/tshegbar/LegalTshegBar.java @@ -100,13 +100,11 @@ And also there are cases where they combine. For ex you can have * * *

Note that this class uses only a subset of Unicode to represent - * consonants and vowels. In some situations, you should use {@link - * #EWSUB_wa_zur} to represent the consonant wa, while in others you - * should use {@link #EWC_wa}, even though you mean to subscribe a - * fixed-form wa. Basically, stick to the codepoints for which - * enumerations exist in {@link - * org.thdl.tib.text.tshegbar.UnicodeConstants} and use your common - * sense.

+ * consonants and vowels. You should always use the nominal form of + * a letter, e.g. {@link #EWC_wa}, not {@link #EWSUB_wa_zur}, to + * represent letters. A wa-zur is requested via the hasWaZur flag, + * never via the subjoined letter. (A subscribed fixed-form wa is + * not a legal tsheg-bar, so there is no way to ask for one.)

* *

For a pretty good, concise summary of the rules this class * knows about, see Joe B. Wilson's Translating Buddhism from @@ -142,8 +140,6 @@ public class LegalTshegBar /** Do not use this constructor. */ private LegalTshegBar() { super(); } - // DLC FIXME: do we want to accept EWC_ra or EWSUB_ra_btags for - // the root letter, even if there is no head letter? Etc. /** Constructs a valid Tibetan syllable or throws an exception. * Use EW_ABSENT (or null in the case of suffix) for * those parts of the syllable that are absent. The root letter @@ -180,7 +176,7 @@ public class LegalTshegBar // copying is slightly inefficient because it is unnecessary // since Java strings are read-only, but translating this code // to C++ is easier this way. - this.suffix = new String(suffix); + this.suffix = (suffix == null) ? null : new String(suffix); this.postsuffix = postsuffix; this.vowel = vowel; @@ -198,7 +194,8 @@ public class LegalTshegBar throws IllegalArgumentException { this(prefix, headLetter, rootLetter, subjoinedLetter, - hasWaZur, hasAChung, new String(new char[] { suffix }), + hasWaZur, hasAChung, + (suffix == EW_ABSENT) ? null : new String(new char[] { suffix }), postsuffix, vowel); } @@ -216,7 +213,10 @@ public class LegalTshegBar } /** Returns the non-EWSUB_wa_zur consonant subscribed to the root - * consonant, or EW_ABSENT if none is. If you want to know if there is a wa-zur, use {@link #hasWaZurSubjoinedToRootLetter()}*/ + * consonant, or EW_ABSENT if none is. If you want to know if + * there is a wa-zur, use {@link + * #hasWaZurSubjoinedToRootLetter()}. This returns EWC_ra, not + * EWSUB_ra_btags, etc. 
*/ public char getSubjoinedLetter() { return subjoinedLetter; } @@ -458,11 +458,11 @@ public class LegalTshegBar if (EW_ABSENT == subjoinedLetter) { return isConsonantThatTakesWaZur(rootLetter); } - if (EWSUB_ra_btags == subjoinedLetter) { + if (EWC_ra == subjoinedLetter) { if (EWC_ga == rootLetter || EWC_da == rootLetter) return true; - } else if (EWSUB_ya_btags == subjoinedLetter) { + } else if (EWC_ya == subjoinedLetter) { if (EWC_pha == rootLetter) return true; } @@ -599,6 +599,9 @@ public class LegalTshegBar * this is {@link #getConnectiveCaseSuffix()} * @param postsuffix the optional postsuffix, which should be * EWC_sa or EWC_da + * @param errorBuffer if non-null, and if the return code is + * false, then the reason that this is not a legal tsheg-bar will + * be appended to errorBuffer. * @param vowel the optional vowel */ public static boolean formsLegalTshegBar(char prefix, char headLetter, @@ -608,12 +611,14 @@ public class LegalTshegBar boolean hasAChung, String suffix, char postsuffix, - char vowel) + char vowel, + StringBuffer errorBuffer) { try { return internalLegalityTest(prefix, headLetter, rootLetter, subjoinedLetter, hasWaZur, hasAChung, - suffix, postsuffix, vowel, false); + suffix, postsuffix, vowel, false, + errorBuffer); } catch (IllegalArgumentException e) { throw new Error("This simply cannot happen, but it did."); } @@ -631,12 +636,15 @@ public class LegalTshegBar boolean hasAChung, char suffix, char postsuffix, - char vowel) + char vowel, + StringBuffer errorBuffer) { return formsLegalTshegBar(prefix, headLetter, rootLetter, subjoinedLetter, hasWaZur, hasAChung, - new String(new char[] { suffix }), - postsuffix, vowel); + ((suffix == EW_ABSENT) + ? 
null + : new String(new char[] { suffix })), + postsuffix, vowel, errorBuffer); } @@ -659,12 +667,17 @@ public class LegalTshegBar { internalLegalityTest(prefix, headLetter, rootLetter, subjoinedLetter, hasWaZur, hasAChung, - suffix, postsuffix, vowel, true); + suffix, postsuffix, vowel, true, null); } /** Voodoo. Stand back. */ - private static boolean internalThrowThing(boolean doThrow, String msg) + private static boolean internalThrowThing(boolean doThrow, + StringBuffer errorBuf, + String msg) { + if (errorBuf != null) { + errorBuf.append(msg); + } if (doThrow) throw new IllegalArgumentException(msg); return false; @@ -674,6 +687,8 @@ public class LegalTshegBar * thrown, then this combination makes a legal Tibetan syllable. * To learn about the arguments, see {@link * #formsLegalTshegBar(char,char,char,char,boolean,boolean,String,char,char)}. + * @param errorBuf if non-null, the reason this is illegal will + * be written here, if this is illegal * @return true if this syllable is legal, false if this syllable * is illegal and throwIfIllegal is false, does not return if * this syllable is illegal and throwIfIllegal is true @@ -689,11 +704,13 @@ public class LegalTshegBar String suffix, char postsuffix, char vowel, - boolean throwIfIllegal) + boolean throwIfIllegal, + StringBuffer errorBuf) throws IllegalArgumentException { if (!isNominalRepresentationOfConsonant(rootLetter)) return internalThrowThing(throwIfIllegal, + errorBuf, "The root letter must be one of the standard thirty Tibetan consonants, and must be represented nominally, not, for example, by FIXED-FORM RA (\u0F6A)"); if (EW_ABSENT != prefix) { @@ -701,28 +718,34 @@ public class LegalTshegBar // and that it can go with this root letter: if (!isNominalRepresentationOfPrefix(prefix)) return internalThrowThing(throwIfIllegal, + errorBuf, "The prefix is not absent, so it must be one of the five possible prefixes."); // DLC test that it can go with the root letter. 
} if (EW_ABSENT != subjoinedLetter) { - if (EWSUB_ya_btags == subjoinedLetter) { + if (EWC_ya == subjoinedLetter) { if (!isConsonantThatTakesYaBtags(rootLetter)) { return internalThrowThing(throwIfIllegal, + errorBuf, "Cannot subscribe ya-btags to that root letter."); } - } else if (EWSUB_ra_btags == subjoinedLetter) { + } else if (EWC_ra == subjoinedLetter) { if (!isConsonantThatTakesRaBtags(rootLetter)) { return internalThrowThing(throwIfIllegal, + errorBuf, "Cannot subscribe ra-btags to that root letter."); } - } else if (EWSUB_la_btags == subjoinedLetter) { + } else if (EWC_la == subjoinedLetter) { if (!isConsonantThatTakesLaBtags(rootLetter)) { return internalThrowThing(throwIfIllegal, + errorBuf, "Cannot subscribe la-btags to that root letter."); } - } else if (EWSUB_wa_zur == subjoinedLetter) { - throw new Error("DLC FIXME: can this happen? wa-zur comes in via the boolean argument hasWaZur, not via subjoinedLetter."); + } else if (EWC_wa == subjoinedLetter) { + return internalThrowThing(throwIfIllegal, + errorBuf, + "The presence of wa-zur must be specified via a boolean parameter."); } else { // check for a common mistake: if ('\u0FBA' == subjoinedLetter @@ -730,9 +753,11 @@ public class LegalTshegBar || '\u0FBC' == subjoinedLetter) { return internalThrowThing(throwIfIllegal, + errorBuf, "The subjoined letter given is subjoinable, but you gave the fixed-form variant, which is not used in Tibetan syllables but is sometimes used in Tibetan transliteration of Sanskrit, Chinese, or some non-Tibetan language."); } return internalThrowThing(throwIfIllegal, + errorBuf, "The subjoined letter given is not one of the four consonants that may be subscribed."); } } // subjoinedLetter tests @@ -743,10 +768,12 @@ public class LegalTshegBar if (!getConnectiveCaseSuffix().equals(suffix)) { if (suffix.length() != 1) { return internalThrowThing(throwIfIllegal, + errorBuf, "Illegal suffix -- not one of the legal complex suffixes like 'u, 'o, 'i, 'am."); } if 
(!isNominalRepresentationOfSimpleSuffix(suffix.charAt(0))) { return internalThrowThing(throwIfIllegal, + errorBuf, "Illegal suffix -- not one of the ten legal suffixes: " + UnicodeUtils.unicodeCodepointToString(suffix.charAt(0))); } @@ -755,6 +782,7 @@ public class LegalTshegBar if (EW_ABSENT != postsuffix) { if (null == suffix) return internalThrowThing(throwIfIllegal, + errorBuf, "You cannot have a postsuffix unless you also have a suffix."); } @@ -762,11 +790,13 @@ public class LegalTshegBar if (EWC_ra == headLetter) { if (!isConsonantThatTakesRaMgo(rootLetter)) { return internalThrowThing(throwIfIllegal, + errorBuf, "The head letter ra cannot be used with that root letter."); } } else if (EWC_la == headLetter) { if (!isConsonantThatTakesLaMgo(rootLetter)) { return internalThrowThing(throwIfIllegal, + errorBuf, "The head letter la cannot be used with that root letter."); } } else if (EWC_sa == headLetter) { @@ -774,15 +804,18 @@ public class LegalTshegBar // handle a common error specially: if (EWC_la == rootLetter) return internalThrowThing(throwIfIllegal, + errorBuf, "sa cannot be a head letter atop the root letter la. You probably meant to have sa the root letter and la the subjoined letter."); return internalThrowThing(throwIfIllegal, + errorBuf, "The head letter sa cannot be used with that root letter."); } } else { // '\u0F6A' is not a valid head letter, even for // "rnya". Use EWC_ra instead. return internalThrowThing(throwIfIllegal, + errorBuf, "The head letter given is not valid."); } } // headLetter tests @@ -796,16 +829,20 @@ public class LegalTshegBar { if (EWC_achen == vowel) return internalThrowThing(throwIfIllegal, + errorBuf, "The vowel given is not valid. Use EW_ABSENT for the EWC_achen sound."); if ('\u0F71' == vowel) return internalThrowThing(throwIfIllegal, - "a-chung cannot be used in a simple Tibetan syllable."); + errorBuf, + "a-chung cannot be used in a simple Tibetan syllable."); // DLC FIXME: what about pA? 
return internalThrowThing(throwIfIllegal, + errorBuf, "The vowel given is not valid."); } } // Phew. We got here, so this combination of inputs is valid. + // Do nothing to errorBuf. return true; } diff --git a/source/org/thdl/tib/text/tshegbar/LegalTshegBarTest.java b/source/org/thdl/tib/text/tshegbar/LegalTshegBarTest.java index fc4a0c1..1749648 100644 --- a/source/org/thdl/tib/text/tshegbar/LegalTshegBarTest.java +++ b/source/org/thdl/tib/text/tshegbar/LegalTshegBarTest.java @@ -40,29 +40,123 @@ public class LegalTshegBarTest extends TestCase implements UnicodeConstants { /** Tests the getThdlWylie() method and one of the constructors. */ public void testGetThdlWylie() { - assertTrue(new LegalTshegBar(EWC_ba, EWC_sa, EWC_ga, EWSUB_ra_btags, + assertTrue(new LegalTshegBar(EWC_ba, EWC_sa, EWC_ga, EWC_ra, false, true, EWC_la, EWC_sa, EWV_o).getThdlWylie().toString().equals("bsgrAols")); assertTrue(new LegalTshegBar(EWC_ba, EWC_sa, EWC_ga, - EWSUB_ra_btags, true, true, + EWC_ra, true, true, EWC_la, EWC_sa, EWV_o).getThdlWylie().toString().equals("bsgrwAols")); assertTrue(new LegalTshegBar(EWC_ba, EWC_sa, EWC_ga, - EWSUB_ra_btags, false, false, + EWC_ra, false, false, EWC_la, EWC_sa, EWV_o).getThdlWylie().toString().equals("bsgrols")); + assertTrue(new LegalTshegBar(EWC_ba, EW_ABSENT, EWC_ta, + EW_ABSENT, false, false, + EWC_nga, EW_ABSENT, EW_ABSENT).getThdlWylie().toString().equals("btang")); + + // dga and dag are fun, as both are represented by "\u0F51\u0F42": + { + assertTrue(new LegalTshegBar(EWC_da, EW_ABSENT, EWC_ga, + EW_ABSENT, false, false, + EW_ABSENT, EW_ABSENT, EW_ABSENT).getThdlWylie().toString().equals("dga")); + assertTrue(new LegalTshegBar(EW_ABSENT, EW_ABSENT, EWC_da, + EW_ABSENT, false, false, + EWC_ga, EW_ABSENT, EW_ABSENT).getThdlWylie().toString().equals("dag")); + } + + assertTrue(new LegalTshegBar(EW_ABSENT, EWC_ra, EWC_da, + EW_ABSENT, false, false, + EWC_ga, EW_ABSENT, EW_ABSENT).getThdlWylie().toString().equals("rdag")); + assertTrue(new 
LegalTshegBar(EWC_ba, EWC_ra, EWC_da, + EW_ABSENT, false, false, + EWC_ga, EW_ABSENT, EW_ABSENT).getThdlWylie().toString().equals("brdag")); + + assertTrue(new LegalTshegBar(EW_ABSENT, EW_ABSENT, EWC_nga, + EW_ABSENT, false, false, + "\u0F60\u0F72", EW_ABSENT, EW_ABSENT).getThdlWylie().toString().equals("nga'i")); + + assertTrue(new LegalTshegBar(EW_ABSENT, EW_ABSENT, EWC_nga, + EW_ABSENT, false, false, + null, EW_ABSENT, EW_ABSENT).getThdlWylie().toString().equals("nga")); + + assertTrue(new LegalTshegBar(EW_ABSENT, EW_ABSENT, EWC_sa, + EWC_la, false, false, + null, EW_ABSENT, EW_ABSENT).getThdlWylie().toString().equals("sla")); + + { + boolean threw = false; + try { + new LegalTshegBar(EW_ABSENT, EWC_sa, EWC_la, + EW_ABSENT, false, false, + null, EW_ABSENT, EW_ABSENT); + } catch (IllegalArgumentException e) { + threw = true; + } + assertTrue(threw); + } } /** Tests the formsLegalTshegBar(..) method. DLC FIXME: but * doesn't test it very well. */ public void testFormsLegalTshegBar() { + StringBuffer eb = new StringBuffer(); + // Ensure that EWTS's jskad is not legal: - assertTrue(!LegalTshegBar.formsLegalTshegBar(EWC_ja, EWC_sa, - EWC_ka, EW_ABSENT, - false, false, - EW_ABSENT, EWC_da, - EW_ABSENT)); + { + assertTrue(!LegalTshegBar.formsLegalTshegBar(EWC_ja, EWC_sa, + EWC_ka, EW_ABSENT, + false, false, + EW_ABSENT, EWC_da, + EW_ABSENT, eb)); + } + assertTrue(LegalTshegBar.formsLegalTshegBar(EWC_ba, EW_ABSENT, EWC_ta, EW_ABSENT, false, false, EWC_da, EW_ABSENT, - EW_ABSENT)); + EW_ABSENT, eb)); + + // test that there's only one way to make dwa: + assertTrue(!LegalTshegBar.formsLegalTshegBar(EW_ABSENT, EW_ABSENT, + EWC_da, EWSUB_wa_zur, + false, false, + EW_ABSENT, EW_ABSENT, + EW_ABSENT, eb)); + assertTrue(!LegalTshegBar.formsLegalTshegBar(EW_ABSENT, EW_ABSENT, + EWC_da, EWC_wa, + false, false, + EW_ABSENT, EW_ABSENT, + EW_ABSENT, eb)); + boolean result + = LegalTshegBar.formsLegalTshegBar(EW_ABSENT, EW_ABSENT, + EWC_da, EW_ABSENT, + true, false, + EW_ABSENT, 
EW_ABSENT, + EW_ABSENT, eb); + assertTrue(eb.toString(), result); + } + + /** Tests the behavior of the constructors. */ + public void testConstructors() { + boolean x; + + x = false; + try { + new LegalTshegBar(EWC_ba, EWC_sa, EWC_ga, + EWSUB_ra_btags, false, false, + EWC_la, EWC_sa, EWV_o); + } catch (IllegalArgumentException e) { + x = true; + } + assertTrue(x); + + x = false; + try { + new LegalTshegBar(EWC_ba, EWC_sa, EWC_ga, + EWSUB_ra_btags, false, false, + new String(new char[] { EWC_la }), EWC_sa, + EWV_o); + } catch (IllegalArgumentException e) { + x = true; + } + assertTrue(x); } } diff --git a/source/org/thdl/tib/text/tshegbar/UnicodeUtils.java b/source/org/thdl/tib/text/tshegbar/UnicodeUtils.java index 3df96fb..c23a60a 100644 --- a/source/org/thdl/tib/text/tshegbar/UnicodeUtils.java +++ b/source/org/thdl/tib/text/tshegbar/UnicodeUtils.java @@ -30,11 +30,11 @@ public class UnicodeUtils implements UnicodeConstants { /** Returns true iff x is a Unicode codepoint that represents a consonant or two-consonant stack that has a Unicode code point. Returns true only for the usual suspects (like - \u0F40) and for Sanskrit consonants (like - \u0F71) and the simple two-consonant stacks in - Unicode (like \u0F43). Returns false for, among + U+0F40) and for Sanskrit consonants (like + U+0F4C) and the simple two-consonant stacks in + Unicode (like U+0F43). Returns false for, among other things, subjoined consonants like - \u0F90. */ + U+0F90. */ public static boolean isNonSubjoinedConsonant(char x) { return ((x != '\u0F48' /* reserved in Unicode 3.2, but not in use */) && (x >= '\u0F40' && x <= '\u0F6A')); @@ -43,11 +43,11 @@ public class UnicodeUtils implements UnicodeConstants { /** Returns true iff x is a Unicode codepoint that represents a subjoined consonant or subjoined two-consonant stack that has a Unicode code point.
Returns true only for the usual - suspects (like \u0F90) and for Sanskrit - consonants (like \u0F9C) and the simple - two-consonant stacks in Unicode (like \u0FAC). + suspects (like U+0F90) and for Sanskrit + consonants (like U+0F9C) and the simple + two-consonant stacks in Unicode (like U+0FAC). Returns false for, among other things, non-subjoined - consonants like \u0F40. */ + consonants like U+0F40. */ public static boolean isSubjoinedConsonant(char x) { return ((x != '\u0F98' /* reserved in Unicode 3.2, but not in use */) && (x >= '\u0F90' && x <= '\u0FBC')); @@ -56,13 +56,13 @@ public class UnicodeUtils implements UnicodeConstants { /** Returns true iff x is the preferred representation of a Tibetan or Sanskrit consonant and cannot be broken down any further. Returns false for, among other things, subjoined - consonants like \u0F90, two-component consonants - like \u0F43, and fixed-form consonants like - '\u0F6A'. The new consonants (for transcribing Chinese, I - believe) "\u0F55\u0F39" (which EWTS calls "fa"), - "\u0F56\u0F39" ("va"), and "\u0F5F\u0F39" ("Dza") are - two-codepoint sequences, but you should be aware of them - also. */ + consonants like U+0F90, two-component consonants + like U+0F43, and fixed-form consonants like + U+0F6A. The new consonants (for transcribing + Chinese, I believe) "\u0F55\u0F39" (which EWTS calls + "fa"), "\u0F56\u0F39" ("va"), and + "\u0F5F\u0F39" ("Dza") are two-codepoint sequences, + but you should be aware of them also. */ public static boolean isPreferredFormOfConsonant(char x) { return ((x != '\u0F48' /* reserved in Unicode 3.2, but not in use */) && (x >= '\u0F40' && x <= '\u0F68') @@ -97,7 +97,7 @@ public class UnicodeUtils implements UnicodeConstants { Unicode codepoints, into either Normalization Form KD (NFKD), D (NFD), or THDL (NFTHDL), depending on the value of normForm. 
NFD and NFKD are specified by Unicode 3.2; NFTHDL is needed - for {@link org.thdl.tib.text.tshegbar#UnicodeGraphemeCluster} + for {@link org.thdl.tib.text.tshegbar.UnicodeGraphemeCluster} because NFKD normalizes U+0F0C and neither NFD nor NFKD breaks down U+0F00 into its constituent codepoints. NFTHDL uses a maximum of codepoints, and it never @@ -247,7 +247,7 @@ public class UnicodeUtils implements UnicodeConstants { /** Returns true iff ch corresponds to the Tibetan letter wa. Several Unicode codepoints correspond to the Tibetan letter - wa. Oftentimes, \u0F5D is thought of as the + wa. Oftentimes, U+0F5D is thought of as the nominal representation. */ public static boolean isWa(char ch) { return ('\u0F5D' == ch @@ -257,7 +257,7 @@ public class UnicodeUtils implements UnicodeConstants { /** Returns true iff ch corresponds to the Tibetan letter ya. Several Unicode codepoints correspond to the Tibetan letter - ya. Oftentimes, \u0F61 is thought of as the + ya. Oftentimes, U+0F61 is thought of as the nominal representation. */ public static boolean isYa(char ch) { return ('\u0F61' == ch @@ -267,7 +267,7 @@ public class UnicodeUtils implements UnicodeConstants { /** Returns true iff there exists at least one codepoint cp in unicodeString such that cp {@link #isRa(char) is ra} or contains - ra (like \u0F77). This method is not implemented + ra (like U+0F77). This method is not implemented as fast as it could be. It calls on the canonicalization code in order to maximize reuse and minimize the possibility of coder error. */ @@ -298,6 +298,9 @@ public class UnicodeUtils implements UnicodeConstants { return "\\u" + Integer.toHexString((int)cp); } + /** + * Returns a human-readable, ASCII form of the String s of Unicode + * codepoints. 
*/ public static String unicodeStringToString(String s) { StringBuffer sb = new StringBuffer(s.length() * 6); for (int i = 0; i < s.length(); i++) { diff --git a/source/org/thdl/tib/text/tshegbar/UnicodeUtilsTest.java b/source/org/thdl/tib/text/tshegbar/UnicodeUtilsTest.java index 8527867..309869d 100644 --- a/source/org/thdl/tib/text/tshegbar/UnicodeUtilsTest.java +++ b/source/org/thdl/tib/text/tshegbar/UnicodeUtilsTest.java @@ -40,10 +40,13 @@ public class UnicodeUtilsTest extends TestCase implements UnicodeConstants { } /** Tests Unicode Normalization form KD for Tibetan codepoints. - See Unicode, Inc.'s NormalizationTest-3.2.0.txt. This - contains all test cases for - U+0F00-U+0FFF there, and a few - more. */ + * See Unicode, Inc.'s NormalizationTest-3.2.0.txt. This + * contains all test cases for + * U+0F00-U+0FFF there, and a few more. + * Tests both {@link + * UnicodeUtils#toMostlyDecomposedUnicode(String, byte)} and + * {@link UnicodeUtils#toMostlyDecomposedUnicode(StringBuffer, + * byte)}.*/ public void testMostlyNFKD() { assertTrue(UnicodeUtils.toMostlyDecomposedUnicode("\u0F0B", NORM_NFKD).equals("\u0F0B")); assertTrue(UnicodeUtils.toMostlyDecomposedUnicode("\u0F40", NORM_NFKD).equals("\u0F40")); @@ -112,10 +115,13 @@ public class UnicodeUtilsTest extends TestCase implements UnicodeConstants { } /** Tests Unicode Normalization form D for Tibetan codepoints. - See Unicode, Inc.'s NormalizationTest-3.2.0.txt. This - contains all test cases for - U+0F00-U+0FFF there, and a few - more. */ + * See Unicode, Inc.'s NormalizationTest-3.2.0.txt. This + * contains all test cases for + * U+0F00-U+0FFF there, and a few more. 
+ * Tests both {@link + * UnicodeUtils#toMostlyDecomposedUnicode(String, byte)} and + * {@link UnicodeUtils#toMostlyDecomposedUnicode(StringBuffer, + * byte)}.*/ public void testMostlyNFD() { assertTrue(UnicodeUtils.toMostlyDecomposedUnicode("\u0F0B", NORM_NFD).equals("\u0F0B")); assertTrue(UnicodeUtils.toMostlyDecomposedUnicode("\u0F40", NORM_NFD).equals("\u0F40")); @@ -184,10 +190,13 @@ public class UnicodeUtilsTest extends TestCase implements UnicodeConstants { } /** Tests Unicode Normalization form THDL for Tibetan codepoints. - See Unicode, Inc.'s NormalizationTest-3.2.0.txt. This - contains all test cases for - U+0F00-U+0FFF there, and a few - more. */ + * See Unicode, Inc.'s NormalizationTest-3.2.0.txt. This + * contains all test cases for + * U+0F00-U+0FFF there, and a few more. + * Tests both {@link + * UnicodeUtils#toMostlyDecomposedUnicode(String, byte)} and + * {@link UnicodeUtils#toMostlyDecomposedUnicode(StringBuffer, + * byte)}. */ public void testMostlyNFTHDL() { assertTrue(UnicodeUtils.toMostlyDecomposedUnicode("\u0F0B", NORM_NFTHDL).equals("\u0F0B")); assertTrue(UnicodeUtils.toMostlyDecomposedUnicode("\u0F40", NORM_NFTHDL).equals("\u0F40")); @@ -253,10 +262,36 @@ public class UnicodeUtilsTest extends TestCase implements UnicodeConstants { assertTrue(UnicodeUtils.toMostlyDecomposedUnicode("\u0F79", NORM_NFTHDL).equals("\u0FB3\u0F71\u0F80")); assertTrue(UnicodeUtils.toMostlyDecomposedUnicode("\u0FB3\u0F81", NORM_NFTHDL).equals("\u0FB3\u0F71\u0F80")); assertTrue(UnicodeUtils.toMostlyDecomposedUnicode("\u0FB3\u0F71\u0F80", NORM_NFTHDL).equals("\u0FB3\u0F71\u0F80")); + + + assertTrue(UnicodeUtils.toMostlyDecomposedUnicode("", NORM_NFTHDL).equals("")); + + { + StringBuffer sb = new StringBuffer("\u0FAC"); + UnicodeUtils.toMostlyDecomposedUnicode(sb, NORM_NFTHDL); + assertTrue(sb.toString().equals("\u0FAB\u0FB7")); + } + { + StringBuffer sb = new StringBuffer("\u0F66"); + UnicodeUtils.toMostlyDecomposedUnicode(sb, NORM_NFTHDL); + 
assertTrue(sb.toString().equals("\u0F66")); + } + { + StringBuffer sb = new StringBuffer(""); + UnicodeUtils.toMostlyDecomposedUnicode(sb, NORM_NFTHDL); + assertTrue(sb.toString().equals("")); + } } /** Tests the containsRa method. */ public void testContainsRa() { + assertTrue(!UnicodeUtils.containsRa('\u0F69')); + assertTrue(!UnicodeUtils.containsRa('\u0FB1')); + assertTrue(!UnicodeUtils.containsRa('\u0F48')); + assertTrue(!UnicodeUtils.containsRa('\u0060')); + assertTrue(!UnicodeUtils.containsRa('\uFFFF')); + assertTrue(!UnicodeUtils.containsRa('\uFFFF')); + assertTrue(UnicodeUtils.containsRa('\u0FB2')); assertTrue(UnicodeUtils.containsRa('\u0F77')); assertTrue(UnicodeUtils.containsRa('\u0F76')); @@ -264,4 +299,84 @@ public class UnicodeUtilsTest extends TestCase implements UnicodeConstants { assertTrue(UnicodeUtils.containsRa('\u0F62')); assertTrue(UnicodeUtils.containsRa('\u0FBC')); } + + /** + * Tests the {@link UnicodeUtils#unicodeStringToString(String)} + * method. */ + public void testUnicodeStringToString() { + assertTrue(UnicodeUtils.unicodeStringToString("\u0000").equals("\\u0000")); + assertTrue(UnicodeUtils.unicodeStringToString("\u0001").equals("\\u0001")); + assertTrue(UnicodeUtils.unicodeStringToString("\u000F").equals("\\u000f")); + assertTrue(UnicodeUtils.unicodeStringToString("\u001F").equals("\\u001f")); + assertTrue(UnicodeUtils.unicodeStringToString("\u00fF").equals("\\u00ff")); + assertTrue(UnicodeUtils.unicodeStringToString("\u01fF").equals("\\u01ff")); + assertTrue(UnicodeUtils.unicodeStringToString("\u0ffF").equals("\\u0fff")); + assertTrue(UnicodeUtils.unicodeStringToString("\u1ffF").equals("\\u1fff")); + assertTrue(UnicodeUtils.unicodeStringToString("\ufffF").equals("\\uffff")); + + assertTrue(UnicodeUtils.unicodeStringToString("\u0F00\u0091\uABCD\u0FFF\u0Ff1\uFFFF\u0000").equals("\\u0f00\\u0091\\uabcd\\u0fff\\u0ff1\\uffff\\u0000")); + } + + /** + * Tests the {@link UnicodeUtils#unicodeCodepointToString(char)} + * method. 
*/ + public void testUnicodeCodepointToString() { + assertTrue(UnicodeUtils.unicodeCodepointToString('\u0000').equals("\\u0000")); + assertTrue(UnicodeUtils.unicodeCodepointToString('\u0001').equals("\\u0001")); + assertTrue(UnicodeUtils.unicodeCodepointToString('\u000F').equals("\\u000f")); + assertTrue(UnicodeUtils.unicodeCodepointToString('\u001F').equals("\\u001f")); + assertTrue(UnicodeUtils.unicodeCodepointToString('\u00fF').equals("\\u00ff")); + assertTrue(UnicodeUtils.unicodeCodepointToString('\u01fF').equals("\\u01ff")); + assertTrue(UnicodeUtils.unicodeCodepointToString('\u0ffF').equals("\\u0fff")); + assertTrue(UnicodeUtils.unicodeCodepointToString('\u1ffF').equals("\\u1fff")); + assertTrue(UnicodeUtils.unicodeCodepointToString('\ufffF').equals("\\uffff")); + } + + /** + * Tests the {@link UnicodeUtils#isEntirelyTibetanUnicode(String)} + * method. */ + public void testIsEntirelyTibetanUnicode() { + assertTrue(UnicodeUtils.isEntirelyTibetanUnicode("\u0F00\u0FFF\u0F00\u0F1e\u0F48")); // U+0F48 is reserved, but in the range. + assertTrue(!UnicodeUtils.isEntirelyTibetanUnicode("\u0F00\u1000\u0FFF\u0F00\u0F1e\u0F48")); // U+0F48 is reserved, but in the range. + } + + /** + * Tests the {@link UnicodeUtils#isTibetanConsonant(char)} + * method. 
*/ + public void testIsTibetanConsonant() { + assertTrue(!UnicodeUtils.isTibetanConsonant('\u0000')); + assertTrue(!UnicodeUtils.isTibetanConsonant('\uF000')); + assertTrue(!UnicodeUtils.isTibetanConsonant('\u0EFF')); + assertTrue(!UnicodeUtils.isTibetanConsonant('\u1000')); + assertTrue(!UnicodeUtils.isTibetanConsonant('\u0F00')); + assertTrue(!UnicodeUtils.isTibetanConsonant('\u0FFF')); + + assertTrue(UnicodeUtils.isTibetanConsonant('\u0FB2')); + assertTrue(UnicodeUtils.isTibetanConsonant('\u0F6A')); + assertTrue(UnicodeUtils.isTibetanConsonant('\u0F40')); + assertTrue(UnicodeUtils.isTibetanConsonant('\u0F50')); + assertTrue(UnicodeUtils.isTibetanConsonant('\u0FBC')); + assertTrue(UnicodeUtils.isTibetanConsonant('\u0FB9')); + assertTrue(UnicodeUtils.isTibetanConsonant('\u0FB0')); + assertTrue(UnicodeUtils.isTibetanConsonant('\u0FAD')); + assertTrue(UnicodeUtils.isTibetanConsonant('\u0FA6')); + assertTrue(UnicodeUtils.isTibetanConsonant('\u0F90')); + assertTrue(UnicodeUtils.isTibetanConsonant('\u0F91')); + + // reserved codepoints: + assertTrue(!UnicodeUtils.isTibetanConsonant('\u0F48')); + assertTrue(!UnicodeUtils.isTibetanConsonant('\u0F98')); + } + + /** + * Tests the {@link UnicodeUtils#isInTibetanRange(char)} + * method. */ + public void testIsInTibetanRange() { + assertTrue(!UnicodeUtils.isInTibetanRange('\u0000')); + assertTrue(!UnicodeUtils.isInTibetanRange('\u0100')); + assertTrue(!UnicodeUtils.isInTibetanRange('\u1000')); + assertTrue(UnicodeUtils.isInTibetanRange('\u0F00')); + assertTrue(UnicodeUtils.isInTibetanRange('\u0FF0')); + assertTrue(UnicodeUtils.isInTibetanRange('\u0FFF')); + } }