More and better tests; fixed some bugs in LegalTshegBar.

This commit is contained in:
dchandler 2003-03-28 03:49:49 +00:00
parent 35a9869aac
commit 2b81020b0e
4 changed files with 317 additions and 68 deletions

View file

@ -100,13 +100,11 @@ And also there are cases where they combine. For ex you can have
* </ul> * </ul>
* *
* <p>Note that this class uses only a subset of Unicode to represent * <p>Note that this class uses only a subset of Unicode to represent
* consonants and vowels. In some situations, you should use {@link * consonants and vowels. You should always use the nominal form of
* #EWSUB_wa_zur} to represent the consonant wa, while in others you * a letter, e.g. {@link #EWC_wa}, not {@link #EWSUB_wa_zur}, to
* should use {@link #EWC_wa}, even though you mean to subscribe a * represent letters. (What if you mean to subscribe a fixed-form
* fixed-form wa. Basically, stick to the codepoints for which * wa? Well, that's not a legal tsheg-bar, so you don't mean to do
* enumerations exist in {@link * that.)</p>
* org.thdl.tib.text.tshegbar.UnicodeConstants} and use your common
* sense.</p>
* *
* <p>For a pretty good, concise summary of the rules this class * <p>For a pretty good, concise summary of the rules this class
* knows about, see Joe B. Wilson's <i>Translating Buddhism from * knows about, see Joe B. Wilson's <i>Translating Buddhism from
@ -142,8 +140,6 @@ public class LegalTshegBar
/** Do not use this constructor. */ /** Do not use this constructor. */
private LegalTshegBar() { super(); } private LegalTshegBar() { super(); }
// DLC FIXME: do we want to accept EWC_ra or EWSUB_ra_btags for
// the root letter, even if there is no head letter? Etc.
/** Constructs a valid Tibetan syllable or throws an exception. /** Constructs a valid Tibetan syllable or throws an exception.
* Use EW_ABSENT (or null in the case of <code>suffix</code>) for * Use EW_ABSENT (or null in the case of <code>suffix</code>) for
* those parts of the syllable that are absent. The root letter * those parts of the syllable that are absent. The root letter
@ -180,7 +176,7 @@ public class LegalTshegBar
// copying is slightly inefficient because it is unnecessary // copying is slightly inefficient because it is unnecessary
// since Java strings are read-only, but translating this code // since Java strings are read-only, but translating this code
// to C++ is easier this way. // to C++ is easier this way.
this.suffix = new String(suffix); this.suffix = (suffix == null) ? null : new String(suffix);
this.postsuffix = postsuffix; this.postsuffix = postsuffix;
this.vowel = vowel; this.vowel = vowel;
@ -198,7 +194,8 @@ public class LegalTshegBar
throws IllegalArgumentException throws IllegalArgumentException
{ {
this(prefix, headLetter, rootLetter, subjoinedLetter, this(prefix, headLetter, rootLetter, subjoinedLetter,
hasWaZur, hasAChung, new String(new char[] { suffix }), hasWaZur, hasAChung,
(suffix == EW_ABSENT) ? null : new String(new char[] { suffix }),
postsuffix, vowel); postsuffix, vowel);
} }
@ -216,7 +213,10 @@ public class LegalTshegBar
} }
/** Returns the non-EWSUB_wa_zur consonant subscribed to the root /** Returns the non-EWSUB_wa_zur consonant subscribed to the root
* consonant, or EW_ABSENT if none is. If you want to know if there is a wa-zur, use {@link #hasWaZurSubjoinedToRootLetter()}*/ * consonant, or EW_ABSENT if none is. If you want to know if
* there is a wa-zur, use {@link
* #hasWaZurSubjoinedToRootLetter()}. This returns EWC_ra, not
* EWSUB_ra_btags, etc. */
public char getSubjoinedLetter() { public char getSubjoinedLetter() {
return subjoinedLetter; return subjoinedLetter;
} }
@ -458,11 +458,11 @@ public class LegalTshegBar
if (EW_ABSENT == subjoinedLetter) { if (EW_ABSENT == subjoinedLetter) {
return isConsonantThatTakesWaZur(rootLetter); return isConsonantThatTakesWaZur(rootLetter);
} }
if (EWSUB_ra_btags == subjoinedLetter) { if (EWC_ra == subjoinedLetter) {
if (EWC_ga == rootLetter if (EWC_ga == rootLetter
|| EWC_da == rootLetter) || EWC_da == rootLetter)
return true; return true;
} else if (EWSUB_ya_btags == subjoinedLetter) { } else if (EWC_ya == subjoinedLetter) {
if (EWC_pha == rootLetter) if (EWC_pha == rootLetter)
return true; return true;
} }
@ -599,6 +599,9 @@ public class LegalTshegBar
* this is {@link #getConnectiveCaseSuffix()} * this is {@link #getConnectiveCaseSuffix()}
* @param postsuffix the optional postsuffix, which should be * @param postsuffix the optional postsuffix, which should be
* EWC_sa or EWC_da * EWC_sa or EWC_da
* @param errorBuffer if non-null and this method returns false,
* the reason that this is not a legal tsheg-bar is appended to
* errorBuffer.
* @param vowel the optional vowel */ * @param vowel the optional vowel */
public static boolean formsLegalTshegBar(char prefix, public static boolean formsLegalTshegBar(char prefix,
char headLetter, char headLetter,
@ -608,12 +611,14 @@ public class LegalTshegBar
boolean hasAChung, boolean hasAChung,
String suffix, String suffix,
char postsuffix, char postsuffix,
char vowel) char vowel,
StringBuffer errorBuffer)
{ {
try { try {
return internalLegalityTest(prefix, headLetter, rootLetter, return internalLegalityTest(prefix, headLetter, rootLetter,
subjoinedLetter, hasWaZur, hasAChung, subjoinedLetter, hasWaZur, hasAChung,
suffix, postsuffix, vowel, false); suffix, postsuffix, vowel, false,
errorBuffer);
} catch (IllegalArgumentException e) { } catch (IllegalArgumentException e) {
throw new Error("This simply cannot happen, but it did."); throw new Error("This simply cannot happen, but it did.");
} }
@ -631,12 +636,15 @@ public class LegalTshegBar
boolean hasAChung, boolean hasAChung,
char suffix, char suffix,
char postsuffix, char postsuffix,
char vowel) char vowel,
StringBuffer errorBuffer)
{ {
return formsLegalTshegBar(prefix, headLetter, rootLetter, return formsLegalTshegBar(prefix, headLetter, rootLetter,
subjoinedLetter, hasWaZur, hasAChung, subjoinedLetter, hasWaZur, hasAChung,
new String(new char[] { suffix }), ((suffix == EW_ABSENT)
postsuffix, vowel); ? null
: new String(new char[] { suffix })),
postsuffix, vowel, errorBuffer);
} }
@ -659,12 +667,17 @@ public class LegalTshegBar
{ {
internalLegalityTest(prefix, headLetter, rootLetter, internalLegalityTest(prefix, headLetter, rootLetter,
subjoinedLetter, hasWaZur, hasAChung, subjoinedLetter, hasWaZur, hasAChung,
suffix, postsuffix, vowel, true); suffix, postsuffix, vowel, true, null);
} }
/** Voodoo. Stand back. */ /** Voodoo. Stand back. */
private static boolean internalThrowThing(boolean doThrow, String msg) private static boolean internalThrowThing(boolean doThrow,
StringBuffer errorBuf,
String msg)
{ {
if (errorBuf != null) {
errorBuf.append(msg);
}
if (doThrow) if (doThrow)
throw new IllegalArgumentException(msg); throw new IllegalArgumentException(msg);
return false; return false;
@ -674,6 +687,8 @@ public class LegalTshegBar
* thrown, then this combination makes a legal Tibetan syllable. * thrown, then this combination makes a legal Tibetan syllable.
* To learn about the arguments, see {@link * To learn about the arguments, see {@link
* #formsLegalTshegBar(char,char,char,char,boolean,boolean,String,char,char)}. * #formsLegalTshegBar(char,char,char,char,boolean,boolean,String,char,char)}.
* @param errorBuf if non-null and this syllable is illegal, the
* reason it is illegal is appended to this buffer
* @return true if this syllable is legal, false if this syllable * @return true if this syllable is legal, false if this syllable
* is illegal and throwIfIllegal is false, does not return if * is illegal and throwIfIllegal is false, does not return if
* this syllable is illegal and throwIfIllegal is true * this syllable is illegal and throwIfIllegal is true
@ -689,11 +704,13 @@ public class LegalTshegBar
String suffix, String suffix,
char postsuffix, char postsuffix,
char vowel, char vowel,
boolean throwIfIllegal) boolean throwIfIllegal,
StringBuffer errorBuf)
throws IllegalArgumentException throws IllegalArgumentException
{ {
if (!isNominalRepresentationOfConsonant(rootLetter)) if (!isNominalRepresentationOfConsonant(rootLetter))
return internalThrowThing(throwIfIllegal, return internalThrowThing(throwIfIllegal,
errorBuf,
"The root letter must be one of the standard thirty Tibetan consonants, and must be represented nominally, not, for example, by FIXED-FORM RA (&#92;u0F6A)"); "The root letter must be one of the standard thirty Tibetan consonants, and must be represented nominally, not, for example, by FIXED-FORM RA (&#92;u0F6A)");
if (EW_ABSENT != prefix) { if (EW_ABSENT != prefix) {
@ -701,28 +718,34 @@ public class LegalTshegBar
// and that it can go with this root letter: // and that it can go with this root letter:
if (!isNominalRepresentationOfPrefix(prefix)) if (!isNominalRepresentationOfPrefix(prefix))
return internalThrowThing(throwIfIllegal, return internalThrowThing(throwIfIllegal,
errorBuf,
"The prefix is not absent, so it must be one of the five possible prefixes."); "The prefix is not absent, so it must be one of the five possible prefixes.");
// DLC test that it can go with the root letter. // DLC test that it can go with the root letter.
} }
if (EW_ABSENT != subjoinedLetter) { if (EW_ABSENT != subjoinedLetter) {
if (EWSUB_ya_btags == subjoinedLetter) { if (EWC_ya == subjoinedLetter) {
if (!isConsonantThatTakesYaBtags(rootLetter)) { if (!isConsonantThatTakesYaBtags(rootLetter)) {
return internalThrowThing(throwIfIllegal, return internalThrowThing(throwIfIllegal,
errorBuf,
"Cannot subscribe ya-btags to that root letter."); "Cannot subscribe ya-btags to that root letter.");
} }
} else if (EWSUB_ra_btags == subjoinedLetter) { } else if (EWC_ra == subjoinedLetter) {
if (!isConsonantThatTakesRaBtags(rootLetter)) { if (!isConsonantThatTakesRaBtags(rootLetter)) {
return internalThrowThing(throwIfIllegal, return internalThrowThing(throwIfIllegal,
errorBuf,
"Cannot subscribe ra-btags to that root letter."); "Cannot subscribe ra-btags to that root letter.");
} }
} else if (EWSUB_la_btags == subjoinedLetter) { } else if (EWC_la == subjoinedLetter) {
if (!isConsonantThatTakesLaBtags(rootLetter)) { if (!isConsonantThatTakesLaBtags(rootLetter)) {
return internalThrowThing(throwIfIllegal, return internalThrowThing(throwIfIllegal,
errorBuf,
"Cannot subscribe la-btags to that root letter."); "Cannot subscribe la-btags to that root letter.");
} }
} else if (EWSUB_wa_zur == subjoinedLetter) { } else if (EWC_wa == subjoinedLetter) {
throw new Error("DLC FIXME: can this happen? wa-zur comes in via the boolean argument hasWaZur, not via subjoinedLetter."); return internalThrowThing(throwIfIllegal,
errorBuf,
"The presence of wa-zur must be specified via a boolean parameter.");
} else { } else {
// check for a common mistake: // check for a common mistake:
if ('\u0FBA' == subjoinedLetter if ('\u0FBA' == subjoinedLetter
@ -730,9 +753,11 @@ public class LegalTshegBar
|| '\u0FBC' == subjoinedLetter) || '\u0FBC' == subjoinedLetter)
{ {
return internalThrowThing(throwIfIllegal, return internalThrowThing(throwIfIllegal,
errorBuf,
"The subjoined letter given is subjoinable, but you gave the fixed-form variant, which is not used in Tibetan syllables but is sometimes used in Tibetan transliteration of Sanskrit, Chinese, or some non-Tibetan language."); "The subjoined letter given is subjoinable, but you gave the fixed-form variant, which is not used in Tibetan syllables but is sometimes used in Tibetan transliteration of Sanskrit, Chinese, or some non-Tibetan language.");
} }
return internalThrowThing(throwIfIllegal, return internalThrowThing(throwIfIllegal,
errorBuf,
"The subjoined letter given is not one of the four consonants that may be subscribed."); "The subjoined letter given is not one of the four consonants that may be subscribed.");
} }
} // subjoinedLetter tests } // subjoinedLetter tests
@ -743,10 +768,12 @@ public class LegalTshegBar
if (!getConnectiveCaseSuffix().equals(suffix)) { if (!getConnectiveCaseSuffix().equals(suffix)) {
if (suffix.length() != 1) { if (suffix.length() != 1) {
return internalThrowThing(throwIfIllegal, return internalThrowThing(throwIfIllegal,
errorBuf,
"Illegal suffix -- not one of the legal complex suffixes like 'u, 'o, 'i, 'am."); "Illegal suffix -- not one of the legal complex suffixes like 'u, 'o, 'i, 'am.");
} }
if (!isNominalRepresentationOfSimpleSuffix(suffix.charAt(0))) { if (!isNominalRepresentationOfSimpleSuffix(suffix.charAt(0))) {
return internalThrowThing(throwIfIllegal, return internalThrowThing(throwIfIllegal,
errorBuf,
"Illegal suffix -- not one of the ten legal suffixes: " "Illegal suffix -- not one of the ten legal suffixes: "
+ UnicodeUtils.unicodeCodepointToString(suffix.charAt(0))); + UnicodeUtils.unicodeCodepointToString(suffix.charAt(0)));
} }
@ -755,6 +782,7 @@ public class LegalTshegBar
if (EW_ABSENT != postsuffix) { if (EW_ABSENT != postsuffix) {
if (null == suffix) if (null == suffix)
return internalThrowThing(throwIfIllegal, return internalThrowThing(throwIfIllegal,
errorBuf,
"You cannot have a postsuffix unless you also have a suffix."); "You cannot have a postsuffix unless you also have a suffix.");
} }
@ -762,11 +790,13 @@ public class LegalTshegBar
if (EWC_ra == headLetter) { if (EWC_ra == headLetter) {
if (!isConsonantThatTakesRaMgo(rootLetter)) { if (!isConsonantThatTakesRaMgo(rootLetter)) {
return internalThrowThing(throwIfIllegal, return internalThrowThing(throwIfIllegal,
errorBuf,
"The head letter ra cannot be used with that root letter."); "The head letter ra cannot be used with that root letter.");
} }
} else if (EWC_la == headLetter) { } else if (EWC_la == headLetter) {
if (!isConsonantThatTakesLaMgo(rootLetter)) { if (!isConsonantThatTakesLaMgo(rootLetter)) {
return internalThrowThing(throwIfIllegal, return internalThrowThing(throwIfIllegal,
errorBuf,
"The head letter la cannot be used with that root letter."); "The head letter la cannot be used with that root letter.");
} }
} else if (EWC_sa == headLetter) { } else if (EWC_sa == headLetter) {
@ -774,15 +804,18 @@ public class LegalTshegBar
// handle a common error specially: // handle a common error specially:
if (EWC_la == rootLetter) if (EWC_la == rootLetter)
return internalThrowThing(throwIfIllegal, return internalThrowThing(throwIfIllegal,
errorBuf,
"sa cannot be a head letter atop the root letter la. You probably meant to have sa the root letter and la the subjoined letter."); "sa cannot be a head letter atop the root letter la. You probably meant to have sa the root letter and la the subjoined letter.");
return internalThrowThing(throwIfIllegal, return internalThrowThing(throwIfIllegal,
errorBuf,
"The head letter sa cannot be used with that root letter."); "The head letter sa cannot be used with that root letter.");
} }
} else { } else {
// '&#92;u0F6A' is not a valid head letter, even for // '&#92;u0F6A' is not a valid head letter, even for
// "rnya". Use EWC_ra instead. // "rnya". Use EWC_ra instead.
return internalThrowThing(throwIfIllegal, return internalThrowThing(throwIfIllegal,
errorBuf,
"The head letter given is not valid."); "The head letter given is not valid.");
} }
} // headLetter tests } // headLetter tests
@ -796,16 +829,20 @@ public class LegalTshegBar
{ {
if (EWC_achen == vowel) if (EWC_achen == vowel)
return internalThrowThing(throwIfIllegal, return internalThrowThing(throwIfIllegal,
errorBuf,
"The vowel given is not valid. Use EW_ABSENT for the EWC_achen sound."); "The vowel given is not valid. Use EW_ABSENT for the EWC_achen sound.");
if ('\u0F71' == vowel) if ('\u0F71' == vowel)
return internalThrowThing(throwIfIllegal, return internalThrowThing(throwIfIllegal,
"a-chung cannot be used in a simple Tibetan syllable."); errorBuf,
"a-chung cannot be used in a simple Tibetan syllable."); // DLC FIXME: what about pA?
return internalThrowThing(throwIfIllegal, return internalThrowThing(throwIfIllegal,
errorBuf,
"The vowel given is not valid."); "The vowel given is not valid.");
} }
} }
// Phew. We got here, so this combination of inputs is valid. // Phew. We got here, so this combination of inputs is valid.
// Do nothing to errorBuf.
return true; return true;
} }

View file

@ -40,29 +40,123 @@ public class LegalTshegBarTest extends TestCase implements UnicodeConstants {
/** Tests the getThdlWylie() method and one of the constructors. */ /** Tests the getThdlWylie() method and one of the constructors. */
public void testGetThdlWylie() { public void testGetThdlWylie() {
assertTrue(new LegalTshegBar(EWC_ba, EWC_sa, EWC_ga, EWSUB_ra_btags, assertTrue(new LegalTshegBar(EWC_ba, EWC_sa, EWC_ga, EWC_ra,
false, true, EWC_la, EWC_sa, EWV_o).getThdlWylie().toString().equals("bsgrAols")); false, true, EWC_la, EWC_sa, EWV_o).getThdlWylie().toString().equals("bsgrAols"));
assertTrue(new LegalTshegBar(EWC_ba, EWC_sa, EWC_ga, assertTrue(new LegalTshegBar(EWC_ba, EWC_sa, EWC_ga,
EWSUB_ra_btags, true, true, EWC_ra, true, true,
EWC_la, EWC_sa, EWV_o).getThdlWylie().toString().equals("bsgrwAols")); EWC_la, EWC_sa, EWV_o).getThdlWylie().toString().equals("bsgrwAols"));
assertTrue(new LegalTshegBar(EWC_ba, EWC_sa, EWC_ga, assertTrue(new LegalTshegBar(EWC_ba, EWC_sa, EWC_ga,
EWSUB_ra_btags, false, false, EWC_ra, false, false,
EWC_la, EWC_sa, EWV_o).getThdlWylie().toString().equals("bsgrols")); EWC_la, EWC_sa, EWV_o).getThdlWylie().toString().equals("bsgrols"));
assertTrue(new LegalTshegBar(EWC_ba, EW_ABSENT, EWC_ta,
EW_ABSENT, false, false,
EWC_nga, EW_ABSENT, EW_ABSENT).getThdlWylie().toString().equals("btang"));
// dga and dag are fun, as both are represented by "\u0F51\u0F42":
{
assertTrue(new LegalTshegBar(EWC_da, EW_ABSENT, EWC_ga,
EW_ABSENT, false, false,
EW_ABSENT, EW_ABSENT, EW_ABSENT).getThdlWylie().toString().equals("dga"));
assertTrue(new LegalTshegBar(EW_ABSENT, EW_ABSENT, EWC_da,
EW_ABSENT, false, false,
EWC_ga, EW_ABSENT, EW_ABSENT).getThdlWylie().toString().equals("dag"));
}
assertTrue(new LegalTshegBar(EW_ABSENT, EWC_ra, EWC_da,
EW_ABSENT, false, false,
EWC_ga, EW_ABSENT, EW_ABSENT).getThdlWylie().toString().equals("rdag"));
assertTrue(new LegalTshegBar(EWC_ba, EWC_ra, EWC_da,
EW_ABSENT, false, false,
EWC_ga, EW_ABSENT, EW_ABSENT).getThdlWylie().toString().equals("brdag"));
assertTrue(new LegalTshegBar(EW_ABSENT, EW_ABSENT, EWC_nga,
EW_ABSENT, false, false,
"\u0F60\u0F72", EW_ABSENT, EW_ABSENT).getThdlWylie().toString().equals("nga'i"));
assertTrue(new LegalTshegBar(EW_ABSENT, EW_ABSENT, EWC_nga,
EW_ABSENT, false, false,
null, EW_ABSENT, EW_ABSENT).getThdlWylie().toString().equals("nga"));
assertTrue(new LegalTshegBar(EW_ABSENT, EW_ABSENT, EWC_sa,
EWC_la, false, false,
null, EW_ABSENT, EW_ABSENT).getThdlWylie().toString().equals("sla"));
{
boolean threw = false;
try {
new LegalTshegBar(EW_ABSENT, EWC_sa, EWC_la,
EW_ABSENT, false, false,
null, EW_ABSENT, EW_ABSENT);
} catch (IllegalArgumentException e) {
threw = true;
}
assertTrue(threw);
}
} }
/** Tests the formsLegalTshegBar(..) method. DLC FIXME: but /** Tests the formsLegalTshegBar(..) method. DLC FIXME: but
* doesn't test it very well. */ * doesn't test it very well. */
public void testFormsLegalTshegBar() { public void testFormsLegalTshegBar() {
StringBuffer eb = new StringBuffer();
// Ensure that EWTS's jskad is not legal: // Ensure that EWTS's jskad is not legal:
{
assertTrue(!LegalTshegBar.formsLegalTshegBar(EWC_ja, EWC_sa, assertTrue(!LegalTshegBar.formsLegalTshegBar(EWC_ja, EWC_sa,
EWC_ka, EW_ABSENT, EWC_ka, EW_ABSENT,
false, false, false, false,
EW_ABSENT, EWC_da, EW_ABSENT, EWC_da,
EW_ABSENT)); EW_ABSENT, eb));
}
assertTrue(LegalTshegBar.formsLegalTshegBar(EWC_ba, EW_ABSENT, assertTrue(LegalTshegBar.formsLegalTshegBar(EWC_ba, EW_ABSENT,
EWC_ta, EW_ABSENT, EWC_ta, EW_ABSENT,
false, false, false, false,
EWC_da, EW_ABSENT, EWC_da, EW_ABSENT,
EW_ABSENT)); EW_ABSENT, eb));
// test that there's only one way to make dwa:
assertTrue(!LegalTshegBar.formsLegalTshegBar(EW_ABSENT, EW_ABSENT,
EWC_da, EWSUB_wa_zur,
false, false,
EW_ABSENT, EW_ABSENT,
EW_ABSENT, eb));
assertTrue(!LegalTshegBar.formsLegalTshegBar(EW_ABSENT, EW_ABSENT,
EWC_da, EWC_wa,
false, false,
EW_ABSENT, EW_ABSENT,
EW_ABSENT, eb));
boolean result
= LegalTshegBar.formsLegalTshegBar(EW_ABSENT, EW_ABSENT,
EWC_da, EW_ABSENT,
true, false,
EW_ABSENT, EW_ABSENT,
EW_ABSENT, eb);
assertTrue(eb.toString(), result);
}
/** Tests the behavior of the constructors. */
public void testConstructors() {
boolean x;
x = false;
try {
new LegalTshegBar(EWC_ba, EWC_sa, EWC_ga,
EWSUB_ra_btags, false, false,
EWC_la, EWC_sa, EWV_o);
} catch (IllegalArgumentException e) {
x = true;
}
assertTrue(x);
x = false;
try {
new LegalTshegBar(EWC_ba, EWC_sa, EWC_ga,
EWSUB_ra_btags, false, false,
new String(new char[] { EWC_la }), EWC_sa,
EWV_o);
} catch (IllegalArgumentException e) {
x = true;
}
assertTrue(x);
} }
} }

View file

@ -30,11 +30,11 @@ public class UnicodeUtils implements UnicodeConstants {
/** Returns true iff x is a Unicode codepoint that represents a /** Returns true iff x is a Unicode codepoint that represents a
consonant or two-consonant stack that has a Unicode code consonant or two-consonant stack that has a Unicode code
point. Returns true only for the usual suspects (like point. Returns true only for the usual suspects (like
<code>&#92;u0F40</code>) and for Sanskrit consonants (like <code>U+0F40</code>) and for Sanskrit consonants (like
<code>&#92;u0F71</code>) and the simple two-consonant stacks in <code>U+0F71</code>) and the simple two-consonant stacks in
Unicode (like <code>&#92;u0F43</code>). Returns false for, among Unicode (like <code>U+0F43</code>). Returns false for, among
other things, subjoined consonants like other things, subjoined consonants like
<code>&#92;u0F90</code>. */ <code>U+0F90</code>. */
public static boolean isNonSubjoinedConsonant(char x) { public static boolean isNonSubjoinedConsonant(char x) {
return ((x != '\u0F48' /* reserved in Unicode 3.2, but not in use */) return ((x != '\u0F48' /* reserved in Unicode 3.2, but not in use */)
&& (x >= '\u0F40' && x <= '\u0F6A')); && (x >= '\u0F40' && x <= '\u0F6A'));
@ -43,11 +43,11 @@ public class UnicodeUtils implements UnicodeConstants {
/** Returns true iff x is a Unicode codepoint that represents a /** Returns true iff x is a Unicode codepoint that represents a
subjoined consonant or subjoined two-consonant stack that has subjoined consonant or subjoined two-consonant stack that has
a Unicode code point. Returns true only for the usual a Unicode code point. Returns true only for the usual
suspects (like <code>&#92;u0F90</code>) and for Sanskrit suspects (like <code>U+0F90</code>) and for Sanskrit
consonants (like <code>&#92;u0F9C</code>) and the simple consonants (like <code>U+0F9C</code>) and the simple
two-consonant stacks in Unicode (like <code>&#92;u0FAC</code>). two-consonant stacks in Unicode (like <code>U+0FAC</code>).
Returns false for, among other things, non-subjoined Returns false for, among other things, non-subjoined
consonants like <code>&#92;u0F40</code>. */ consonants like <code>U+0F40</code>. */
public static boolean isSubjoinedConsonant(char x) { public static boolean isSubjoinedConsonant(char x) {
return ((x != '\u0F98' /* reserved in Unicode 3.2, but not in use */) return ((x != '\u0F98' /* reserved in Unicode 3.2, but not in use */)
&& (x >= '\u0F90' && x <= '\u0FBC')); && (x >= '\u0F90' && x <= '\u0FBC'));
@ -56,13 +56,13 @@ public class UnicodeUtils implements UnicodeConstants {
/** Returns true iff x is the preferred representation of a /** Returns true iff x is the preferred representation of a
Tibetan or Sanskrit consonant and cannot be broken down any Tibetan or Sanskrit consonant and cannot be broken down any
further. Returns false for, among other things, subjoined further. Returns false for, among other things, subjoined
consonants like <code>&#92;u0F90</code>, two-component consonants consonants like <code>U+0F90</code>, two-component consonants
like <code>&#92;u0F43</code>, and fixed-form consonants like like <code>U+0F43</code>, and fixed-form consonants like
'&#92;u0F6A'. The new consonants (for transcribing Chinese, I <code>U+0F6A</code>. The new consonants (for transcribing
believe) "&#92;u0F55&#92;u0F39" (which EWTS calls "fa"), Chinese, I believe) "&#92;u0F55&#92;u0F39" (which EWTS calls
"&#92;u0F56&#92;u0F39" ("va"), and "&#92;u0F5F&#92;u0F39" ("Dza") are "fa"), "&#92;u0F56&#92;u0F39" ("va"), and
two-codepoint sequences, but you should be aware of them "&#92;u0F5F&#92;u0F39" ("Dza") are two-codepoint sequences,
also. */ but you should be aware of them also. */
public static boolean isPreferredFormOfConsonant(char x) { public static boolean isPreferredFormOfConsonant(char x) {
return ((x != '\u0F48' /* reserved in Unicode 3.2, but not in use */) return ((x != '\u0F48' /* reserved in Unicode 3.2, but not in use */)
&& (x >= '\u0F40' && x <= '\u0F68') && (x >= '\u0F40' && x <= '\u0F68')
@ -97,7 +97,7 @@ public class UnicodeUtils implements UnicodeConstants {
Unicode codepoints, into either Normalization Form KD (NFKD), Unicode codepoints, into either Normalization Form KD (NFKD),
D (NFD), or THDL (NFTHDL), depending on the value of normForm. D (NFD), or THDL (NFTHDL), depending on the value of normForm.
NFD and NFKD are specified by Unicode 3.2; NFTHDL is needed NFD and NFKD are specified by Unicode 3.2; NFTHDL is needed
for {@link org.thdl.tib.text.tshegbar#UnicodeGraphemeCluster} for {@link org.thdl.tib.text.tshegbar.UnicodeGraphemeCluster}
because NFKD normalizes <code>U+0F0C</code> and neither NFD because NFKD normalizes <code>U+0F0C</code> and neither NFD
nor NFKD breaks down <code>U+0F00</code> into its constituent nor NFKD breaks down <code>U+0F00</code> into its constituent
codepoints. NFTHDL uses a maximum of codepoints, and it never codepoints. NFTHDL uses a maximum of codepoints, and it never
@ -247,7 +247,7 @@ public class UnicodeUtils implements UnicodeConstants {
/** Returns true iff ch corresponds to the Tibetan letter wa. /** Returns true iff ch corresponds to the Tibetan letter wa.
Several Unicode codepoints correspond to the Tibetan letter Several Unicode codepoints correspond to the Tibetan letter
wa. Oftentimes, <code>&#92;u0F5D</code> is thought of as the wa. Oftentimes, <code>U+0F5D</code> is thought of as the
nominal representation. */ nominal representation. */
public static boolean isWa(char ch) { public static boolean isWa(char ch) {
return ('\u0F5D' == ch return ('\u0F5D' == ch
@ -257,7 +257,7 @@ public class UnicodeUtils implements UnicodeConstants {
/** Returns true iff ch corresponds to the Tibetan letter ya. /** Returns true iff ch corresponds to the Tibetan letter ya.
Several Unicode codepoints correspond to the Tibetan letter Several Unicode codepoints correspond to the Tibetan letter
ya. Oftentimes, <code>&#92;u0F61</code> is thought of as the ya. Oftentimes, <code>U+0F61</code> is thought of as the
nominal representation. */ nominal representation. */
public static boolean isYa(char ch) { public static boolean isYa(char ch) {
return ('\u0F61' == ch return ('\u0F61' == ch
@ -267,7 +267,7 @@ public class UnicodeUtils implements UnicodeConstants {
/** Returns true iff there exists at least one codepoint cp in /** Returns true iff there exists at least one codepoint cp in
unicodeString such that cp {@link #isRa(char) is ra} or contains unicodeString such that cp {@link #isRa(char) is ra} or contains
ra (like <code>&#92;u0F77</code>). This method is not implemented ra (like <code>U+0F77</code>). This method is not implemented
as fast as it could be. It calls on the canonicalization code as fast as it could be. It calls on the canonicalization code
in order to maximize reuse and minimize the possibility of in order to maximize reuse and minimize the possibility of
coder error. */ coder error. */
@ -298,6 +298,9 @@ public class UnicodeUtils implements UnicodeConstants {
return "\\u" + Integer.toHexString((int)cp); return "\\u" + Integer.toHexString((int)cp);
} }
/**
* Returns a human-readable, ASCII form of the String s of Unicode
* codepoints. */
public static String unicodeStringToString(String s) { public static String unicodeStringToString(String s) {
StringBuffer sb = new StringBuffer(s.length() * 6); StringBuffer sb = new StringBuffer(s.length() * 6);
for (int i = 0; i < s.length(); i++) { for (int i = 0; i < s.length(); i++) {

View file

@ -40,10 +40,13 @@ public class UnicodeUtilsTest extends TestCase implements UnicodeConstants {
} }
/** Tests Unicode Normalization form KD for Tibetan codepoints. /** Tests Unicode Normalization form KD for Tibetan codepoints.
See Unicode, Inc.'s NormalizationTest-3.2.0.txt. This * See Unicode, Inc.'s NormalizationTest-3.2.0.txt. This
contains all test cases for * contains all test cases for
<code>U+0F00</code>-<code>U+0FFF</code> there, and a few * <code>U+0F00</code>-<code>U+0FFF</code> there, and a few more.
more. */ * Tests both {@link
* UnicodeUtils#toMostlyDecomposedUnicode(String, byte)} and
* {@link UnicodeUtils#toMostlyDecomposedUnicode(StringBuffer,
* byte)}.*/
public void testMostlyNFKD() { public void testMostlyNFKD() {
assertTrue(UnicodeUtils.toMostlyDecomposedUnicode("\u0F0B", NORM_NFKD).equals("\u0F0B")); assertTrue(UnicodeUtils.toMostlyDecomposedUnicode("\u0F0B", NORM_NFKD).equals("\u0F0B"));
assertTrue(UnicodeUtils.toMostlyDecomposedUnicode("\u0F40", NORM_NFKD).equals("\u0F40")); assertTrue(UnicodeUtils.toMostlyDecomposedUnicode("\u0F40", NORM_NFKD).equals("\u0F40"));
@ -112,10 +115,13 @@ public class UnicodeUtilsTest extends TestCase implements UnicodeConstants {
} }
/** Tests Unicode Normalization form D for Tibetan codepoints. /** Tests Unicode Normalization form D for Tibetan codepoints.
See Unicode, Inc.'s NormalizationTest-3.2.0.txt. This * See Unicode, Inc.'s NormalizationTest-3.2.0.txt. This
contains all test cases for * contains all test cases for
<code>U+0F00</code>-<code>U+0FFF</code> there, and a few * <code>U+0F00</code>-<code>U+0FFF</code> there, and a few more.
more. */ * Tests both {@link
* UnicodeUtils#toMostlyDecomposedUnicode(String, byte)} and
* {@link UnicodeUtils#toMostlyDecomposedUnicode(StringBuffer,
* byte)}.*/
public void testMostlyNFD() { public void testMostlyNFD() {
assertTrue(UnicodeUtils.toMostlyDecomposedUnicode("\u0F0B", NORM_NFD).equals("\u0F0B")); assertTrue(UnicodeUtils.toMostlyDecomposedUnicode("\u0F0B", NORM_NFD).equals("\u0F0B"));
assertTrue(UnicodeUtils.toMostlyDecomposedUnicode("\u0F40", NORM_NFD).equals("\u0F40")); assertTrue(UnicodeUtils.toMostlyDecomposedUnicode("\u0F40", NORM_NFD).equals("\u0F40"));
@ -184,10 +190,13 @@ public class UnicodeUtilsTest extends TestCase implements UnicodeConstants {
} }
/** Tests Unicode Normalization form THDL for Tibetan codepoints. /** Tests Unicode Normalization form THDL for Tibetan codepoints.
See Unicode, Inc.'s NormalizationTest-3.2.0.txt. This * See Unicode, Inc.'s NormalizationTest-3.2.0.txt. This
contains all test cases for * contains all test cases for
<code>U+0F00</code>-<code>U+0FFF</code> there, and a few * <code>U+0F00</code>-<code>U+0FFF</code> there, and a few more.
more. */ * Tests both {@link
* UnicodeUtils#toMostlyDecomposedUnicode(String, byte)} and
* {@link UnicodeUtils#toMostlyDecomposedUnicode(StringBuffer,
* byte)}. */
public void testMostlyNFTHDL() { public void testMostlyNFTHDL() {
assertTrue(UnicodeUtils.toMostlyDecomposedUnicode("\u0F0B", NORM_NFTHDL).equals("\u0F0B")); assertTrue(UnicodeUtils.toMostlyDecomposedUnicode("\u0F0B", NORM_NFTHDL).equals("\u0F0B"));
assertTrue(UnicodeUtils.toMostlyDecomposedUnicode("\u0F40", NORM_NFTHDL).equals("\u0F40")); assertTrue(UnicodeUtils.toMostlyDecomposedUnicode("\u0F40", NORM_NFTHDL).equals("\u0F40"));
@ -253,10 +262,36 @@ public class UnicodeUtilsTest extends TestCase implements UnicodeConstants {
assertTrue(UnicodeUtils.toMostlyDecomposedUnicode("\u0F79", NORM_NFTHDL).equals("\u0FB3\u0F71\u0F80")); assertTrue(UnicodeUtils.toMostlyDecomposedUnicode("\u0F79", NORM_NFTHDL).equals("\u0FB3\u0F71\u0F80"));
assertTrue(UnicodeUtils.toMostlyDecomposedUnicode("\u0FB3\u0F81", NORM_NFTHDL).equals("\u0FB3\u0F71\u0F80")); assertTrue(UnicodeUtils.toMostlyDecomposedUnicode("\u0FB3\u0F81", NORM_NFTHDL).equals("\u0FB3\u0F71\u0F80"));
assertTrue(UnicodeUtils.toMostlyDecomposedUnicode("\u0FB3\u0F71\u0F80", NORM_NFTHDL).equals("\u0FB3\u0F71\u0F80")); assertTrue(UnicodeUtils.toMostlyDecomposedUnicode("\u0FB3\u0F71\u0F80", NORM_NFTHDL).equals("\u0FB3\u0F71\u0F80"));
assertTrue(UnicodeUtils.toMostlyDecomposedUnicode("", NORM_NFTHDL).equals(""));
{
StringBuffer sb = new StringBuffer("\u0FAC");
UnicodeUtils.toMostlyDecomposedUnicode(sb, NORM_NFTHDL);
assertTrue(sb.toString().equals("\u0FAB\u0FB7"));
}
{
StringBuffer sb = new StringBuffer("\u0F66");
UnicodeUtils.toMostlyDecomposedUnicode(sb, NORM_NFTHDL);
assertTrue(sb.toString().equals("\u0F66"));
}
{
StringBuffer sb = new StringBuffer("");
UnicodeUtils.toMostlyDecomposedUnicode(sb, NORM_NFTHDL);
assertTrue(sb.toString().equals(""));
}
} }
/** Tests the containsRa method. */ /** Tests the containsRa method. */
public void testContainsRa() { public void testContainsRa() {
assertTrue(!UnicodeUtils.containsRa('\u0F69'));
assertTrue(!UnicodeUtils.containsRa('\u0FB1'));
assertTrue(!UnicodeUtils.containsRa('\u0F48'));
assertTrue(!UnicodeUtils.containsRa('\u0060'));
assertTrue(!UnicodeUtils.containsRa('\uFFFF'));
assertTrue(!UnicodeUtils.containsRa('\uFFFF'));
assertTrue(UnicodeUtils.containsRa('\u0FB2')); assertTrue(UnicodeUtils.containsRa('\u0FB2'));
assertTrue(UnicodeUtils.containsRa('\u0F77')); assertTrue(UnicodeUtils.containsRa('\u0F77'));
assertTrue(UnicodeUtils.containsRa('\u0F76')); assertTrue(UnicodeUtils.containsRa('\u0F76'));
@ -264,4 +299,84 @@ public class UnicodeUtilsTest extends TestCase implements UnicodeConstants {
assertTrue(UnicodeUtils.containsRa('\u0F62')); assertTrue(UnicodeUtils.containsRa('\u0F62'));
assertTrue(UnicodeUtils.containsRa('\u0FBC')); assertTrue(UnicodeUtils.containsRa('\u0FBC'));
} }
/**
 * Exercises {@link UnicodeUtils#unicodeStringToString(String)}.
 * Every input string must be rendered as one backslash-u escape
 * per char, with four lowercase hexadecimal digits each. */
public void testUnicodeStringToString() {
    // Each row is { input, expected rendering }.
    final String[][] cases = {
        { "\u0000", "\\u0000" },
        { "\u0001", "\\u0001" },
        { "\u000F", "\\u000f" },
        { "\u001F", "\\u001f" },
        { "\u00fF", "\\u00ff" },
        { "\u01fF", "\\u01ff" },
        { "\u0ffF", "\\u0fff" },
        { "\u1ffF", "\\u1fff" },
        { "\ufffF", "\\uffff" },
        // A multi-char input: the escapes are simply concatenated.
        { "\u0F00\u0091\uABCD\u0FFF\u0Ff1\uFFFF\u0000",
          "\\u0f00\\u0091\\uabcd\\u0fff\\u0ff1\\uffff\\u0000" }
    };
    for (int i = 0; i < cases.length; i++) {
        String rendered = UnicodeUtils.unicodeStringToString(cases[i][0]);
        assertTrue(rendered.equals(cases[i][1]));
    }
}
/**
 * Exercises {@link UnicodeUtils#unicodeCodepointToString(char)}.
 * Each char must be rendered as a single backslash-u escape with
 * four lowercase hexadecimal digits. */
public void testUnicodeCodepointToString() {
    // inputs[i] must be rendered as expected[i].
    final char[] inputs = {
        '\u0000', '\u0001', '\u000F', '\u001F', '\u00fF',
        '\u01fF', '\u0ffF', '\u1ffF', '\ufffF'
    };
    final String[] expected = {
        "\\u0000", "\\u0001", "\\u000f", "\\u001f", "\\u00ff",
        "\\u01ff", "\\u0fff", "\\u1fff", "\\uffff"
    };
    for (int i = 0; i < inputs.length; i++) {
        String rendered = UnicodeUtils.unicodeCodepointToString(inputs[i]);
        assertTrue(rendered.equals(expected[i]));
    }
}
/**
 * Exercises {@link UnicodeUtils#isEntirelyTibetanUnicode(String)}
 * with one string that must be accepted and one that must be
 * rejected. */
public void testIsEntirelyTibetanUnicode() {
    // U+0F48 is reserved, but it is in the range, so this string
    // must be accepted.
    String allInRange = "\u0F00\u0FFF\u0F00\u0F1e\u0F48";
    // The same codepoints plus U+1000, which is the only one here
    // outside U+0F00-U+0FFF; this string must be rejected.
    String oneOutOfRange = "\u0F00\u1000\u0FFF\u0F00\u0F1e\u0F48";

    assertTrue(UnicodeUtils.isEntirelyTibetanUnicode(allInRange));
    assertTrue(!UnicodeUtils.isEntirelyTibetanUnicode(oneOutOfRange));
}
/**
 * Exercises {@link UnicodeUtils#isTibetanConsonant(char)} against
 * three tables of codepoints: ones that must be rejected, ones
 * that must be accepted, and reserved ones that must be
 * rejected. */
public void testIsTibetanConsonant() {
    // Codepoints that must not be reported as consonants.
    final char[] nonConsonants = {
        '\u0000', '\uF000', '\u0EFF', '\u1000', '\u0F00', '\u0FFF'
    };
    // Codepoints that must be reported as consonants.
    final char[] consonants = {
        '\u0FB2', '\u0F6A', '\u0F40', '\u0F50', '\u0FBC', '\u0FB9',
        '\u0FB0', '\u0FAD', '\u0FA6', '\u0F90', '\u0F91'
    };
    // Reserved codepoints; these must not be reported as consonants.
    final char[] reserved = { '\u0F48', '\u0F98' };

    for (int i = 0; i < nonConsonants.length; i++) {
        assertTrue(!UnicodeUtils.isTibetanConsonant(nonConsonants[i]));
    }
    for (int i = 0; i < consonants.length; i++) {
        assertTrue(UnicodeUtils.isTibetanConsonant(consonants[i]));
    }
    for (int i = 0; i < reserved.length; i++) {
        assertTrue(!UnicodeUtils.isTibetanConsonant(reserved[i]));
    }
}
/**
 * Exercises {@link UnicodeUtils#isInTibetanRange(char)} with
 * codepoints that must be rejected followed by codepoints that
 * must be accepted. */
public void testIsInTibetanRange() {
    // Must be reported as outside the range.
    final char[] outside = { '\u0000', '\u0100', '\u1000' };
    // Must be reported as inside the range.
    final char[] inside = { '\u0F00', '\u0FF0', '\u0FFF' };

    for (int i = 0; i < outside.length; i++) {
        assertTrue(!UnicodeUtils.isInTibetanRange(outside[i]));
    }
    for (int i = 0; i < inside.length; i++) {
        assertTrue(UnicodeUtils.isInTibetanRange(inside[i]));
    }
}
} }