diff --git a/source/org/thdl/tib/text/THDLWylieConstants.java b/source/org/thdl/tib/text/THDLWylieConstants.java new file mode 100644 index 0000000..29a6a52 --- /dev/null +++ b/source/org/thdl/tib/text/THDLWylieConstants.java @@ -0,0 +1,117 @@ +/* +The contents of this file are subject to the THDL Open Community License +Version 1.0 (the "License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License on the THDL web site +(http://www.thdl.org/). + +Software distributed under the License is distributed on an "AS IS" basis, +WITHOUT WARRANTY OF ANY KIND, either express or implied. See the +License for the specific terms governing rights and limitations under the +License. + +The Initial Developer of this software is the Tibetan and Himalayan Digital +Library (THDL). Portions created by the THDL are Copyright 2001-2003 THDL. +All Rights Reserved. + +Contributor(s): ______________________________________. +*/ + +package org.thdl.tib.text; + +/** This is where basic, static knowledge of THDL's Extended Wylie is housed. 
+ * @see org.thdl.tib.text#TibetanMachineWeb */ +public interface THDLWylieConstants { +/** +* the Wylie for bindu/anusvara +*/ + public static final char BINDU = 'M'; +/** +* the Wylie for tsheg +*/ + public static final char TSHEG = ' '; //this character occurs in all ten TMW fonts +/** +* the Wylie for whitespace +*/ + public static final char SPACE = '_'; //this character occurs in all ten TMW fonts +/** +* the Sanskrit stacking separator used in Extended Wylie +*/ + public static final char WYLIE_SANSKRIT_STACKING_KEY = '+'; +/** +* the Wylie disambiguating key, as a char +*/ + public static final char WYLIE_DISAMBIGUATING_KEY = '.'; +/** +* the Wylie for the invisible 'a' vowel +*/ + public static final String WYLIE_aVOWEL = "a"; +/** +* the Wylie for achung +*/ + public static final char ACHUNG_character = '\''; +/** +* the Wylie for achung +*/ + public static final String ACHUNG + = new String(new char[] { ACHUNG_character }); +/** +* the Wylie for the 28th of the 30 consonants, sa: +*/ + public static final String SA = "s"; +/** +* the Wylie for the 16th of the 30 consonants, ma: +*/ + public static final String MA = "m"; +/** +* the Wylie for the 4th of the 30 consonants, nga: +*/ + public static final String NGA = "ng"; +/** +* the Wylie for achen +*/ + public static final String ACHEN = "a"; +/** +* the Wylie for gigu +*/ + public static final String i_VOWEL = "i"; +/** +* the Wylie for zhebju +*/ + public static final String u_VOWEL = "u"; +/** +* the Wylie for drengbu +*/ + public static final String e_VOWEL = "e"; +/** +* the Wylie for naro +*/ + public static final String o_VOWEL = "o"; +/** +* the Wylie for double drengbu +*/ + public static final String ai_VOWEL = "ai"; +/** +* the Wylie for double naro +*/ + public static final String au_VOWEL = "au"; +/** +* the Wylie for the subscript achung vowel +*/ + public static final String A_VOWEL = "A"; +/** +* the Wylie for log yig gigu +*/ + public static final String reverse_i_VOWEL = "-i"; +/** +* 
the Wylie for the vowel achung + gigu +*/ + public static final String I_VOWEL = "I"; +/** +* the Wylie for the vowel achung + zhebju +*/ + public static final String U_VOWEL = "U"; +/** +* the Wylie for the vowel achung + log yig gigu +*/ + public static final String reverse_I_VOWEL = "-I"; +} diff --git a/source/org/thdl/tib/text/TibTextUtils.java b/source/org/thdl/tib/text/TibTextUtils.java index e52b4c1..f3dfd0c 100644 --- a/source/org/thdl/tib/text/TibTextUtils.java +++ b/source/org/thdl/tib/text/TibTextUtils.java @@ -28,7 +28,8 @@ import org.thdl.util.ThdlDebug; /** * Provides methods for converting back and forth between Extended -* Wylie and TibetanMachineWeb. This class is not instantiable. +* Wylie and Tibetan represented in TibetanMachineWeb glyphs. This +* class is not instantiable. * *
* The class provides a variety of static methods for converting
@@ -37,7 +38,7 @@ import org.thdl.util.ThdlDebug;
* be exported as Rich Text Format.
*
* @author Edward Garrett, Tibetan and Himalayan Digital Library */
-public class TibTextUtils {
+public class TibTextUtils implements THDLWylieConstants {
/** Do not use this contructor. */
private TibTextUtils() { super(); }
@@ -255,11 +256,11 @@ public class TibTextUtils {
if (k < 32) //return null if character is just formatting
return String.valueOf(c);
- if (c == TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY)
- return String.valueOf(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY);
+ if (c == WYLIE_DISAMBIGUATING_KEY)
+ return String.valueOf(WYLIE_DISAMBIGUATING_KEY);
- if (c == TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY)
- return String.valueOf(TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY);
+ if (c == WYLIE_SANSKRIT_STACKING_KEY)
+ return String.valueOf(WYLIE_SANSKRIT_STACKING_KEY);
for (i=offset+1; i A LegalTshegBar is a simple Tibetan syllable or a syllable with
@@ -29,7 +29,7 @@ import org.thdl.util.ThdlDebug;
*
*
*
"\u0F60\u0F72"
)."\u0F60\u0F72"
) (DLC FIXME: 'o and
- * 'am maybe? I asked in the "Embarrasing error in wylie conversion"
- * bug report.).
"\u0F60\u0F72"
- * containing two codepoints in the special case that the suffix
- * is that connective case marker {@link
- * #getConnectiveCaseSuffix()}. */
+ * one consonant or a string like "\u0F60\u0F72"
+ * in the case that the suffix
+ * is 'i, 'u'i'o, 'am, 'ang, etc. */
public String getSuffix() {
return suffix;
}
/** Returns true iff there is a suffixed consonant or a suffixed
- * <code>'i</code> (DLC FIXME). */
+ * string built from one or more of 'i, 'u, 'o, 'am, and 'ang. */
public boolean hasSuffix() {
return (null != suffix);
}
/** Returns true iff there is a single, suffixed consonant. This
- means that suffixes like <code>'am</code>, <code>'i</code>,
- <code>'u</code>, and <code>'o</code> are not present, but this
- does not rule out the presence of a postsuffix. */
+ means that suffixes made from <code>'am</code>,
+ <code>'ang</code>, <code>'i</code>, <code>'u</code>, and
+ <code>'o</code> are not present, but this does not rule out
+ the presence of a postsuffix. */
public boolean hasSimpleSuffix() {
return ((null != suffix) && (1 == suffix.length()));
}
@@ -280,12 +277,6 @@ public class LegalTshegBar
return (EW_ABSENT != postsuffix);
}
- /** Returns true iff this syllable has a 'i
- * suffix. */
- public boolean hasConnectiveCaseMarkerSuffix() {
- return getSuffix().equals(getConnectiveCaseSuffix());
- }
-
/** Returns the root consonant. */
public char getRootLetter() {
return rootLetter;
@@ -324,7 +315,7 @@ public class LegalTshegBar
private final static String possibleSuffixes
= new String(new char[] {
- EWC_ga, EWC_nga, EWC_da, EWC_na, EWC_ba, EWC_ma, EWC_achen,
+ EWC_ga, EWC_nga, EWC_da, EWC_na, EWC_ba, EWC_ma, EWC_achung,
EWC_ra, EWC_la, EWC_sa
});
@@ -340,18 +331,6 @@ public class LegalTshegBar
// EWSUB_ra_btags.
}
- private final static String connectiveCaseSuffix
- = new String(new char[] {
- EWC_achen, EWV_i
- });
-
- /** Returns a two-codepoint string consisting of the Unicode
- * representation of what THDL Extended Wylie calls
- * <code>'i</code>. */
- public static String getConnectiveCaseSuffix() {
- return connectiveCaseSuffix;
- }
-
private final static String thirtyConsonants
= new String(new char[] {
EWC_ga, EWC_kha, EWC_ga, EWC_nga,
@@ -359,7 +338,7 @@ public class LegalTshegBar
EWC_ta, EWC_tha, EWC_da, EWC_na,
EWC_pa, EWC_pha, EWC_ba, EWC_ma,
EWC_tsa, EWC_tsha, EWC_dza, EWC_wa,
- EWC_zha, EWC_za, EWC_achen, EWC_ya,
+ EWC_zha, EWC_za, EWC_achung, EWC_ya,
EWC_ra, EWC_la, EWC_sha, EWC_sa,
EWC_ha, EWC_a
});
@@ -388,10 +367,10 @@ public class LegalTshegBar
This is not very efficient.
*/ public static String[] getPossibleSuffixParticles() { return new String[] { - new String(new char[] { EWC_achen, EWV_i }), - new String(new char[] { EWC_achen, EWV_o }), - new String(new char[] { EWC_achen, EWV_u }), - new String(new char[] { EWC_achen, EWC_ma }), + new String(new char[] { EWC_achung, EWV_i }), + new String(new char[] { EWC_achung, EWV_o }), + new String(new char[] { EWC_achung, EWV_u }), + new String(new char[] { EWC_achung, EWC_ma }), }; } @@ -402,7 +381,7 @@ public class LegalTshegBar * @see org.thdl.tib.text.tshegbar.UnicodeConstants */ public static String getTheFivePrefixes() { final String s = new String(new char[] { - EWC_ga, EWC_da, EWC_ba, EWC_ma, EWC_achen + EWC_ga, EWC_da, EWC_ba, EWC_ma, EWC_achung }); ThdlDebug.verify(s.length() == 5); // DLC put this into a JUnit test to avoid the slow-down. return s; @@ -416,27 +395,104 @@ public class LegalTshegBar /** Returns a String containing the nominal Unicode * representations of the ten suffixes. The suffixes are in - * dictionary order. - * @see #getConnectiveCaseSuffix() + * dictionary order. This doesn't include oddballs like suffixes + * based on 'i, 'u, 'o, 'am, and 'ang. * @see org.thdl.tib.text.tshegbar.UnicodeConstants */ public static String getTheTenSuffixes() { final String s = new String(new char[] { EWC_ga, EWC_nga, EWC_da, EWC_na, EWC_ba, - EWC_ma, EWC_achen, EWC_ra, EWC_la, EWC_sa + EWC_ma, EWC_achung, EWC_ra, EWC_la, EWC_sa }); - ThdlDebug.verify(s.length() == 10); // DLC put this into a JUnit test to avoid the slow-down. return s; } /** Returns true iff x is the preferred, nominal Unicode * representation of one of the ten suffixes. - * @see #getConnectiveCaseSuffix() */ public static boolean isNominalRepresentationOfSimpleSuffix(char x) { return (-1 != getTheTenSuffixes().indexOf(x)); } + /** Legal suffix-like particles, excluding the ten suffixes. 
If + * you add one, be sure that a tsheg-bar with it has the extended + * wylie you wish by adding the correct extended Wylie with it. */ + private static final String[][] oddball_suffixes = new String[][] { + { + // connective case marker: + new String( new char[] { + EWC_achung, EWV_i + }), + THDLWylieConstants.ACHUNG + THDLWylieConstants.i_VOWEL + }, + { + new String( new char[] { + EWC_achung, EWV_u + }), + THDLWylieConstants.ACHUNG + THDLWylieConstants.u_VOWEL + }, + { + // in at least one context, this shows end of sentence: + new String( new char[] { + EWC_achung, EWV_o + }), + THDLWylieConstants.ACHUNG + THDLWylieConstants.o_VOWEL + }, + { + // as in sgom pa'am: + new String( new char[] { + EWC_achung, EWC_ma + }), + THDLWylieConstants.ACHUNG + THDLWylieConstants.WYLIE_aVOWEL + + THDLWylieConstants.MA + }, + { + // meaning or, as opposed to and: + new String( new char[] { + EWC_achung, EWC_nga + }), + THDLWylieConstants.ACHUNG + THDLWylieConstants.WYLIE_aVOWEL + + THDLWylieConstants.NGA + } + }; + + /** Returns true iff suffix is 'i, 'o, 'u, 'am, 'ang, or a + * concatenation like 'u'i'o. Returns false otherwise (including + * the case that suffix is the empty string). */ + public static boolean isAchungBasedSuffix(String suffix) { + int i = 0; // so that the empty string causes false to be returned. 
+ while (i == 0 || !suffix.equals("")) { + boolean startsWithOneOfThem = false; + for (int x = 0; x < oddball_suffixes.length; x++) { + if (suffix.startsWith(oddball_suffixes[x][0])) { + startsWithOneOfThem = true; + suffix = suffix.substring(oddball_suffixes[x][0].length()); + break; + } + } + if (!startsWithOneOfThem) + return false; + ++i; + } + return true; + } + + private static String getTHDLWylieForOddballSuffix(String suffix) { + // FIXME: assert that isAchungBasedSuffix + StringBuffer wylie = new StringBuffer(); + while (!suffix.equals("")) { + for (int x = 0; x < oddball_suffixes.length; x++) { + if (suffix.startsWith(oddball_suffixes[x][0])) { + wylie.append(oddball_suffixes[x][1]); + suffix = suffix.substring(oddball_suffixes[x][0].length()); + break; + } + } + } + return wylie.toString(); + } + + /** Returns true iff the given (rootLetter, subjoinedLetter) combination can accept an additional wa-zur. Only g-r-w, d-r-w, and ph-y-w fall into this category according to @@ -595,8 +651,8 @@ public class LegalTshegBar * @param subjoinedLetter the optional, subscribed consonant * @param suffix the optional suffix, which is null, a String * consisting of a single consonant (i.e. a single, - * nondecomposable codepoint) except in the special case that - * this is {@link #getConnectiveCaseSuffix()} + * nondecomposable codepoint), or a string of 'i (U+0F, 'u, 'o, 'am, + * and 'ang. * @param postsuffix the optional postsuffix, which should be * EWC_sa or EWC_da * @param errorBuffer if non-null, and if the return code is @@ -763,13 +819,12 @@ public class LegalTshegBar } // subjoinedLetter tests // Suffix tests: - // DLC NOW -- allow 'o, 'u, 'am, etc. 
if (null != suffix) { - if (!getConnectiveCaseSuffix().equals(suffix)) { + if (!isAchungBasedSuffix(suffix)) { if (suffix.length() != 1) { return internalThrowThing(throwIfIllegal, errorBuf, - "Illegal suffix -- not one of the legal complex suffixes like 'u, 'o, 'i, 'am."); + "Illegal suffix -- not one of the legal complex suffixes like 'u, 'o, 'i, 'am, 'ang."); } if (!isNominalRepresentationOfSimpleSuffix(suffix.charAt(0))) { return internalThrowThing(throwIfIllegal, @@ -784,6 +839,10 @@ public class LegalTshegBar return internalThrowThing(throwIfIllegal, errorBuf, "You cannot have a postsuffix unless you also have a suffix."); + if (isAchungBasedSuffix(suffix)) + return internalThrowThing(throwIfIllegal, + errorBuf, + "You cannot have a postsuffix if you have a suffix based on 'i, 'o, 'u, 'am, and 'ang."); } if (EW_ABSENT != headLetter) { @@ -812,7 +871,9 @@ public class LegalTshegBar "The head letter sa cannot be used with that root letter."); } } else { - // '\u0F6A' is not a valid head letter, even for + // Illegal head letter. + // + // Note: U+0F6A is not a valid head letter, even for // "rnya". Use EWC_ra instead. return internalThrowThing(throwIfIllegal, errorBuf, @@ -827,14 +888,14 @@ public class LegalTshegBar && EWV_e != vowel && EWV_o != vowel) { - if (EWC_achen == vowel) + if (EWC_achung == vowel) return internalThrowThing(throwIfIllegal, errorBuf, - "The vowel given is not valid. Use EW_ABSENT for the EWC_achen sound."); + "The vowel given is not valid. Use EW_ABSENT for the EWC_achung sound."); if ('\u0F71' == vowel) return internalThrowThing(throwIfIllegal, errorBuf, - "a-chung cannot be used in a simple Tibetan syllable."); // DLC FIXME: what about pA? 
+ "a-chung can be used, but there is a flag for it; you don't call it the vowel."); return internalThrowThing(throwIfIllegal, errorBuf, "The vowel given is not valid."); @@ -848,9 +909,6 @@ public class LegalTshegBar /* - DLC add a method giving the correct connective case thingy or - throwing error if the 'i suffix already appears. - DLC put in a method that gets pronunciation using Unicode diacritical marks. And another using just US Roman. Note that pronunciation is contextual, so have these methods return all @@ -875,7 +933,7 @@ public class LegalTshegBar boolean disambiguatorNeeded = false; char prefix = getPrefix(); sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(prefix)); - if (!hasHeadLetter()) { + if (!hasHeadLetter() && !hasSubjoinedLetter()) { if (EWC_ya == rootLetter) { if (isConsonantThatTakesYaBtags(prefix)) disambiguatorNeeded = true; @@ -891,7 +949,7 @@ public class LegalTshegBar } } if (disambiguatorNeeded) - sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY); + sb.append(THDLWylieConstants.WYLIE_DISAMBIGUATING_KEY); } if (hasHeadLetter()) sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getHeadLetter())); @@ -914,14 +972,14 @@ public class LegalTshegBar // DLC FIXME: are these allowed in legal Tibetan? // EWTS would have special cases for them if so, - // I'd wager... - sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(EW_achung)); + // I'd wager, so I bet they're not. 
+ sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(EW_achung_vowel)); sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getVowel())); } else { ThdlDebug.abort("only simple vowels occur in this class, how did this get past internalLegalityTest(..)?"); } } else { - sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(EW_achung)); + sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(EW_achung_vowel)); } } else { if (hasExplicitVowel()) @@ -930,19 +988,34 @@ public class LegalTshegBar sb.append("a"); } + String suf = null; if (hasSuffix()) { - String suf = getSuffix(); - sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(suf.charAt(0))); + suf = getSuffix(); if (suf.length() > 1) { - // DLC assert, don't verify, that the length is two. - // This could change if I learn of more suffix - // particles. - ThdlDebug.verify(2 == suf.length()); - sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(suf.charAt(1))); + // pa'am, not pa'm or pa'ama! + sb.append(getTHDLWylieForOddballSuffix(suf)); + } else { + sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(suf.charAt(0))); } } - if (hasPostsuffix()) + if (hasPostsuffix()) { + // lar.d, la-ra-da, needs a disambiguator. EWC_sa doesn't + // take any head letters, but EWC_da does. + boolean disambiguatorNeeded = false; + if (getPostsuffix() == EWC_da) { + if (suf.length() == 1) { + char simpleSuffix = suf.charAt(0); + if (EWC_ra == simpleSuffix + || EWC_la == simpleSuffix + || EWC_sa == simpleSuffix) { + disambiguatorNeeded = true; + } + } + } + if (disambiguatorNeeded) + sb.append(THDLWylieConstants.WYLIE_DISAMBIGUATING_KEY); sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getPostsuffix())); + } return sb; } @@ -987,7 +1060,7 @@ public class LegalTshegBar ? "hasAChungOnRootLetter=\"true\"" : "") - // DLC NOW: what about the root letter a, i.e. \u0F68 ? 
do we want the EWTS to be 'aa' ? + // DLC NOW FIXME: what about the root letter a, i.e. \u0F68 ? do we want the EWTS to be 'aa' ? + ("vowel=\"" + (hasExplicitVowel() ? UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getVowel()) @@ -1019,7 +1092,8 @@ public class LegalTshegBar sb.append(getPrefix()); } if (hasHeadLetter()) { - // DLC FIXME this crap won't be true... + // DLC NOW FIXME this crap won't be true... it's what we must + // convert to, though. Do it. ThdlDebug.verify(UnicodeUtils.isNonSubjoinedConsonant(getPrefix())); ThdlDebug.verify(UnicodeUtils.isSubjoinedConsonant(getRootLetter())); sb.append(getHeadLetter()); @@ -1036,8 +1110,8 @@ public class LegalTshegBar sb.append(EWSUB_wa_zur); } if (hasAChungOnRootLetter()) { - ThdlDebug.verify('\u0F71' == EW_achung); - sb.append(EW_achung); + ThdlDebug.verify('\u0F71' == EW_achung_vowel); + sb.append(EW_achung_vowel); } if (hasExplicitVowel()) { sb.append(getVowel()); diff --git a/source/org/thdl/tib/text/tshegbar/LegalTshegBarTest.java b/source/org/thdl/tib/text/tshegbar/LegalTshegBarTest.java index 1749648..c747304 100644 --- a/source/org/thdl/tib/text/tshegbar/LegalTshegBarTest.java +++ b/source/org/thdl/tib/text/tshegbar/LegalTshegBarTest.java @@ -38,8 +38,64 @@ public class LegalTshegBarTest extends TestCase implements UnicodeConstants { junit.textui.TestRunner.run(LegalTshegBarTest.class); } + /** Tests the getThdlWylie() method to see if we + handle "le'u'i'o", "sgom pa'am", "sgom pa'ang", etc. 
+ */ + public void testGetThdlWylieForLongSuffixLikeThings() { + assertTrue(new LegalTshegBar(EW_ABSENT, EW_ABSENT, EWC_la, + EW_ABSENT, false, false, + new String(new char[] { + EWC_achung, EWV_u, + EWC_achung, EWV_i, + EWC_achung, EWV_o + }), + EW_ABSENT, EWV_e).getThdlWylie().toString().equals("le'u'i'o")); + assertTrue(new LegalTshegBar(EW_ABSENT, EW_ABSENT, EWC_la, + EW_ABSENT, false, false, + new String(new char[] { + EWC_achung, EWV_u, + EWC_achung, EWV_i, + EWC_achung, EWV_o, + EWC_achung, EWC_ma, + EWC_achung, EWC_nga, + EWC_achung, EWV_o, + EWC_achung, EWC_ma + }), + EW_ABSENT, EW_ABSENT).getThdlWylie().toString().equals("la'u'i'o'am'ang'o'am")); + assertTrue(new LegalTshegBar(EW_ABSENT, EW_ABSENT, EWC_pa, + EW_ABSENT, false, false, + new String(new char[] { EWC_achung, EWC_ma }), + EW_ABSENT, EW_ABSENT).getThdlWylie().toString().equals("pa'am")); + assertTrue(new LegalTshegBar(EW_ABSENT, EW_ABSENT, EWC_pa, + EW_ABSENT, false, false, + new String(new char[] { EWC_achung, EWC_nga }), + EW_ABSENT, EW_ABSENT).getThdlWylie().toString().equals("pa'ang")); + } + /** Tests the getThdlWylie() method and one of the constructors. */ public void testGetThdlWylie() { + // do we disambiguate when needed? 
+ { + assertTrue(new LegalTshegBar(EW_ABSENT, EW_ABSENT, EWC_ga, EWC_ya, + false, false, EW_ABSENT, EW_ABSENT, EWV_o).getThdlWylie().toString().equals("gyo")); + assertTrue(new LegalTshegBar(EWC_ga, EW_ABSENT, EWC_ya, EW_ABSENT, + false, false, EW_ABSENT, EW_ABSENT, EWV_o).getThdlWylie().toString().equals("g.yo")); + assertTrue(new LegalTshegBar(EWC_ba, EW_ABSENT, EWC_la, EW_ABSENT, + false, false, EWC_ga, EW_ABSENT, EW_ABSENT).getThdlWylie().toString().equals("b.lag")); + assertTrue(new LegalTshegBar(EWC_ba, EW_ABSENT, EWC_la, EW_ABSENT, + false, false, EWC_ga, EWC_sa, EW_ABSENT).getThdlWylie().toString().equals("b.lags")); + assertTrue(new LegalTshegBar(EWC_ba, EW_ABSENT, EWC_ra, EW_ABSENT, + false, false, EWC_ga, EWC_da, EW_ABSENT).getThdlWylie().toString().equals("b.ragd")); + assertTrue(new LegalTshegBar(EWC_ba, EW_ABSENT, EWC_ra, EWC_la, + false, false, EWC_ga, EWC_da, EW_ABSENT).getThdlWylie().toString().equals("brlagd")); + assertTrue(new LegalTshegBar(EWC_ba, EWC_ra, EWC_ga, EW_ABSENT, + false, false, EWC_ga, EWC_da, EW_ABSENT).getThdlWylie().toString().equals("brgagd")); + assertTrue(new LegalTshegBar(EWC_ba, EWC_la, EWC_ha, EW_ABSENT, + false, false, EWC_ga, EWC_da, EW_ABSENT).getThdlWylie().toString().equals("blhagd")); + assertTrue(new LegalTshegBar(EWC_ba, EWC_la, EWC_da, EW_ABSENT, + false, false, EWC_ga, EWC_da, EW_ABSENT).getThdlWylie().toString().equals("bldagd")); + } + assertTrue(new LegalTshegBar(EWC_ba, EWC_sa, EWC_ga, EWC_ra, false, true, EWC_la, EWC_sa, EWV_o).getThdlWylie().toString().equals("bsgrAols")); assertTrue(new LegalTshegBar(EWC_ba, EWC_sa, EWC_ga, @@ -81,6 +137,10 @@ public class LegalTshegBarTest extends TestCase implements UnicodeConstants { EWC_la, false, false, null, EW_ABSENT, EW_ABSENT).getThdlWylie().toString().equals("sla")); + assertTrue(new LegalTshegBar(EW_ABSENT, EW_ABSENT, EWC_pa, + EW_ABSENT, false, true, + null, EW_ABSENT, EW_ABSENT).getThdlWylie().toString().equals("pA")); + { boolean threw = false; try { @@ -159,4 
+219,64 @@ public class LegalTshegBarTest extends TestCase implements UnicodeConstants { } assertTrue(x); } + + /** Tests {@link + * org.thdl.tib.text.tshegbar.LegalTshegBar#getTheTenSuffixes()}. */ + public void testGetTheTenSuffixes() { + String x = LegalTshegBar.getTheTenSuffixes(); + assertTrue(x.length() == 10); + assertTrue(x.charAt(0) == EWC_ga); + assertTrue(x.charAt(4) == EWC_ba); + assertTrue(x.charAt(9) == EWC_sa); + } + + /** Tests {@link + * org.thdl.tib.text.tshegbar.LegalTshegBar#isAchungBasedSuffix(String)}. */ + public void testIsAchungBasedSuffix() { + assertTrue(LegalTshegBar.isAchungBasedSuffix(new String(new char[] { + EWC_achung, EWC_nga + }))); + assertTrue(LegalTshegBar.isAchungBasedSuffix(new String(new char[] { + EWC_achung, EWC_ma + }))); + assertTrue(LegalTshegBar.isAchungBasedSuffix(new String(new char[] { + EWC_achung, EWV_i + }))); + assertTrue(LegalTshegBar.isAchungBasedSuffix(new String(new char[] { + EWC_achung, EWV_o + }))); + assertTrue(LegalTshegBar.isAchungBasedSuffix(new String(new char[] { + EWC_achung, EWV_u + }))); + assertTrue(LegalTshegBar.isAchungBasedSuffix(new String(new char[] { + EWC_achung, EWV_u, + EWC_achung, EWV_i, + EWC_achung, EWV_o + }))); + assertTrue(!LegalTshegBar.isAchungBasedSuffix(new String(new char[] { + EWC_achung, EWV_u, + EWC_achung, EWV_i, + EWC_achung, EWV_o, /* no EWC_achung, */ EWC_nga + }))); + + // syntactically illegal, I'd bet, but our algorithm allows it: + assertTrue(LegalTshegBar.isAchungBasedSuffix(new String(new char[] { + EWC_achung, EWC_ma, + EWC_achung, EWV_i, + EWC_achung, EWV_i, + EWC_achung, EWV_i, + EWC_achung, EWV_o, + EWC_achung, EWC_nga, + EWC_achung, EWV_o + }))); + + assertTrue(!LegalTshegBar.isAchungBasedSuffix(new String(new char[] { + EWC_achung, EWC_la + }))); + assertTrue(!LegalTshegBar.isAchungBasedSuffix(new String(new char[] { + EWC_achung, EWV_e + }))); + + assertTrue(!LegalTshegBar.isAchungBasedSuffix("")); + } } diff --git 
a/source/org/thdl/tib/text/tshegbar/UnicodeConstants.java b/source/org/thdl/tib/text/tshegbar/UnicodeConstants.java index 8496989..119c160 100644 --- a/source/org/thdl/tib/text/tshegbar/UnicodeConstants.java +++ b/source/org/thdl/tib/text/tshegbar/UnicodeConstants.java @@ -83,20 +83,21 @@ public interface UnicodeConstants { static final char EWC_za = '\u0F5F'; /** Note the irregular name. The Extended Wylie representation is'a
. */
- static final char EWC_achen = '\u0F60'; /* DLC NOW is this achen or achung? achen is EWC_a, right? comment it. replace EWC_achen everywhere if you change it. */
+ static final char EWC_achung = '\u0F60';
static final char EWC_ya = '\u0F61';
static final char EWC_ra = '\u0F62';
static final char EWC_la = '\u0F63';
static final char EWC_sha = '\u0F64';
static final char EWC_sa = '\u0F66';
static final char EWC_ha = '\u0F67';
+ /** achen, the 30th consonant (and, some say, the fifth vowel) DLC NOW FIXME: rename to EWC_achen */
static final char EWC_a = '\u0F68';
/** In the word for father, "pA lags", there is an a-chung (i.e.,
<code>\u0F71</code>). This is the constant for that little
guy. */
- static final char EW_achung = '\u0F71';
+ static final char EW_achung_vowel = '\u0F71';
/* Four of the five vowels, some say, or, others say, "the four
diff --git a/source/org/thdl/tib/text/tshegbar/UnicodeGraphemeCluster.java b/source/org/thdl/tib/text/tshegbar/UnicodeGraphemeCluster.java
index 62fe9c4..e0c7103 100644
--- a/source/org/thdl/tib/text/tshegbar/UnicodeGraphemeCluster.java
+++ b/source/org/thdl/tib/text/tshegbar/UnicodeGraphemeCluster.java
@@ -127,11 +127,12 @@ public class UnicodeGraphemeCluster
/** Returns the THDL Extended Wylie transliteration of this
grapheme cluster, or null if there is none (which happens for
a few Tibetan codepoints, if you'll recall). If needsVowel is
- true, then an "a" will be appended when there is no EW_achung
- or explicit simple vowel. If there is an explicit vowel or
- EW_achung, it will always be present. Note that needsVowel is
- provided because btags is the preferred THDL Extended Wylie
- for the four contiguous grapheme clusters
+ true, then an "a" will be appended when there is no
+ EW_achung_vowel or explicit simple vowel. If there is an
+ explicit vowel or EW_achung_vowel, it will always be present.
+ Note that needsVowel is provided because btags is the
+ preferred THDL Extended Wylie for the four contiguous grapheme
+ clusters
<code>"\u0F56\u0F4F\u0F42\u0F66"</code>, and
needsVowel must be set to false for all but the grapheme
cluster corresponding to <code>\u0F4F</code> if you wish
@@ -257,7 +258,7 @@ public class UnicodeGraphemeCluster
/** Returns the height for the Tibetan Unicode codepoint x.
This relative height is 0 for a base consonant, digit,
punctuation, mark, or sign. It is -1 for a subjoined
- consonant, -2 for EWSUB_wa_zur, -3 for EW_achung, +1 for
+ consonant, -2 for EWSUB_wa_zur, -3 for EW_achung_vowel, +1 for
EWV_gigu, and so on according to the height these codepoints
appear relative to one another when on the same stack. If two
codepoints have equal height, they should not exist in the