0);
+
+ // A list of grapheme clusters (see UnicodeGraphemeCluster).
+ // sz is an overestimate (speeds us up, wastes some memory).
+ ArrayList gcs = new ArrayList(sz);
+
+ StringBuffer buildingUpGc = new StringBuffer();
+
+ boolean consonantal_with_vowel = false;
+ boolean buildingUpSanskrit = false;
+ for (int i = 0; i < sz; i++) {
+ DuffCode dc = (DuffCode)glyphList.get(i);
+ String wylie = TibetanMachineWeb.getWylieForGlyph(dc, noSuchWylie);
+ boolean containsWylieVowel = false;
+ boolean buildingUpSanskritNext = false;
+ if ((buildingUpSanskritNext
+ = TibetanMachineWeb.isWylieSanskritConsonantStack(wylie))
+ || TibetanMachineWeb.isWylieTibetanConsonantOrConsonantStack(wylie)) {
+ if (buildingUpGc.length() > 0) {
+ gcs.add(new TGCPair(buildingUpGc.toString(),
+ consonantal_with_vowel
+ ? (buildingUpSanskrit
+ ? TGCPair.SANSKRIT_WITH_VOWEL
+ : TGCPair.CONSONANTAL_WITH_VOWEL)
+ : (buildingUpSanskrit
+ ? TGCPair.SANSKRIT_WITHOUT_VOWEL
+ : TGCPair.CONSONANTAL_WITHOUT_VOWEL)));
+ buildingUpGc.delete(0, buildingUpGc.length());
+ }
+ buildingUpGc.append(wylie);
+ consonantal_with_vowel = false;
+ buildingUpSanskrit = buildingUpSanskritNext;
+ } else if ((containsWylieVowel
+ = TibetanMachineWeb.isWylieAdornmentAndContainsVowel(wylie))
+ || TibetanMachineWeb.isWylieAdornment(wylie)) {
+
+ if (buildingUpGc.length() > 0) {
+ buildingUpGc.append(wylie);
+ if (containsWylieVowel) {
+ if (debug)
+ System.out.println("DEBUG: with_vowel is true thanks to " + wylie);
+ consonantal_with_vowel = true;
+ }
+ // do not clear; we might have {cui} or {hUM}, e.g.
+ } else {
+ gcs.add(new TGCPair(wylie,
+ TGCPair.LONE_VOWEL));
+ consonantal_with_vowel = false;
+ }
+ } else {
+ // number or weird thing:
+
+ if (buildingUpGc.length() > 0) {
+ gcs.add(new TGCPair(buildingUpGc.toString(),
+ consonantal_with_vowel
+ ? (buildingUpSanskrit
+ ? TGCPair.SANSKRIT_WITH_VOWEL
+ : TGCPair.CONSONANTAL_WITH_VOWEL)
+ : (buildingUpSanskrit
+ ? TGCPair.SANSKRIT_WITHOUT_VOWEL
+ : TGCPair.CONSONANTAL_WITHOUT_VOWEL)));
+ buildingUpGc.delete(0, buildingUpGc.length());
+ }
+ gcs.add(new TGCPair(wylie, TGCPair.OTHER));
+ consonantal_with_vowel = false;
+ buildingUpSanskrit = false;
+ }
+ }
+ if (buildingUpGc.length() > 0) {
+ gcs.add(new TGCPair(buildingUpGc.toString(),
+ consonantal_with_vowel
+ ? (buildingUpSanskrit
+ ? TGCPair.SANSKRIT_WITH_VOWEL
+ : TGCPair.CONSONANTAL_WITH_VOWEL)
+ : (buildingUpSanskrit
+ ? TGCPair.SANSKRIT_WITHOUT_VOWEL
+ : TGCPair.CONSONANTAL_WITHOUT_VOWEL)));
+ }
+ buildingUpGc = null;
+ return gcs;
+ }
+
+
+ /** Classifies the tsheg bar made up of the grapheme clusters in
+     gcs (a list of TGCPair objects) by walking the clusters from
+     left to right and tracking which "legal tsheg bar" shape, if
+     any, they still match.
+
+     The result is always an interned String, so callers may compare
+     it by reference.  It is "number", "invalid", or a shape name
+     such as "root", "prefix/root", "prefix-root-suffix", or
+     "root-suffix-postsuffix", possibly prefixed with "appendaged-".
+     A bar that ends in a dangling achung is reported as "invalid".
+
+     Side effect: when a cluster completes an achung appendage (the
+     "maybe-appendaged-" state), WYLIE_aVOWEL is prepended to that
+     cluster's wylie.
+
+     @param gcs the grapheme clusters (TGCPair objects) of one tsheg
+     bar
+     @param warnings either null or a buffer to which a description
+     of the first offending cluster is appended
+     @return the interned classification, or null if gcs is empty */
+ private static String getClassificationOfTshegBar(ArrayList gcs,
+                                                   StringBuffer warnings) {
+     String candidateType = null;
+     // Now that we have grapheme clusters, see if they match any
+     // of the "legal tsheg bars":
+     int sz = gcs.size();
+     for (int i = 0; i < sz; i++) {
+         TGCPair tp = (TGCPair)gcs.get(i);
+         int cls = tp.classification;
+         String wylie = tp.wylie;
+         if (TGCPair.OTHER == cls) {
+             if (TibetanMachineWeb.isWylieNumber(wylie)) {
+                 if (null == candidateType) {
+                     candidateType = "number";
+                 } else {
+                     if ("number" != candidateType) {
+                         if (null != warnings)
+                             warnings.append("Found something odd; the wylie is " + wylie + "\n");
+                         candidateType = "invalid";
+                         break;
+                     }
+                 }
+             } else {
+                 if (null != warnings)
+                     warnings.append("Found something odd; the wylie is " + wylie + "\n");
+                 candidateType = "invalid";
+                 break;
+             }
+         } else if (TGCPair.SANSKRIT_WITHOUT_VOWEL == cls
+                    || TGCPair.SANSKRIT_WITH_VOWEL == cls) {
+             candidateType = "invalid";
+             // Stop here like every other "invalid" path does.  If
+             // we kept going, a Tibetan consonant cluster following
+             // this Sanskrit stack would fall through every state
+             // test below and hit the "missed a case" Error.
+             break;
+         } else if (TGCPair.CONSONANTAL_WITHOUT_VOWEL == cls
+                    || TGCPair.CONSONANTAL_WITH_VOWEL == cls) {
+             if (null == candidateType) {
+                 if (TibetanMachineWeb.isWylieLeft(wylie)) {
+                     candidateType = "prefix/root";
+                 } else {
+                     candidateType = "root";
+                 }
+             } else {
+                 if ("prefix/root" == candidateType) {
+                     if (ACHUNG.equals(wylie)) {
+                         // peek ahead to distinguish between ba's,
+                         // ba'ala and ba'am:
+                         if (isAppendageNonVowelWylie(wylieOfClusterAfter(gcs, i))) {
+                             candidateType = "maybe-appendaged-prefix/root";
+                         } else {
+                             candidateType = "prefix/root-root/suffix";
+                         }
+                     } else if (TibetanMachineWeb.isWylieRight(wylie)) {
+                         candidateType = "prefix/root-root/suffix";
+                     } else if (TibetanMachineWeb.isWylieAchungAppendage(wylie)) {
+                         candidateType = "appendaged-prefix/root";
+                     } else {
+                         candidateType = "prefix-root";
+                     }
+                 } else if ("root" == candidateType) {
+                     if (ACHUNG.equals(wylie)) {
+                         // peek ahead to distinguish between pa's,
+                         // pa'ala and pa'am:
+                         if (isAppendageNonVowelWylie(wylieOfClusterAfter(gcs, i))) {
+                             candidateType = "maybe-appendaged-root";
+                         } else {
+                             candidateType = "root-suffix";
+                         }
+                     } else if (TibetanMachineWeb.isWylieRight(wylie)) {
+                         candidateType = "root-suffix";
+                     } else if (TibetanMachineWeb.isWylieAchungAppendage(wylie)) {
+                         candidateType = "appendaged-root";
+                     } else {
+                         if (null != warnings)
+                             warnings.append("Found a non-prefix consonant or consonant stack followed by a consonant or consonant stack that is not simply a suffix; that thing's wylie is " + wylie + "\n");
+                         candidateType = "invalid";
+                         break;
+                     }
+                 } else if ("prefix-root" == candidateType) {
+                     if (ACHUNG.equals(wylie)) {
+                         // peek ahead to distinguish between bpa's,
+                         // bpa'ala and bpa'am:
+                         if (isAppendageNonVowelWylie(wylieOfClusterAfter(gcs, i))) {
+                             candidateType = "maybe-appendaged-prefix-root";
+                         } else {
+                             candidateType = "prefix-root-suffix";
+                         }
+                     } else if (TibetanMachineWeb.isWylieRight(wylie)) {
+                         candidateType = "prefix-root-suffix";
+                     } else if (TibetanMachineWeb.isWylieAchungAppendage(wylie)) {
+                         candidateType = "appendaged-prefix-root";
+                     } else {
+                         if (null != warnings)
+                             warnings.append("Found a prefix plus a root stack plus a non-suffix consonant or consonant stack whose wylie is " + wylie + "\n");
+                         candidateType = "invalid";
+                         break;
+                     }
+                 } else if ("prefix/root-root/suffix" == candidateType) {
+                     if (ACHUNG.equals(wylie)) {
+                         // peek ahead to distinguish between
+                         // gga'am and gaga'ala:
+                         if (isAppendageNonVowelWylie(wylieOfClusterAfter(gcs, i))) {
+                             candidateType = "maybe-appendaged-prefix/root-root/suffix";
+                         } else {
+                             candidateType = "prefix-root-suffix";
+                         }
+                     } else if (TibetanMachineWeb.isWylieFarRight(wylie)) {
+                         candidateType = "prefix/root-root/suffix-suffix/postsuffix";
+                     } else if (TibetanMachineWeb.isWylieRight(wylie)) {
+                         candidateType = "prefix-root-suffix";
+                     } else if (TibetanMachineWeb.isWylieAchungAppendage(wylie)) {
+                         candidateType = "appendaged-prefix/root-root/suffix";
+                     } else {
+                         if (null != warnings)
+                             warnings.append("Found a prefix/root stack plus a suffix/root stack plus a non-suffix, non-postsuffix consonant or consonant stack whose wylie is " + wylie + "\n");
+                         candidateType = "invalid";
+                         break;
+                     }
+                 } else if ("root-suffix" == candidateType) {
+                     // This has no peekahead w.r.t. 'am and 'ang,
+                     // but it needs none because we peeked to be
+                     // sure that this was root-suffix and not
+                     // maybe-appendaged-root.
+                     if (TibetanMachineWeb.isWylieFarRight(wylie)) {
+                         candidateType = "root-suffix-postsuffix";
+                     } else if (TibetanMachineWeb.isWylieAchungAppendage(wylie)) {
+                         candidateType = "appendaged-root-suffix";
+                     } else if (ACHUNG.equals(wylie)) {
+                         candidateType = "maybe-appendaged-root-suffix";
+                     } else {
+                         if (null != warnings)
+                             warnings.append("Found a root stack plus a suffix plus a non-postsuffix consonant or consonant stack whose wylie is " + wylie + "\n");
+                         candidateType = "invalid";
+                         break;
+                     }
+                 } else if ("prefix/root-root/suffix-suffix/postsuffix" == candidateType
+                            || "prefix-root-suffix" == candidateType) {
+                     // this has no peekahead and needs none.
+                     if (TibetanMachineWeb.isWylieFarRight(wylie)) {
+                         candidateType = "prefix-root-suffix-postsuffix";
+                     } else if (TibetanMachineWeb.isWylieAchungAppendage(wylie)) {
+                         // if we simply prepended to
+                         // candidateType, we wouldn't get interned
+                         // strings.
+                         candidateType = ("appendaged-" + candidateType).intern();
+                     } else if (ACHUNG.equals(wylie)) {
+                         candidateType = ("maybe-appendaged-" + candidateType).intern();
+                     } else {
+                         if (null != warnings)
+                             warnings.append("Found a prefix/root stack plus a suffix/root stack plus a suffix/postsuffix plus a non-postsuffix consonant or consonant stack whose wylie is " + wylie + "\n");
+                         candidateType = "invalid";
+                         break;
+                     }
+                 } else if ("prefix-root-suffix-postsuffix" == candidateType) {
+                     // this has no peekahead and needs none.
+                     if (TibetanMachineWeb.isWylieAchungAppendage(wylie)) {
+                         candidateType = "appendaged-prefix-root-suffix-postsuffix";
+                     } else if (ACHUNG.equals(wylie)) {
+                         candidateType = "maybe-appendaged-prefix-root-suffix-postsuffix";
+                     } else {
+                         if (null != warnings)
+                             warnings.append("Found a prefix plus root stack plus suffix plus postsuffix; then found yet another consonant or consonant stack whose wylie is " + wylie + "\n");
+                         candidateType = "invalid";
+                         break;
+                     }
+                 } else if ("root-suffix-postsuffix" == candidateType) {
+                     // this has no peekahead and needs none.
+                     if (TibetanMachineWeb.isWylieAchungAppendage(wylie)) {
+                         candidateType = "appendaged-root-suffix-postsuffix";
+                     } else if (ACHUNG.equals(wylie)) {
+                         candidateType = "maybe-appendaged-root-suffix-postsuffix";
+                     } else {
+                         if (null != warnings)
+                             warnings.append("Found a root stack plus suffix plus postsuffix; then found yet another consonant or consonant stack whose wylie is " + wylie + "\n");
+                         candidateType = "invalid";
+                         break;
+                     }
+                 } else if (candidateType.startsWith("maybe-appendaged-")) {
+                     if (isAppendageNonVowelWylie(wylie)) {
+                         candidateType
+                             = candidateType.substring("maybe-".length()).intern();
+                         // So that we get 'am, not 'm; 'ang, not 'ng:
+                         tp.wylie = WYLIE_aVOWEL + tp.wylie;
+                     } else {
+                         if (null != warnings)
+                             warnings.append("Found a tsheg bar that has an achung (" + ACHUNG + ") tacked on, followed by some other thing whose wylie is " + wylie + "\n");
+                         candidateType = "invalid";
+                         break;
+                     }
+                 } else if (candidateType.startsWith("appendaged-")) {
+                     if (TibetanMachineWeb.isWylieAchungAppendage(wylie)) {
+                         // candidateType stays what it is.
+                     } else if (ACHUNG.equals(wylie)) {
+                         candidateType = ("maybe-" + candidateType).intern();
+                     } else {
+                         if (null != warnings)
+                             warnings.append("Found a tsheg bar that has a 'i, 'e, 'o, 'u, or 'ang 'am appendage already and then found yet another consonant or consonant stack whose wylie is " + wylie + "\n");
+                         candidateType = "invalid";
+                         break;
+                     }
+                 } else {
+                     // Only "number" can get here: every other state
+                     // is handled above, and "invalid" always
+                     // breaks out of the loop.
+                     if ("number" != candidateType)
+                         throw new Error("missed a case");
+                     if (null != warnings)
+                         warnings.append("Found a consonant or consonant stack after something odd; the consonantish thing has wylie " + wylie + "\n");
+                     candidateType = "invalid";
+                     break;
+                 }
+             }
+         } else if (TGCPair.LONE_VOWEL == cls) {
+             if (null != warnings)
+                 warnings.append("Found a vowel that did not follow either a Tibetan consonant or consonant stack or another vowel.");
+             candidateType = "invalid";
+             break;
+         } else {
+             throw new Error("bad cls");
+         }
+     }
+     // candidateType is still null when gcs is empty; do not
+     // dereference it in that case.
+     if (null != candidateType
+         && candidateType.startsWith("maybe-appendaged-")) {
+         if (null != warnings)
+             warnings.append("Found a tsheg bar that has an extra achung (" + ACHUNG + ") tacked on\n");
+         candidateType = "invalid";
+     }
+     return candidateType;
+ }
+
+ /** Returns the wylie of the grapheme cluster that follows index i
+     in gcs, or the empty string if cluster i is the last one (or
+     the following entry is null). */
+ private static String wylieOfClusterAfter(ArrayList gcs, int i) {
+     TGCPair next = (i + 1 < gcs.size()) ? (TGCPair)gcs.get(i + 1) : null;
+     return (next == null) ? "" : next.wylie;
+ }
+
+ /** Appends to wylieBuffer the wylie for the glyph list glyphList
+ (which should be an ArrayList for speed). This will be very
+ user-friendly for "legal tsheg bars" and will be valid, but
+ possibly ugly (interspersed with disambiguators or extra
+ vowels, etc.) Wylie for other things, such as Sanskrit
+ transliteration. Updates warnings and noSuchWylie like the
+ caller does.
+
+ What constitutes a legal, non-punctuation, non-whitespace
+ tsheg bar? The following are the only such:
+
+ - one or more numbers
+
+ - a single, possibly adorned consonant stack
+
+ - a legal "tyllable" appended with zero or more particles
+ from the set { 'i, 'o, 'u, 'e, 'ang, 'am }
+
+
+ A "tyllable" is, by definition, one of the following:
+
+
+ - a single, possibly adorned consonant stack
+
+ - two consonant stacks where one is a single,
+ unadorned consonant (and is a prefix if it is first and
+ a suffix if it is last) and the other is possibly
+ adorned
+
+ - three consonant stacks where at most one has adornment.
+ If the second has adornment, then the first must be an
+ unadorned prefix consonant and the last must be an
+ unadorned suffix consonant. If the first has adornment,
+ then the second must be an unadorned suffix consonant
+ and the third must be an unadorned secondary suffix
+ consonant.
+
+ - four consonant stacks where either none is adorned or
+ only the second consonant stack is adorned, the first is
+ an unadorned prefix consonant, the third is an unadorned
+ suffix consonant, and the fourth is an unadorned
+ secondary suffix consonant.
+
+
+
+ When there are three unadorned consonant stacks in a
+ tyllable, a hard-coded list of valid Tibetan tsheg bars is
+ relied upon to determine if the 'a' vowel comes after the
+ first or the second consonant.
+
+ @param glyphList the glyphs of one tsheg bar; handed to
+ breakTshegBarIntoGraphemeClusters to be split into grapheme
+ clusters
+ @param noSuchWylie handed to breakTshegBarIntoGraphemeClusters
+ along with glyphList
+ @param warnings either null or a buffer; handed to
+ getClassificationOfTshegBar, which appends warnings about
+ illegal tsheg bars to it
+ @param wylieBuffer the buffer to which the generated wylie is
+ appended
*/
+ private static void getTshegBarWylie(java.util.List glyphList,
+ boolean noSuchWylie[],
+ StringBuffer warnings,
+ StringBuffer wylieBuffer) {
+ ArrayList gcs
+ = breakTshegBarIntoGraphemeClusters(glyphList, noSuchWylie);
+ String candidateType = getClassificationOfTshegBar(gcs, warnings);
+ int sz = gcs.size();
+ if (candidateType == "invalid") { // reference comparison: relies on the classification being an interned String
+ // Forget beauty and succinctness -- just be sure to
+ // generate Wylie that can be converted unambiguously into
+ // Tibetan. Use a disambiguator or vowel after each
+ // grapheme cluster.
+ //
+ // If we truly didn't care about beauty, we'd just lump
+ // SANSKRIT_WITHOUT_VOWEL and SANSKRIT_WITH_VOWEL into
+ // OTHER.
+
+ for (int i = 0; i < sz; i++) {
+ TGCPair tp = (TGCPair)gcs.get(i);
+ int cls = tp.classification;
+ String wylie = tp.wylie;
+ wylieBuffer.append(wylie);
+ if (TibetanMachineWeb.isWylieTibetanConsonantOrConsonantStack(wylie)
+ || TibetanMachineWeb.isWylieSanskritConsonantStack(wylie)) {
+ wylieBuffer.append(aVowelToUseAfter(wylie));
+ } else {
+ if (TGCPair.CONSONANTAL_WITH_VOWEL != cls
+ && TGCPair.SANSKRIT_WITH_VOWEL != cls)
+ wylieBuffer.append(WYLIE_DISAMBIGUATING_KEY);
+ }
+ }
+ } else {
+ // Generate perfect, beautiful, Wylie, using the minimum
+ // number of vowels and disambiguators.
+
+ int leftover = sz + 1; // index of the first cluster not yet emitted; every branch below overwrites it or throws
+
+ // Appendaged vs. not appendaged? it affects nothing at
+ // this stage.
+ if (candidateType.startsWith("appendaged-")) {
+ candidateType
+ = candidateType.substring("appendaged-".length()).intern();
+ }
+
+ if ("prefix/root-root/suffix-suffix/postsuffix" == candidateType) {
+ /* Yes, this is ambiguous. How do we handle it? See
+ * this from Andres:
+ *
+ * I'm posting this upon David Chandler's
+ * request. According to Lobsang Thonden in Modern
+ * Tibetan Grammar Language (page 42), with regards to
+ * identifying the root letter in 3 lettered words
+ * there are only 23 ambiguous cases. He writes:
+ *
+ * If the last letter is 'sa' and the first two
+ * letters are affixes, then the SECOND ONE is the
+ * root letter in the following 9 WORDS ONLY:
+ *
+ * gdas gnas gsas dgas dmas bdas mdas 'gas 'das
+ *
+ * And the FIRST is the root letter in the following
+ * 14 WORDS ONLY:
+ *
+ * rags lags nags bags bangs gangs rangs langs nangs
+ * sangs babs rabs rams nams
+ *
+ * As I mentioned before, I think that the best
+ * solution for now is to hard-wire these cases. Even
+ * if the list is not exhaustive, at least we'll have
+ * most cases covered. */
+
+ leftover = 3;
+ /* FIXME: these constants are hard-wired here, rather
+ * than in TibetanMachineWeb, because I'm lazy. */
+ String wylie1 = ((TGCPair)gcs.get(0)).wylie;
+ String wylie2 = ((TGCPair)gcs.get(1)).wylie;
+ String wylie3 = ((TGCPair)gcs.get(2)).wylie;
+ if ((wylie1.equals("g") && (wylie2.equals("d") || wylie2.equals("n") || wylie2.equals("s")))
+ || (wylie1.equals("d") && (wylie2.equals("g") || wylie2.equals("m")))
+ || (wylie1.equals("b") && wylie2.equals("d"))
+ || (wylie1.equals("m") && wylie2.equals("d"))
+ || (wylie1.equals("'") && (wylie2.equals("g") || wylie2.equals("d")))) {
+ if (TibetanMachineWeb.isAmbiguousWylie(wylie1, wylie2))
+ wylieBuffer.append(wylie1 + WYLIE_DISAMBIGUATING_KEY + wylie2);
+ else
+ wylieBuffer.append(wylie1 + wylie2);
+
+ wylieBuffer.append(aVowelToUseAfter(wylie2)
+ + wylie3);
+ } else {
+ wylieBuffer.append(wylie1
+ + aVowelToUseAfter(wylie1)
+ + unambiguousPostAVowelWylie(wylie2,
+ wylie3));
+ }
+ } else if ("root" == candidateType
+ || "prefix/root-root/suffix" == candidateType
+ || "prefix/root" == candidateType
+ || "root-suffix-postsuffix" == candidateType
+ || "root-suffix" == candidateType) {
+ String wylie1 = ((TGCPair)gcs.get(0)).wylie;
+ leftover = 1;
+ wylieBuffer.append(wylie1);
+ if (((TGCPair)gcs.get(0)).classification
+ != TGCPair.CONSONANTAL_WITH_VOWEL) {
+ ThdlDebug.verify(TGCPair.CONSONANTAL_WITHOUT_VOWEL
+ == ((TGCPair)gcs.get(0)).classification);
+ wylieBuffer.append(aVowelToUseAfter(wylie1));
+ if (debug) System.out.println("DEBUG: appending vowel");
+ } else {
+ if (debug) System.out.println("DEBUG: already has vowel 2");
+ }
+ if ("root-suffix-postsuffix" == candidateType) {
+ leftover = 3;
+ String wylie2 = ((TGCPair)gcs.get(1)).wylie;
+ String wylie3 = ((TGCPair)gcs.get(2)).wylie;
+ wylieBuffer.append(unambiguousPostAVowelWylie(wylie2,
+ wylie3));
+ }
+ } else if ("prefix-root-suffix" == candidateType
+ || "prefix-root" == candidateType
+ || "prefix-root-suffix-postsuffix" == candidateType) {
+ String wylie1 = ((TGCPair)gcs.get(0)).wylie;
+ String wylie2 = ((TGCPair)gcs.get(1)).wylie;
+ leftover = 2;
+ if (TibetanMachineWeb.isAmbiguousWylie(wylie1, wylie2))
+ wylieBuffer.append(wylie1 + WYLIE_DISAMBIGUATING_KEY + wylie2);
+ else
+ wylieBuffer.append(wylie1 + wylie2);
+
+ if (((TGCPair)gcs.get(1)).classification
+ != TGCPair.CONSONANTAL_WITH_VOWEL) {
+ ThdlDebug.verify(TGCPair.CONSONANTAL_WITHOUT_VOWEL
+ == ((TGCPair)gcs.get(1)).classification);
+ if (debug) System.out.println("DEBUG: appending vowel");
+ wylieBuffer.append(aVowelToUseAfter(wylie2));
+ } else {
+ if (debug) System.out.println("DEBUG: already has vowel 1");
+ }
+ if ("prefix-root-suffix-postsuffix" == candidateType) {
+ leftover = 4;
+ String wylie3 = ((TGCPair)gcs.get(2)).wylie;
+ String wylie4 = ((TGCPair)gcs.get(3)).wylie;
+ wylieBuffer.append(unambiguousPostAVowelWylie(wylie3,
+ wylie4));
+ }
+ } else if ("number" == candidateType) {
+ leftover = 0;
+ } else {
+ throw new Error("missed a case down here");
+ }
+
+ // append the wylie left over:
+ for (int i = leftover; i < sz; i++) {
+ TGCPair tp = (TGCPair)gcs.get(i);
+ String wylie = tp.wylie;
+ wylieBuffer.append(wylie);
+ }
+ }
+ }
+
+/**
+* Gets the Extended Wylie for a sequence of glyphs using Chandler's
+* experimental method. This works as follows:
+*
+* We run along until we hit whitespace or punctuation. We take
+* everything before that and we see if it's a legal Tibetan tsheg bar,
+* either a number or a word fragment. If it is, we insert only one
+* vowel in the correct place. If not, then we throw a disambiguating
+* key or a vowel after each stack.
+*
+* @param dcs an array of glyphs
+* @param noSuchWylie an array which will not be touched if this is
+* successful; however, if there is no THDL Extended Wylie
+* corresponding to these glyphs, then noSuchWylie[0] will be set to
+* true
+* @param warnings either null or a buffer to which will be appended
+* warnings about illegal tsheg bars
+* @return the Extended Wylie corresponding to these glyphs, or null if dcs is empty or no Wylie was generated */
+ public static String getWylieImplementation(DuffCode[] dcs,
+ boolean noSuchWylie[],
+ StringBuffer warnings) {
+ if (dcs.length == 0)
+ return null;
+
+ ArrayList glyphList = new ArrayList();
+ StringBuffer wylieBuffer = new StringBuffer();
+
+ for (int i=0; i 0 || !glyphList.isEmpty()) {
- String thisPart;
- if (needsVowel)
- thisPart = withA(glyphList, noSuchWylie);
- else
- thisPart = withoutA(glyphList, false, noSuchWylie);
- wylieBuffer.append(thisPart);
-
+ if (!glyphList.isEmpty()) {
+ getTshegBarWylie(glyphList, noSuchWylie,
+ warnings, wylieBuffer);
glyphList.clear();
- needsVowel = true;
- isLastVowel = false;
+ if (null != warnings)
+ warnings.append("Some glyphs came right before a newline; they did not have a tsheg or shad come first.");
}
wylieBuffer.append(ch);
} else {
- wylie = TibetanMachineWeb.getWylieForGlyph(dcs[i], noSuchWylie);
-
- boolean containsBindu = false;
- if (wylie.length() > 1 && wylie.charAt(wylie.length()-1) == BINDU) {
- char[] cArray = wylie.toCharArray();
- wylie = new String(cArray, 0, wylie.length()-1);
- containsBindu = true;
- }
-
- process_block: {
- if (TibetanMachineWeb.isWyliePunc(wylie)) {
- isLastVowel = false;
-
- if (glyphList.isEmpty()) {
- wylieBuffer.append(wylie);
- } else {
- String thisPart;
- if (needsVowel)
- thisPart = withA(glyphList, noSuchWylie);
- else
- thisPart = withoutA(glyphList, false, noSuchWylie);
- wylieBuffer.append(thisPart);
-
- wylieBuffer.append(wylie); //append the punctuation
-
- glyphList.clear();
- }
- needsVowel = true; //next consonants are syllable onset, so we are awaiting vowel
- } else if (TibetanMachineWeb.isWylieChar(wylie)) {
- //isChar must come before isVowel because ACHEN has priority over WYLIE_aVOWEL
- isLastVowel = false;
- glyphList.add(dcs[i]);
- } else if (TibetanMachineWeb.isWylieVowel(wylie)) {
- if (isLastVowel) {
- int len = wylieBuffer.length();
- int A_len = A_VOWEL.length();
-
- if (wylieBuffer.substring(len-A_len).equals(A_VOWEL)) {
- try {
- if (wylie.equals(i_VOWEL)) {
- wylieBuffer.delete(len-A_len, len);
- wylieBuffer.append(I_VOWEL);
- isLastVowel = false;
- break process_block;
- } else if (wylie.equals(reverse_i_VOWEL)) {
- wylieBuffer.delete(len-A_len, len);
- wylieBuffer.append(reverse_I_VOWEL);
- isLastVowel = false;
- break process_block;
- }
- }
- catch (StringIndexOutOfBoundsException se) {
- ThdlDebug.noteIffyCode();
- }
-
- wylieBuffer.append(wylie); //append current vowel
- isLastVowel = false;
- } else
- wylieBuffer.append(wylie); //append current vowel
- } else {
- int glyphCount = glyphList.size();
- boolean insertDisAmbig = false;
-
- if (0 != glyphCount) {
- DuffCode top_dc = (DuffCode)glyphList.get(glyphCount-1);
- String top_wylie = TibetanMachineWeb.getWylieForGlyph(top_dc, noSuchWylie);
-
- if (top_wylie.equals(ACHEN)) {
- glyphList.remove(glyphCount-1);
-
- if (glyphCount-1 == 0) {
- top_dc = null;
- } else {
- insertDisAmbig = true;
- top_dc = (DuffCode)glyphList.get(glyphCount-2);
- }
- }
-
- if (top_dc == null || !TibetanMachineWeb.getWylieForGlyph(top_dc, noSuchWylie).equals(ACHUNG)) {
- String thisPart = withoutA(glyphList, true, noSuchWylie);
- wylieBuffer.append(thisPart); //append consonants in glyphList
- } else {
- glyphCount = glyphList.size();
- glyphList.remove(glyphCount-1);
-
- if (glyphCount-1 != 0) {
- String thisPart = withA(glyphList, noSuchWylie);
- wylieBuffer.append(thisPart);
- }
-
- wylieBuffer.append(ACHUNG);
- }
- }
-
- if (insertDisAmbig)
- wylieBuffer.append(WYLIE_DISAMBIGUATING_KEY);
-
- wylieBuffer.append(wylie); //append vowel
-
- glyphList.clear();
- isLastVowel = true;
- needsVowel = false;
- }
- } else { //must be a stack
- isLastVowel = false;
- glyphList.add(dcs[i]);
+ String wylie = TibetanMachineWeb.getWylieForGlyph(dcs[i], noSuchWylie);
+ if (TibetanMachineWeb.isWyliePunc(wylie)
+ && !TibetanMachineWeb.isWylieAdornment(wylie)) {
+ if (!glyphList.isEmpty()) {
+ getTshegBarWylie(glyphList, noSuchWylie,
+ warnings, wylieBuffer);
+ glyphList.clear();
}
- }
-
- if (containsBindu) {
- isLastVowel = false;
- wylieBuffer.append(withoutA(glyphList, false, noSuchWylie));
- wylieBuffer.append(BINDU); //append the bindu
- glyphList.clear();
+ wylieBuffer.append(wylie); //append the punctuation
+ } else {
+ glyphList.add(dcs[i]);
}
}
}
- //replace TMW with Wylie
+ // replace remaining TMW with Wylie
if (!glyphList.isEmpty()) {
- String thisPart;
- if (needsVowel)
- thisPart = withA(glyphList, noSuchWylie);
- else
- thisPart = withoutA(glyphList, false, noSuchWylie);
- wylieBuffer.append(thisPart);
+ getTshegBarWylie(glyphList, noSuchWylie, warnings, wylieBuffer);
+ // glyphList.clear() if we weren't about to exit...
+ if (null != warnings)
+ warnings.append("The stretch of Tibetan ended without final punctuation.");
}
if (wylieBuffer.length() > 0)
return wylieBuffer.toString();
else
return null;
- }
+ }
+}
+
+/** An ordered pair consisting of a Tibetan grapheme cluster's
+    classification and its context-insensitive THDL Extended Wylie
+    representation.  See {@link
+    org.thdl.tib.text.tshegbar.UnicodeGraphemeCluster
+    UnicodeGraphemeCluster} for a definition of the term "grapheme
+    cluster".  The classification is one of the int constants
+    declared in this class. */
+class TGCPair {
+    static final int OTHER = 1;
+    // a standalone achen would fall into this category:
+    static final int CONSONANTAL_WITHOUT_VOWEL = 2;
+    static final int CONSONANTAL_WITH_VOWEL = 3;
+    static final int LONE_VOWEL = 4;
+    static final int SANSKRIT_WITHOUT_VOWEL = 5;
+    static final int SANSKRIT_WITH_VOWEL = 6;
+
+    /** The Extended Wylie of this grapheme cluster. */
+    String wylie;
+    /** One of the classification constants above. */
+    int classification;
+
+    /** Pairs the given wylie with the given classification. */
+    TGCPair(String wylie, int classification) {
+        this.wylie = wylie;
+        this.classification = classification;
+    }
+
+    /** Returns a debugging representation that shows both the wylie
+        and the classification; an empty string would make lists of
+        these pairs unreadable in debug output. */
+    public String toString() {
+        return "<TGCPair wylie=" + wylie
+            + " classification=" + classification + "/>";
+    }
}
diff --git a/source/org/thdl/tib/text/TibetanMachineWeb.java b/source/org/thdl/tib/text/TibetanMachineWeb.java
index 0914265..5cbf04b 100644
--- a/source/org/thdl/tib/text/TibetanMachineWeb.java
+++ b/source/org/thdl/tib/text/TibetanMachineWeb.java
@@ -60,6 +60,9 @@ public class TibetanMachineWeb implements THDLWylieConstants {
private static TibetanKeyboard keyboard = null;
private static Set charSet = null;
+ private static Set tibSet = null;
+ private static Set sanskritStackSet = null;
+ private static Set numberSet = null;
private static Set vowelSet = null;
private static Set puncSet = null;
private static Set topSet = null;
@@ -346,26 +349,64 @@ public class TibetanMachineWeb implements THDLWylieConstants {
}
String line;
boolean hashOn = false;
- boolean isSanskrit = false; //FIXME: this is never read.
+
+ // is this a Tibetan consonant or consonant stack?
+ boolean isTibetan = false;
+
+ // is this a Sanskrit consonant stack?
+ boolean isSanskrit = false;
+
boolean ignore = false;
+ tibSet = new HashSet();
+ sanskritStackSet = new HashSet();
+
while ((line = in.readLine()) != null) {
if (line.startsWith("")) { //line is command
if (line.equalsIgnoreCase("")) {
isSanskrit = false;
+ isTibetan = true;
hashOn = false;
+ ignore = false;
line = in.readLine();
- charSet = new HashSet();
+ if (null == charSet) charSet = new HashSet();
StringTokenizer st = new StringTokenizer(line,",");
while (st.hasMoreTokens()) {
String ntk;
charSet.add(ntk = st.nextToken());
+ tibSet.add(ntk);
validInputSequences.put(ntk, anyOldObjectWillDo);
}
}
+ else if (line.equalsIgnoreCase("")) {
+ // FIXME: for historical reasons, numbers go
+ // in both charSet and numberSet.
+ isSanskrit = false;
+ isTibetan = false;
+ hashOn = false;
+ ignore = false;
+ line = in.readLine();
+ if (null == charSet) charSet = new HashSet();
+ numberSet = new HashSet();
+ StringTokenizer st = new StringTokenizer(line,",");
+ while (st.hasMoreTokens()) {
+ String ntk;
+ // DLC FIXME: don't add it to numberSet
+ // and charSet here; do it in
+ // so that Jskad has the
+ // same TMW->Wylie conversion regardless
+ // of whether or not it chooses to support
+ // inputting numbers.
+ numberSet.add(ntk = st.nextToken());
+ charSet.add(ntk);
+ validInputSequences.put(ntk, anyOldObjectWillDo);
+ }
+ }
else if (line.equalsIgnoreCase("")) {
isSanskrit = false;
+ isTibetan = false;
hashOn = false;
+ ignore = false;
line = in.readLine();
vowelSet = new HashSet();
StringTokenizer st = new StringTokenizer(line,",");
@@ -377,7 +418,9 @@ public class TibetanMachineWeb implements THDLWylieConstants {
}
else if (line.equalsIgnoreCase("")) {
isSanskrit = false;
+ isTibetan = false;
hashOn = false;
+ ignore = false;
line = in.readLine();
puncSet = new HashSet();
StringTokenizer st = new StringTokenizer(line,",");
@@ -389,29 +432,47 @@ public class TibetanMachineWeb implements THDLWylieConstants {
}
else if (line.equalsIgnoreCase("")
- || line.equalsIgnoreCase("")
- || line.equalsIgnoreCase("")) {
+ || line.equalsIgnoreCase("")) {
isSanskrit = false;
+ isTibetan = false;
+ hashOn = true;
+ ignore = false;
+ }
+ else if (line.equalsIgnoreCase("")) {
+ isSanskrit = false;
+ isTibetan = true;
+ hashOn = true;
+ ignore = false;
+ }
+ else if (line.equalsIgnoreCase("")) {
+ isSanskrit = false;
+ isTibetan = false;
hashOn = true;
ignore = false;
}
else if (line.equalsIgnoreCase("")) {
isSanskrit = true;
+ isTibetan = false;
hashOn = true;
ignore = false;
}
else if (line.equalsIgnoreCase("")) {
isSanskrit = false;
+ isTibetan = false;
hashOn = false;
ignore = false;
}
- else if (line.equalsIgnoreCase(""))
+ else if (line.equalsIgnoreCase("")) {
+ isSanskrit = false;
ignore = true;
+ }
}
- else if (line.startsWith("//")) //comment
+ else if (line.startsWith("//")) { //comment
;
- else if (line.equals("")) //empty string
+ }
+ else if (line.equals("")) {//empty string
;
+ }
else {
StringTokenizer st = new StringTokenizer(line,DELIMITER,true);
@@ -559,6 +620,21 @@ public class TibetanMachineWeb implements THDLWylieConstants {
if (hashOn) {
tibHash.put(wylie, duffCodes);
}
+ if (isTibetan) {
+ // Delete the dashes:
+ StringBuffer wylieWithoutDashes = new StringBuffer(wylie);
+ for (int wl = 0; wl < wylieWithoutDashes.length(); wl++) {
+ if (wylieWithoutDashes.charAt(wl) == '-') {
+ wylieWithoutDashes.deleteCharAt(wl);
+ --wl;
+ }
+ }
+ tibSet.add(wylieWithoutDashes.toString());
+ }
+
+ if (isSanskrit) {
+ sanskritStackSet.add(wylie);
+ }
if (null == duffCodes[TMW])
throw new Error(fileName
@@ -726,13 +802,13 @@ public static boolean isFormatting(char c) {
}
/**
-* Checks to see if the passed string
-* is a character in the installed keyboard.
+* Checks to see if the passed string is a character (a single
+* [possibly Sanskrit or va or fa] consonant or a number [possibly
+* super- or subscribed]) in the installed keyboard.
*
* @param s the string you want to check
-* @return true if s is a character in the current keyboard,
-* false if not
-*/
+* @return true if s is a character in the current keyboard, false if
+* not */
public static boolean isChar(String s) {
if (currentKeyboardIsExtendedWylie())
return charSet.contains(s);
@@ -741,16 +817,58 @@ public static boolean isChar(String s) {
}
/**
-* Checks to see if the passed string
-* is a character in Extended Wylie.
+* Checks to see if the passed string is a character (a single
+* [possibly Sanskrit or va or fa] consonant or a number [possibly
+* super- or subscribed]) in Extended Wylie.
* @param s the string to be checked
-* @return true if s is a character in
-* Extended Wylie transliteration, false if not
-*/
+* @return true if s is a character in Extended Wylie transliteration,
+* false if not */
public static boolean isWylieChar(String s) {
return charSet.contains(s);
}
+
+/**
+* Checks to see if the passed string is a consonant or unadorned
+* consonant stack in Extended Wylie.
+* @param s the string to be checked
+* @return true if s is such in Extended Wylie transliteration, false
+* if not */
+public static boolean isWylieTibetanConsonantOrConsonantStack(String s) {
+ return tibSet.contains(s);
+}
+
+/**
+* Returns true if and only if s is the THDL Extended Wylie for a
+* Sanskrit multi-consonant stack.
+*/
+public static boolean isWylieSanskritConsonantStack(String s) {
+ return sanskritStackSet.contains(s);
+}
+
+/** Returns true if and only if s is the THDL Extended Wylie
+ representation of a legal tsheg-bar appendage 'i, 'e, 'u, 'o, 'am,
+ or 'ang. For example, the word le'u (chapter) contains such an
+ appendage. */
+public static boolean isWylieAchungAppendage(String s) {
+ return (s.equals("'e")
+ || s.equals("'i")
+ || s.equals("'o")
+ || s.equals("'u")
+ || s.equals("'ang")
+ || s.equals("'am"));
+}
+
+/**
+* Checks to see if the passed string is a number [possibly super- or
+* subscribed] in Extended Wylie.
+* @param s the string to be checked
+* @return true if s is a number in Extended Wylie transliteration,
+* false if not */
+public static boolean isWylieNumber(String s) {
+ return numberSet.contains(s);
+}
+
/**
* Checks to see if the passed string
* is punctuation in the installed keyboard.
@@ -826,6 +944,32 @@ public static boolean isWylieVowel(String s) {
return vowelSet.contains(s);
}
+/** Returns true if and only if wylie is the THDL Extended Wylie for
+ an adornment. An adornment is something that is part of a stack
+ but is not a consonant, such as a Tibetan or Sanskrit vowel or a
+ bindu. Note that something can be both an adornment and a vowel,
+ or both an adornment and punctuation. */
+public static boolean isWylieAdornment(String wylie) {
+ return (vowelSet.contains(wylie)
+ || (wylie.equals("M") /* U+0F7E */
+ || wylie.equals("M^") /* U+0F83 */
+ || wylie.equals("iM")
+ || wylie.equals("-iM")
+ || wylie.equals("eM")
+ || wylie.equals("aiM")
+ || wylie.equals("oM")
+ || wylie.equals("auM")));
+}
+
+/** Returns true if and only if wylie is the THDL Extended Wylie for
+ an adornment (see {@link #isWylieAdornment(String)}) that contains
+ a vowel within it. */
+public static boolean isWylieAdornmentAndContainsVowel(String wylie) {
+ return (isWylieAdornment(wylie) &&
+ !wylie.equals("M") /* U+0F7E */
+ && !wylie.equals("M^") /* U+0F83 */);
+}
+
/**
* Returns true iff this Wylie is valid as a leftmost character in a
* Tibetan syllable. For example, in the syllable 'brgyad', 'b' is the
@@ -839,9 +983,9 @@ public static boolean isWylieLeft(String s) {
}
/**
-* Returns true iff this Wylie is valid as a right (post-vowel)
-* character in a Tibetan syllable. For example, in the syllable
-* 'lags', 'g' is in the right character position. Valid right
+* Returns true iff this Wylie is valid as a suffix (i.e., a right
+* (post-vowel) character) in a Tibetan syllable. For example, in the
+* syllable 'lags', 'g' is in the right character position. Valid right
* characters include g, ng, d, n, b, m, r, l, s, ', and T.
* @param s the (Wylie) string to be checked
* @return true if s is a possible right character in a Tibetan
diff --git a/source/org/thdl/tib/text/tibwn.ini b/source/org/thdl/tib/text/tibwn.ini
index eda2b1d..064eb9f 100644
--- a/source/org/thdl/tib/text/tibwn.ini
+++ b/source/org/thdl/tib/text/tibwn.ini
@@ -7,22 +7,27 @@
// - marks a command
// - the commands are:
// Consonants - set of consonants in tibetan
+// Numbers - set of numbers in tibetan
// Vowels - set of vowels
-// Other - other characters: numbers, punctuation, etc.
+// Other - other characters: punctuation, etc.
// Input - those codes which serve basis for wylie input method
-// subtypes: Input:Punctuation, Input:Vowels, Input:Tibetan, Input:Sanskrit
+// subtypes: Input:Punctuation, Input:Vowels, Input:Tibetan,
+// Input:Numbers, Input:Sanskrit
// ToWylie - codes only needed for duff to wylie conversion, including vowels
// Ignore - ignore until another command is reached
-k,kh,g,ng,c,ch,j,ny,t,th,d,n,p,ph,b,m,ts,tsh,dz,w,zh,z,',y,r,l,sh,s,h,a,T,Th,D,N,Sh,v,f,Dz,0,1,2,3,4,5,6,7,8,9,>0,>1,>2,>3,>4,>5,>6,>7,>8,>9,<0,<1,<2,<3,<4,<5,<6,<7,<8,<9
+k,kh,g,ng,c,ch,j,ny,t,th,d,n,p,ph,b,m,ts,tsh,dz,w,zh,z,',y,r,l,sh,s,h,a,T,Th,D,N,Sh,v,f,Dz
+
+
+0,1,2,3,4,5,6,7,8,9,>0,>1,>2,>3,>4,>5,>6,>7,>8,>9,<0,<1,<2,<3,<4,<5,<6,<7,<8,<9
a,i,u,e,o,I,U,ai,au,A,-i,-I
-_, ,/,|,!,:,;,@,#,$,%,(,),H,M,`,&,@#,?,=,[,],<,>,{,},*
-// FIXME: add these etc.: M^,~,~^
+_, ,/,|,!,:,;,@,#,$,%,(,),H,M,`,&,@#,?,=,[,],{,},*
+// FIXME: add these etc.: M^,~,~^,<,> (< and > cause ka<7 to quit working)
//_~32,1~0,32
@@ -691,6 +696,8 @@ a+y~143,4~~8,63~1,109~8,120~1,123~1,125~8,106~8,113~f68,fb1
a+r~144,4~~8,64~1,109~8,120~1,123~1,125~8,106~8,113~f68,fb2
a+r+y~145,4~~8,65~1,109~8,121~1,123~1,125~8,107~8,114~f68,fb2,fb1
+
+
//numbers
0~190,1~~10,48~~~~~~~0F20
1~191,1~~10,49~~~~~~~0F21