diff --git a/source/org/thdl/tib/input/DuffPaneTest.java b/source/org/thdl/tib/input/DuffPaneTest.java index a2e0fbd..8dd352c 100644 --- a/source/org/thdl/tib/input/DuffPaneTest.java +++ b/source/org/thdl/tib/input/DuffPaneTest.java @@ -42,7 +42,7 @@ public class DuffPaneTest extends TestCase { // We don't want to load the TM or TMW font files ourselves: ThdlOptions.setUserPreference("thdl.rely.on.system.tmw.fonts", true); - ThdlOptions.setUserPreference("thdl.do.not.rely.on.system.tm.fonts", false); + ThdlOptions.setUserPreference("thdl.rely.on.system.tm.fonts", true); ThdlOptions.setUserPreference("thdl.debug", true); dp = new DuffPane(); @@ -102,11 +102,23 @@ public class DuffPaneTest extends TestCase { * and then converting the result to Extended Wylie. */ public void testWylieToIRToWylie() { ensureKeysGiveCorrectWylie("kue "); + ensureKeysGiveCorrectWylie("<8<7<0 "); + ensureKeysGiveCorrectWylie("012345678901234 "); + ensureKeysGiveCorrectWylie("ka<7 ", + "ka<7. "); + ensureKeysGiveCorrectWylie("ka <7 "); + ensureKeysGiveCorrectWylie("ka>7 ", + "ka>7. "); + ensureKeysGiveCorrectWylie("ka >7 "); +// DLC FIXME : M^ doesn't work. nga, na do, k,kh do, why not M, M^? 
ensureKeysGiveCorrectWylie("kuau "); ensureKeysGiveCorrectWylie("ku-i "); ensureKeysGiveCorrectWylie("kuai "); ensureKeysGiveCorrectWylie("cuig "); - ensureKeysGiveCorrectWylie("kcuig "); + ensureKeysGiveCorrectWylie("kcuig ", + "kacuiga "); + ensureKeysGiveCorrectWylie("gcuig "); + ensureKeysGiveCorrectWylie("gcuigs'e'i'i'o'am'ang'e'o'u'am'am "); ensureKeysGiveCorrectWylie("nga "); ensureKeysGiveCorrectWylie("nga /"); @@ -144,17 +156,20 @@ public class DuffPaneTest extends TestCase { ensureKeysGiveCorrectWylie("blar.d"); ensureKeysGiveCorrectWylie("blarad", "blar.d"); - ensureKeysGiveCorrectWylie("b.lard"); + ensureKeysGiveCorrectWylie("b.lard", + "balarda"); ensureKeysGiveCorrectWylie("b.lal.d"); ensureKeysGiveCorrectWylie("blald", "blalda"); - ensureKeysGiveCorrectWylie("b.lald"); + ensureKeysGiveCorrectWylie("b.lald", + "balalda"); ensureKeysGiveCorrectWylie("b.las.d"); ensureKeysGiveCorrectWylie("blasd", "blasda"); - ensureKeysGiveCorrectWylie("b.lasd"); + ensureKeysGiveCorrectWylie("b.lasd", + "balasda"); ensureKeysGiveCorrectWylie("b.lag"); ensureKeysGiveCorrectWylie("blg", @@ -233,7 +248,7 @@ public class DuffPaneTest extends TestCase { ensureKeysGiveCorrectWylie("b.lags"); ensureKeysGiveCorrectWylie("blags"); - // DLC add b-r-g-s, b-l-g-s, + // DLC FIXME: add b-r-g-s, b-l-g-s, etc. ensureKeysGiveCorrectWylie("mngas", @@ -268,15 +283,43 @@ public class DuffPaneTest extends TestCase { ensureKeysGiveCorrectWylie("skalazasa"); ensureKeysGiveCorrectWylie("jskad", "jaskada"); - } - { - // These are incorrectly handled in terms of - // makeIllegalTibetanGoEndToEnd. DLC FIXME. ensureKeysGiveCorrectWylie("jeskad", - "jeskd"); - ensureKeysGiveCorrectWylie("jeskd"); + "jeskada"); + ensureKeysGiveCorrectWylie("jeskd", + "jesakada"); ensureKeysGiveCorrectWylie("jesakada", - "jeskd"); + "jesakada"); } + + { + // DLC FIXME: ai gives a.ai, a.i is required to get ai. + + // DLC FIXME: haaa doesn't get you h.a., neither does + // ha.a; achen is tough to get. 
+ } + + ensureKeysGiveCorrectWylie("heM hiM h-iM heM haiM hoM hauM hUM "); + ensureKeysGiveCorrectWylie("hi.M ho.M he.M hu.M", + "hiM hoM heM huM"); + + ensureKeysGiveCorrectWylie("brgwU-imd"); + + ensureKeysGiveCorrectWylie("pad+me"); + ensureKeysGiveCorrectWylie("pad+men+b+h+yuM"); + + ensureKeysGiveCorrectWylie("bskyUMbs"); + ensureKeysGiveCorrectWylie("bskyUMbsHgro "); + + ensureKeysGiveCorrectWylie("favakakhagangacachajanyatathadanapaphabamatsatshadzawazhaza'ayaralashasahaTaThaDaNaSha"); + ensureKeysGiveCorrectWylie("fevekekhegengecechejenyetethedenepephebemetsetshedzewezheze'eyerelesheseheTeTheDeNeShe"); + ensureKeysGiveCorrectWylie("fuvukukhugungucuchujunyututhudunupuphubumutsutshudzuwuzhuzu'uyurulushusuhuTuThuDuNuShu"); + ensureKeysGiveCorrectWylie("fovokokhogongocochojonyotothodonopophobomotsotshodzowozhozo'oyoroloshosohoToThoDoNoSho"); + ensureKeysGiveCorrectWylie("faivaikaikhaigaingaicaichaijainyaitaithaidainaipaiphaibaimaitsaitshaidzaiwaizhaizai'aiyairailaishaisaihaiTaiThaiDaiNaiShai"); + ensureKeysGiveCorrectWylie("fauvaukaukhaugaungaucauchaujaunyautauthaudaunaupauphaubaumautsautshaudzauwauzhauzau'auyauraulaushausauhauTauThauDauNauShau"); + ensureKeysGiveCorrectWylie("fivikikhigingicichijinyitithidinipiphibimitsitshidziwizhizi'iyirilishisihiTiThiDiNiShi"); + + ensureKeysGiveCorrectWylie("don't touch my coffee/that makes me very angry/supersize my drink", + "dona'ata tocha mya cofafe/thata mkes me veraya angaraya/superasize mya drinaka"); + } } diff --git a/source/org/thdl/tib/text/DuffCode.java b/source/org/thdl/tib/text/DuffCode.java index 316e9b6..526f96f 100644 --- a/source/org/thdl/tib/text/DuffCode.java +++ b/source/org/thdl/tib/text/DuffCode.java @@ -152,7 +152,11 @@ public final class DuffCode { /** * @return a string representation of this object */ public String toString() { - return ""; } @@ -160,7 +164,11 @@ public final class DuffCode { * @param TMW if this DuffCode represents a TMW glyph, not a TM glyph * @return a string representation of 
this object */ public String toString(boolean TMW) { - return "= 0 - && TibetanMachineWeb.getWylieForGlyph((DuffCode)glyphList.get(effectiveSize), noSuchWylie).equals(ACHUNG)) { - if (null == tailEndWylie) tailEndWylie = new StringBuffer(); - // prepend: - tailEndWylie.insert(0, - ACHUNG - + aVowelToUseAfter(ACHUNG) - + TibetanMachineWeb.getWylieForGlyph((DuffCode)glyphList.get(effectiveSize + 1), noSuchWylie)); - effectiveSize -= 2; - } - if (null != tailEndWylie) { - return (withA(glyphList.subList(0, effectiveSize + 2), noSuchWylie) - + tailEndWylie.toString()); - } - } - - if (makeIllegalTibetanGoEndToEnd - && (size > 4 // this is too many glyphs to be legal - // this is illegal because it doesn't begin - // with a prefix: - || (size == 4 - && (!TibetanMachineWeb.isWylieLeft(TibetanMachineWeb.getWylieForGlyph((DuffCode)glyphList.get(0), weDoNotCareIfThereIsCorrespondingWylieOrNot)) - // this is illegal because it doesn't have a - // suffix in the proper place, e.g. mjskad: - || !TibetanMachineWeb.isWylieRight(TibetanMachineWeb.getWylieForGlyph((DuffCode)glyphList.get(size - 2), weDoNotCareIfThereIsCorrespondingWylieOrNot)) - // this is illegal because it doesn't have a - // postsuffix in the proper place, - // e.g. 'lan.g, which would otherwise become - // 'lang (with nga, not na and then ga): - || !TibetanMachineWeb.isWylieFarRight(TibetanMachineWeb.getWylieForGlyph((DuffCode)glyphList.get(size - 1), weDoNotCareIfThereIsCorrespondingWylieOrNot)))))) { - for (int i = 0; i < size; i++) { - wylie = TibetanMachineWeb.getWylieForGlyph((DuffCode)glyphList.get(i), noSuchWylie); - if (TibetanMachineWeb.isAmbiguousWylie(lastWylie, wylie) - || (i != 0 && wylie.equals(ACHEN))) - sb.append(WYLIE_DISAMBIGUATING_KEY); - - sb.append(wylie + aVowelToUseAfter(wylie)); - lastWylie = wylie; - } - return sb.toString(); - } - - /* Else, chew up all the glyphs except for the last two. Then decide. 
*/ - int i = 0; - while (i+2 < size) { - wylie = TibetanMachineWeb.getWylieForGlyph((DuffCode)glyphList.get(i), noSuchWylie); - if (TibetanMachineWeb.isAmbiguousWylie(lastWylie, wylie) - || (i != 0 && wylie.equals(ACHEN))) - sb.append(WYLIE_DISAMBIGUATING_KEY); - - sb.append(wylie); - lastWylie = wylie; - i++; - } - - String wylie1 - = TibetanMachineWeb.getWylieForGlyph((DuffCode)glyphList.get(i), noSuchWylie); - String wylie2 - = TibetanMachineWeb.getWylieForGlyph((DuffCode)glyphList.get(i + 1), noSuchWylie); - - if (size == 3) { - String wylie0 = lastWylie; - // Let's see if wylie0+wylie1+wylie2 is ambiguous - // -- if wylie0 could be a prefix and if wylie1 - // could be a suffix, and if wylie2 is "s". If - // it's ambigous, let's look up - // wylie0+wylie1+wylie2 in our magic table. - // Otherwise, see if we have a prefix, and if we - // do, the "a" vowel comes after wylie1. Else the - // "a" vowel comes after wylie0. - if (TibetanMachineWeb.isWylieLeft(wylie0)) { - /* is it ambiguous? */ - if (TibetanMachineWeb.isWylieRight(wylie1) - && SA.equals(wylie2) /* isWylieFarRight would - * work, but the list of - * 9 words doesn't have - * any ending with d -- - * all end with s. */) { - /* Yes, this is ambiguous. How do we handle - * it? See this from Andres: - * - * I'm posting this upon David Chandler's - * request. According to Lobsang Thonden in - * Modern Tibetan Grammar Language (page 42), - * with regards to identifying the root letter - * in 3 lettered words there are only 23 - * ambiguous cases. 
He writes: - * - * If the last letter is 'sa' and the first - * two letters are affixes, then the SECOND - * ONE is the root letter in the following 9 - * WORDS ONLY: - * - * gdas gnas gsas dgas dmas bdas mdas 'gas - * 'das - * - * And the FIRST is the root letter in the - * following 14 WORDS ONLY: - * - * rags lags nags bags bangs gangs rangs langs - * nangs sangs babs rabs rams nams - * - * As I mentioned before, I think that the - * best solution for now is to hard-wire these - * cases. Even if the list is not exhaustive, - * at least we'll have most cases covered. - */ - - /* FIXME: these constants are hard-wired here, - * rather than in TibetanMachineWeb, because - * I'm lazy. */ - if ((wylie0.equals("g") && (wylie1.equals("d") || wylie1.equals("n") || wylie1.equals("s"))) - || (wylie0.equals("d") && (wylie1.equals("g") || wylie1.equals("m"))) - || (wylie0.equals("b") && wylie1.equals("d")) - || (wylie0.equals("m") && wylie1.equals("d")) - || (wylie0.equals("'") && (wylie1.equals("g") || wylie1.equals("d")))) { - sb.append(wylie1 - + aVowelToUseAfter(wylie1) - + wylie2); - } else { - sb.append(aVowelToUseAfter(wylie0) - + unambiguousPostAVowelWylie(wylie1, - wylie2)); - } - - } else { - /* no ambiguity. the "a" vowel comes after - * wylie1. */ - if (TibetanMachineWeb.isAmbiguousWylie(wylie0, wylie1)) - sb.append(WYLIE_DISAMBIGUATING_KEY); - sb.append(wylie1 - + aVowelToUseAfter(wylie1) - + wylie2); - } - } else { - if (makeIllegalTibetanGoEndToEnd - && !(TibetanMachineWeb.isWylieRight(wylie1) - && TibetanMachineWeb.isWylieFarRight(wylie2))) { - /* handle skaskaska, e.g. */ - sb.append(aVowelToUseAfter(wylie0) - + wylie1 - + aVowelToUseAfter(wylie1) - + wylie2 - + aVowelToUseAfter(wylie2)); - } else { - /* no ambiguity. the "a" vowel comes after - * wylie0. */ - sb.append(aVowelToUseAfter(wylie0) - + unambiguousPostAVowelWylie(wylie1, - wylie2)); - } - } - } else { - /* If size==4, then we assume this is legal. If - * size==5, anything will do! 
So assume we have a - * prefix, a root letter, a suffix, and a postsuffix. - * The "a" vowel comes after the root letter. */ - sb.append(aVowelToUseAfter(lastWylie) - + unambiguousPostAVowelWylie(wylie1, - wylie2)); - } - return sb.toString(); - } - } - -/** -* Gets the Extended Wylie for a list of glyphs. Passed a list of -* TibetanMachineWeb glyphs that constitute a partial or complete -* syllable, this method scans the list, and then returns a string of -* Wylie corresponding to this sequence. No 'a' vowel is inserted -* because it is assumed that the glyph list already contains some -* other vowel. If the glyph list does not already contain a vowel, -* then this method should not be called. -* -* @param glyphList a list of TibetanMachineWeb glyphs, i.e. {@link -* org.thdl.tib.text.DuffCode DuffCodes} -* @param isBeforeVowel true if these glyphs occur before a vowel, -* false if these glyphs occur after a vowel -* @param noSuchWylie an array which will not be touched if this is -* successful; however, if there is no THDL Extended Wylie -* corresponding to these glyphs, then noSuchWylie[0] will be set to -* true -* @return the Wylie string corresponding to this glyph list */ - public static String withoutA(java.util.ArrayList glyphList, boolean isBeforeVowel, boolean noSuchWylie[]) { - StringBuffer sb = new StringBuffer(); - Iterator iter = glyphList.iterator(); - DuffCode dc; - String currWylie; - String lastWylie = new String(); - - while (iter.hasNext()) { - dc = (DuffCode)iter.next(); - currWylie = TibetanMachineWeb.getWylieForGlyph(dc, noSuchWylie); - - if (TibetanMachineWeb.isAmbiguousWylie(lastWylie, currWylie) - || (!lastWylie.equals("") - && currWylie.equals(ACHEN))) - sb.append(WYLIE_DISAMBIGUATING_KEY); - - /* le'ang, not le'ng, to be consistent w.r.t. pa'am - * vs. 
pa'm: */ - if (lastWylie.equals(ACHUNG) && !isBeforeVowel) - sb.append(WYLIE_aVOWEL); - - sb.append(currWylie); - - lastWylie = currWylie; - } - - // DLC FIXME: type jeskada, convert Tibetan->Wylie. You get - // the wrong thing in makeIllegalTibetanGoEndToEnd mode. Fix - // it here. - return sb.toString(); - } - /** * Gets the Extended Wylie for a sequence of glyphs. * @param dcs an array of glyphs @@ -1100,173 +810,654 @@ public class TibTextUtils implements THDLWylieConstants { * successful; however, if there is no THDL Extended Wylie * corresponding to these glyphs, then noSuchWylie[0] will be set to * true -* @return the Extended Wylie corresponding to these glyphs */ - public static String getWylie(DuffCode[] dcs, boolean noSuchWylie[]) { - if (dcs.length == 0) - return null; +* @return the Extended Wylie corresponding to these glyphs, or null */ + public static String getWylie(DuffCode[] dcs, boolean noSuchWylie[]) { + StringBuffer warnings = (debug ? new StringBuffer() : null); + String ans = getWylieImplementation(dcs, noSuchWylie, warnings); + if (debug && warnings.length() > 0) + System.out.println("DEBUG: warnings in TMW->Wylie: " + warnings); + return ans; + } - char ch; - String wylie; + /** True for and only for ma and nga because 'am and 'ang are + appendages. */ + private static final boolean isAppendageNonVowelWylie(String wylie) { + return (MA.equals(wylie) || NGA.equals(wylie)); + } - ArrayList glyphList = new ArrayList(); - boolean needsVowel = true; - boolean isLastVowel = false; - int start = 0; - StringBuffer wylieBuffer = new StringBuffer(); + /** Scans the glyphs in glyphList and creates the returned list of + grapheme clusters based on them. A grapheme cluster is a + consonant or consonant stack with optional adornment or a + number (possibly super- or subscribed) or some other glyph + alone. 
*/ + private static ArrayList breakTshegBarIntoGraphemeClusters(java.util.List glyphList, + boolean noSuchWylie[]) { - for (int i=start; i 0); + + // A list of grapheme clusters (see UnicodeGraphemeCluster). + // sz is an overestimate (speeds us up, wastes some memory). + ArrayList gcs = new ArrayList(sz); + + StringBuffer buildingUpGc = new StringBuffer(); + + boolean consonantal_with_vowel = false; + boolean buildingUpSanskrit = false; + for (int i = 0; i < sz; i++) { + DuffCode dc = (DuffCode)glyphList.get(i); + String wylie = TibetanMachineWeb.getWylieForGlyph(dc, noSuchWylie); + boolean containsWylieVowel = false; + boolean buildingUpSanskritNext = false; + if ((buildingUpSanskritNext + = TibetanMachineWeb.isWylieSanskritConsonantStack(wylie)) + || TibetanMachineWeb.isWylieTibetanConsonantOrConsonantStack(wylie)) { + if (buildingUpGc.length() > 0) { + gcs.add(new TGCPair(buildingUpGc.toString(), + consonantal_with_vowel + ? (buildingUpSanskrit + ? TGCPair.SANSKRIT_WITH_VOWEL + : TGCPair.CONSONANTAL_WITH_VOWEL) + : (buildingUpSanskrit + ? TGCPair.SANSKRIT_WITHOUT_VOWEL + : TGCPair.CONSONANTAL_WITHOUT_VOWEL))); + buildingUpGc.delete(0, buildingUpGc.length()); + } + buildingUpGc.append(wylie); + consonantal_with_vowel = false; + buildingUpSanskrit = buildingUpSanskritNext; + } else if ((containsWylieVowel + = TibetanMachineWeb.isWylieAdornmentAndContainsVowel(wylie)) + || TibetanMachineWeb.isWylieAdornment(wylie)) { + + if (buildingUpGc.length() > 0) { + buildingUpGc.append(wylie); + if (containsWylieVowel) { + if (debug) + System.out.println("DEBUG: with_vowel is true thanks to " + wylie); + consonantal_with_vowel = true; + } + // do not clear; we might have {cui} or {hUM}, e.g. + } else { + gcs.add(new TGCPair(wylie, + TGCPair.LONE_VOWEL)); + consonantal_with_vowel = false; + } + } else { + // number or weird thing: + + if (buildingUpGc.length() > 0) { + gcs.add(new TGCPair(buildingUpGc.toString(), + consonantal_with_vowel + ? (buildingUpSanskrit + ? 
TGCPair.SANSKRIT_WITH_VOWEL + : TGCPair.CONSONANTAL_WITH_VOWEL) + : (buildingUpSanskrit + ? TGCPair.SANSKRIT_WITHOUT_VOWEL + : TGCPair.CONSONANTAL_WITHOUT_VOWEL))); + buildingUpGc.delete(0, buildingUpGc.length()); + } + gcs.add(new TGCPair(wylie, TGCPair.OTHER)); + consonantal_with_vowel = false; + buildingUpSanskrit = false; + } + } + if (buildingUpGc.length() > 0) { + gcs.add(new TGCPair(buildingUpGc.toString(), + consonantal_with_vowel + ? (buildingUpSanskrit + ? TGCPair.SANSKRIT_WITH_VOWEL + : TGCPair.CONSONANTAL_WITH_VOWEL) + : (buildingUpSanskrit + ? TGCPair.SANSKRIT_WITHOUT_VOWEL + : TGCPair.CONSONANTAL_WITHOUT_VOWEL))); + } + buildingUpGc = null; + return gcs; + } + + + private static String getClassificationOfTshegBar(ArrayList gcs, + StringBuffer warnings) { + String candidateType = null; + // Now that we have grapheme clusters, see if they match any + // of the "legal tsheg bars": + int sz = gcs.size(); + for (int i = 0; i < sz; i++) { + TGCPair tp = (TGCPair)gcs.get(i); + int cls = tp.classification; + String wylie = tp.wylie; + if (TGCPair.OTHER == cls) { + if (TibetanMachineWeb.isWylieNumber(wylie)) { + if (null == candidateType) { + candidateType = "number"; + } else { + if ("number" != candidateType) { + if (null != warnings) + warnings.append("Found something odd; the wylie is " + wylie + "\n"); + candidateType = "invalid"; + break; + } + } + } else { + if (null != warnings) + warnings.append("Found something odd; the wylie is " + wylie + "\n"); + candidateType = "invalid"; + break; + } + } else if (TGCPair.SANSKRIT_WITHOUT_VOWEL == cls + || TGCPair.SANSKRIT_WITH_VOWEL == cls) { + candidateType = "invalid"; + } else if (TGCPair.CONSONANTAL_WITHOUT_VOWEL == cls + || TGCPair.CONSONANTAL_WITH_VOWEL == cls) { + if (null == candidateType) { + if (TibetanMachineWeb.isWylieLeft(wylie)) { + candidateType = "prefix/root"; + } else { + candidateType = "root"; + } + } else { + if ("prefix/root" == candidateType) { + if (ACHUNG.equals(wylie)) { + // peek ahead 
to distinguish between ba's, + // ba'ala and ba'am: + TGCPair nexttp = (i+1 < sz) ? (TGCPair)gcs.get(i+1) : null; + String nextwylie = (nexttp == null) ? "" : nexttp.wylie; + if (isAppendageNonVowelWylie(nextwylie)) { + candidateType = "maybe-appendaged-prefix/root"; + } else { + candidateType = "prefix/root-root/suffix"; + } + } else if (TibetanMachineWeb.isWylieRight(wylie)) { + candidateType = "prefix/root-root/suffix"; + } else if (TibetanMachineWeb.isWylieAchungAppendage(wylie)) { + candidateType = "appendaged-prefix/root"; + } else { + candidateType = "prefix-root"; + } + } else if ("root" == candidateType) { + if (ACHUNG.equals(wylie)) { + // peek ahead to distinguish between pa's, + // pa'ala and pa'am: + TGCPair nexttp = (i+1 < sz) ? (TGCPair)gcs.get(i+1) : null; + String nextwylie = (nexttp == null) ? "" : nexttp.wylie; + if (isAppendageNonVowelWylie(nextwylie)) { + candidateType = "maybe-appendaged-root"; + } else { + candidateType = "root-suffix"; + } + } else if (TibetanMachineWeb.isWylieRight(wylie)) { + candidateType = "root-suffix"; + } else if (TibetanMachineWeb.isWylieAchungAppendage(wylie)) { + candidateType = "appendaged-root"; + } else { + if (null != warnings) + warnings.append("Found a non-prefix consonant or consonant stack followed by a consonant or consonant stack that is not simply a suffix; that thing's wylie is " + wylie + "\n"); + candidateType = "invalid"; + break; + } + } else if ("prefix-root" == candidateType) { + if (ACHUNG.equals(wylie)) { + // peek ahead to distinguish between bpa's, + // bpa'ala and bpa'am: + TGCPair nexttp = (i+1 < sz) ? (TGCPair)gcs.get(i+1) : null; + String nextwylie = (nexttp == null) ? 
"" : nexttp.wylie; + if (isAppendageNonVowelWylie(nextwylie)) { + candidateType = "maybe-appendaged-prefix-root"; + } else { + candidateType = "prefix-root-suffix"; + } + } else if (TibetanMachineWeb.isWylieRight(wylie)) { + candidateType = "prefix-root-suffix"; + } else if (TibetanMachineWeb.isWylieAchungAppendage(wylie)) { + candidateType = "appendaged-prefix-root"; + } else { + if (null != warnings) + warnings.append("Found a prefix plus a root stack plus a non-suffix consonant or consonant stack whose wylie is " + wylie + "\n"); + candidateType = "invalid"; + break; + } + } else if ("prefix/root-root/suffix" == candidateType) { + // this has no peekahead, gag'am works. + if (ACHUNG.equals(wylie)) { + // peek ahead to distinguish between + // gga'am and gaga'ala: + TGCPair nexttp = (i+1 < sz) ? (TGCPair)gcs.get(i+1) : null; + String nextwylie = (nexttp == null) ? "" : nexttp.wylie; + if (isAppendageNonVowelWylie(nextwylie)) { + candidateType = "maybe-appendaged-prefix/root-root/suffix"; + } else { + candidateType = "prefix-root-suffix"; + } + } else if (TibetanMachineWeb.isWylieFarRight(wylie)) { + candidateType = "prefix/root-root/suffix-suffix/postsuffix"; + } else if (TibetanMachineWeb.isWylieRight(wylie)) { + candidateType = "prefix-root-suffix"; + } else if (TibetanMachineWeb.isWylieAchungAppendage(wylie)) { + candidateType = "appendaged-prefix/root-root/suffix"; + } else { + if (null != warnings) + warnings.append("Found a prefix/root stack plus a suffix/root stack plus a non-suffix, non-postsuffix consonant or consonant stack whose wylie is " + wylie + "\n"); + candidateType = "invalid"; + break; + } + } else if ("root-suffix" == candidateType) { + // This has no peekahead w.r.t. 'am and 'ang, + // but it needs none because we peeked to be + // sure that this was root-suffix and not + // maybe-appendaged-root. 
+ if (TibetanMachineWeb.isWylieFarRight(wylie)) { + candidateType = "root-suffix-postsuffix"; + } else if (TibetanMachineWeb.isWylieAchungAppendage(wylie)) { + candidateType = "appendaged-root-suffix"; + } else if (ACHUNG.equals(wylie)) { + candidateType = "maybe-appendaged-root-suffix"; + } else { + if (null != warnings) + warnings.append("Found a root stack plus a suffix plus a non-postsuffix consonant or consonant stack whose wylie is " + wylie + "\n"); + candidateType = "invalid"; + break; + } + } else if ("prefix/root-root/suffix-suffix/postsuffix" == candidateType + || "prefix-root-suffix" == candidateType) { + // this has no peekahead and needs none. + if (TibetanMachineWeb.isWylieFarRight(wylie)) { + candidateType = "prefix-root-suffix-postsuffix"; + } else if (TibetanMachineWeb.isWylieAchungAppendage(wylie)) { + // if we simply prepended to + // candidateType, we wouldn't get interned + // strings. + candidateType = ("appendaged-" + candidateType).intern(); + } else if (ACHUNG.equals(wylie)) { + candidateType = ("maybe-appendaged-" + candidateType).intern(); + } else { + if (null != warnings) + warnings.append("Found a prefix/root stack plus a suffix/root stack plus a suffix/postsuffix plus a non-postsuffix consonant or consonant stack whose wylie is " + wylie + "\n"); + candidateType = "invalid"; + break; + } + } else if ("prefix-root-suffix-postsuffix" == candidateType) { + // this has no peekahead and needs none. 
+ if (TibetanMachineWeb.isWylieAchungAppendage(wylie)) { + candidateType = "appendaged-prefix-root-suffix-postsuffix"; + } else if (ACHUNG.equals(wylie)) { + candidateType = "maybe-appendaged-prefix-root-suffix-postsuffix"; + } else { + if (null != warnings) + warnings.append("Found a prefix plus root stack plus suffix plus postsuffix; then found yet another consonant or consonant stack whose wylie is " + wylie + "\n"); + candidateType = "invalid"; + break; + } + } else if ("root-suffix-postsuffix" == candidateType) { + // this has no peekahead and needs none. + if (TibetanMachineWeb.isWylieAchungAppendage(wylie)) { + candidateType = "appendaged-root-suffix-postsuffix"; + } else if (ACHUNG.equals(wylie)) { + candidateType = "maybe-appendaged-root-suffix-postsuffix"; + } else { + if (null != warnings) + warnings.append("Found a root stack plus suffix plus postsuffix; then found yet another consonant or consonant stack whose wylie is " + wylie + "\n"); + candidateType = "invalid"; + break; + } + } else if (candidateType.startsWith("maybe-appendaged-")) { + if (isAppendageNonVowelWylie(wylie)) { + candidateType + = candidateType.substring("maybe-".length()).intern(); + // So that we get 'am, not 'm; 'ang, not 'ng: + tp.wylie = WYLIE_aVOWEL + tp.wylie; + } else { + if (null != warnings) + warnings.append("Found a tsheg bar that has an achung (" + ACHUNG + ") tacked on, followed by some other thing whose wylie is " + wylie + "\n"); + candidateType = "invalid"; + break; + } + } else if (candidateType.startsWith("appendaged-")) { + if (TibetanMachineWeb.isWylieAchungAppendage(wylie)) { + // candidateType stays what it is. 
+ } else if (ACHUNG.equals(wylie)) { + candidateType = ("maybe-" + candidateType).intern(); + } else { + if (null != warnings) + warnings.append("Found a tsheg bar that has a 'i, 'e, 'o, 'u, or 'ang 'am appendage already and then found yet another consonant or consonant stack whose wylie is " + wylie + "\n"); + candidateType = "invalid"; + break; + } + } else { + if ("number" != candidateType) + throw new Error("missed a case"); + if (null != warnings) + warnings.append("Found a consonant or consonant stack after something odd; the consonantish thing has wylie " + wylie + "\n"); + candidateType = "invalid"; + break; + } + } + } else if (TGCPair.LONE_VOWEL == cls) { + if (null != warnings) + warnings.append("Found a vowel that did not follow either a Tibetan consonant or consonant stack or another vowel."); + candidateType = "invalid"; + break; + } else { + throw new Error("bad cls"); + } + } + if (candidateType.startsWith("maybe-appendaged-")) { + if (null != warnings) + warnings.append("Found a tsheg bar that has an extra achung (" + ACHUNG + ") tacked on\n"); + candidateType = "invalid"; + } + return candidateType; + } + + /** Appends to wylieBuffer the wylie for the glyph list glyphList + (which should be an ArrayList for speed). This will be very + user-friendly for "legal tsheg bars" and will be valid, but + possibly ugly (interspersed with disambiguators or extra + vowels, etc.) Wylie for other things, such as Sanskrit + transliteration. Updates warnings and noSuchWylie like the + caller does. + +

What constitutes a legal, non-punctuation, non-whitespace + tsheg bar? The following are the only such:

+
    +
  • one or more numbers
  • + +
  • a single, possibly adorned consonant stack
  • + +
  • a legal "tyllable" appended with zero or more particles + from the set { 'i, 'o, 'u, 'e, 'ang, 'am }
  • +
+ +

A "tyllable" is, by definition, one of the following:

+ +
    +
  • a single, possibly adorned consonant stack
  • + +
  • two consonant stacks where one is a single, + unadorned consonant (and is a prefix if it is first and + a suffix if it is last) and the other is possibly + adorned
  • + +
  • three consonant stacks where at most one has adornment. + If the second has adornment, then the first must be an + unadorned prefix consonant and the last must be an + unadorned suffix consonant. If the first has adornment, + then the second must be an unadorned suffix consonant + and the third must be an unadorned secondary suffix + consonant.
  • + +
  • four consonant stacks where either none is adorned or + only the second consonant stack is adorned, the first is + an unadorned prefix consonant, the third is an unadorned + suffix consonant, and the fourth is an unadorned + secondary suffix consonant.
  • + +
+ +

When there are three unadorned consonant stacks in a + tyllable, a hard-coded list of valid Tibetan tsheg bars is + relied upon to determine if the 'a' vowel comes after the + first or the second consonant.

*/ + private static void getTshegBarWylie(java.util.List glyphList, + boolean noSuchWylie[], + StringBuffer warnings, + StringBuffer wylieBuffer) { + ArrayList gcs + = breakTshegBarIntoGraphemeClusters(glyphList, noSuchWylie); + String candidateType = getClassificationOfTshegBar(gcs, warnings); + int sz = gcs.size(); + if (candidateType == "invalid") { + // Forget beauty and succintness -- just be sure to + // generate Wylie that can be converted unambiguously into + // Tibetan. Use a disambiguator or vowel after each + // grapheme cluster. + // + // If we truly didn't care about beauty, we'd just lump + // SANSKRIT_WITHOUT_VOWEL and SANSKRIT_WITH_VOWEL into + // OTHER. + + for (int i = 0; i < sz; i++) { + TGCPair tp = (TGCPair)gcs.get(i); + int cls = tp.classification; + String wylie = tp.wylie; + wylieBuffer.append(wylie); + if (TibetanMachineWeb.isWylieTibetanConsonantOrConsonantStack(wylie) + || TibetanMachineWeb.isWylieSanskritConsonantStack(wylie)) { + wylieBuffer.append(aVowelToUseAfter(wylie)); + } else { + if (TGCPair.CONSONANTAL_WITH_VOWEL != cls + && TGCPair.SANSKRIT_WITH_VOWEL != cls) + wylieBuffer.append(WYLIE_DISAMBIGUATING_KEY); + } + } + } else { + // Generate perfect, beautiful, Wylie, using the minimum + // number of vowels and disambiguators. + + int leftover = sz + 1; + + // Appendaged vs. not appendaged? it affects nothing at + // this stage. + if (candidateType.startsWith("appendaged-")) { + candidateType + = candidateType.substring("appendaged-".length()).intern(); + } + + if ("prefix/root-root/suffix-suffix/postsuffix" == candidateType) { + /* Yes, this is ambiguous. How do we handle it? See + * this from Andres: + * + * I'm posting this upon David Chandler's + * request. According to Lobsang Thonden in Modern + * Tibetan Grammar Language (page 42), with regards to + * identifying the root letter in 3 lettered words + * there are only 23 ambiguous cases. 
He writes: + * + * If the last letter is 'sa' and the first two + * letters are affixes, then the SECOND ONE is the + * root letter in the following 9 WORDS ONLY: + * + * gdas gnas gsas dgas dmas bdas mdas 'gas 'das + * + * And the FIRST is the root letter in the following + * 14 WORDS ONLY: + * + * rags lags nags bags bangs gangs rangs langs nangs + * sangs babs rabs rams nams + * + * As I mentioned before, I think that the best + * solution for now is to hard-wire these cases. Even + * if the list is not exhaustive, at least we'll have + * most cases covered. */ + + leftover = 3; + /* FIXME: these constants are hard-wired here, rather + * than in TibetanMachineWeb, because I'm lazy. */ + String wylie1 = ((TGCPair)gcs.get(0)).wylie; + String wylie2 = ((TGCPair)gcs.get(1)).wylie; + String wylie3 = ((TGCPair)gcs.get(2)).wylie; + if ((wylie1.equals("g") && (wylie2.equals("d") || wylie2.equals("n") || wylie2.equals("s"))) + || (wylie1.equals("d") && (wylie2.equals("g") || wylie2.equals("m"))) + || (wylie1.equals("b") && wylie2.equals("d")) + || (wylie1.equals("m") && wylie2.equals("d")) + || (wylie1.equals("'") && (wylie2.equals("g") || wylie2.equals("d")))) { + if (TibetanMachineWeb.isAmbiguousWylie(wylie1, wylie2)) + wylieBuffer.append(wylie1 + WYLIE_DISAMBIGUATING_KEY + wylie2); + else + wylieBuffer.append(wylie1 + wylie2); + + wylieBuffer.append(aVowelToUseAfter(wylie2) + + wylie3); + } else { + wylieBuffer.append(wylie1 + + aVowelToUseAfter(wylie1) + + unambiguousPostAVowelWylie(wylie2, + wylie3)); + } + } else if ("root" == candidateType + || "prefix/root-root/suffix" == candidateType + || "prefix/root" == candidateType + || "root-suffix-postsuffix" == candidateType + || "root-suffix" == candidateType) { + String wylie1 = ((TGCPair)gcs.get(0)).wylie; + leftover = 1; + wylieBuffer.append(wylie1); + if (((TGCPair)gcs.get(0)).classification + != TGCPair.CONSONANTAL_WITH_VOWEL) { + ThdlDebug.verify(TGCPair.CONSONANTAL_WITHOUT_VOWEL + == 
((TGCPair)gcs.get(0)).classification); + wylieBuffer.append(aVowelToUseAfter(wylie1)); + if (debug) System.out.println("DEBUG: appending vowel"); + } else { + if (debug) System.out.println("DEBUG: already has vowel 2"); + } + if ("root-suffix-postsuffix" == candidateType) { + leftover = 3; + String wylie2 = ((TGCPair)gcs.get(1)).wylie; + String wylie3 = ((TGCPair)gcs.get(2)).wylie; + wylieBuffer.append(unambiguousPostAVowelWylie(wylie2, + wylie3)); + } + } else if ("prefix-root-suffix" == candidateType + || "prefix-root" == candidateType + || "prefix-root-suffix-postsuffix" == candidateType) { + String wylie1 = ((TGCPair)gcs.get(0)).wylie; + String wylie2 = ((TGCPair)gcs.get(1)).wylie; + leftover = 2; + if (TibetanMachineWeb.isAmbiguousWylie(wylie1, wylie2)) + wylieBuffer.append(wylie1 + WYLIE_DISAMBIGUATING_KEY + wylie2); + else + wylieBuffer.append(wylie1 + wylie2); + + if (((TGCPair)gcs.get(1)).classification + != TGCPair.CONSONANTAL_WITH_VOWEL) { + ThdlDebug.verify(TGCPair.CONSONANTAL_WITHOUT_VOWEL + == ((TGCPair)gcs.get(1)).classification); + if (debug) System.out.println("DEBUG: appending vowel"); + wylieBuffer.append(aVowelToUseAfter(wylie2)); + } else { + if (debug) System.out.println("DEBUG: already has vowel 1"); + } + if ("prefix-root-suffix-postsuffix" == candidateType) { + leftover = 4; + String wylie3 = ((TGCPair)gcs.get(2)).wylie; + String wylie4 = ((TGCPair)gcs.get(3)).wylie; + wylieBuffer.append(unambiguousPostAVowelWylie(wylie3, + wylie4)); + } + } else if ("number" == candidateType) { + leftover = 0; + } else { + throw new Error("missed a case down here"); + } + + // append the wylie left over: + for (int i = leftover; i < sz; i++) { + TGCPair tp = (TGCPair)gcs.get(i); + String wylie = tp.wylie; + wylieBuffer.append(wylie); + } + } + } + +/** +* Gets the Extended Wylie for a sequence of glyphs using Chandler's +* experimental method. This works as follows: +* +*

We run along until we hit whitespace or punctuation. We take +* everything before that and we see if it's a legal Tibetan tsheg bar, +* either a number or a word fragment. If it is, we insert only one +* vowel in the correct place. If not, then we throw a disambiguating +* key or a vowel after each stack. +* +* @param dcs an array of glyphs +* @param noSuchWylie an array which will not be touched if this is +* successful; however, if there is no THDL Extended Wylie +* corresponding to these glyphs, then noSuchWylie[0] will be set to +* true +* @param warnings either null or a buffer to which will be appended +* warnings about illegal tsheg bars +* @return the Extended Wylie corresponding to these glyphs, or null */ + public static String getWylieImplementation(DuffCode[] dcs, + boolean noSuchWylie[], + StringBuffer warnings) { + if (dcs.length == 0) + return null; + + ArrayList glyphList = new ArrayList(); + StringBuffer wylieBuffer = new StringBuffer(); + + for (int i=0; i 0 || !glyphList.isEmpty()) { - String thisPart; - if (needsVowel) - thisPart = withA(glyphList, noSuchWylie); - else - thisPart = withoutA(glyphList, false, noSuchWylie); - wylieBuffer.append(thisPart); - + if (!glyphList.isEmpty()) { + getTshegBarWylie(glyphList, noSuchWylie, + warnings, wylieBuffer); glyphList.clear(); - needsVowel = true; - isLastVowel = false; + if (null != warnings) + warnings.append("Some glyphs came right before a newline; they did not have a tsheg or shad come first."); } wylieBuffer.append(ch); } else { - wylie = TibetanMachineWeb.getWylieForGlyph(dcs[i], noSuchWylie); - - boolean containsBindu = false; - if (wylie.length() > 1 && wylie.charAt(wylie.length()-1) == BINDU) { - char[] cArray = wylie.toCharArray(); - wylie = new String(cArray, 0, wylie.length()-1); - containsBindu = true; - } - - process_block: { - if (TibetanMachineWeb.isWyliePunc(wylie)) { - isLastVowel = false; - - if (glyphList.isEmpty()) { - wylieBuffer.append(wylie); - } else { - String thisPart; - if 
(needsVowel) - thisPart = withA(glyphList, noSuchWylie); - else - thisPart = withoutA(glyphList, false, noSuchWylie); - wylieBuffer.append(thisPart); - - wylieBuffer.append(wylie); //append the punctuation - - glyphList.clear(); - } - needsVowel = true; //next consonants are syllable onset, so we are awaiting vowel - } else if (TibetanMachineWeb.isWylieChar(wylie)) { - //isChar must come before isVowel because ACHEN has priority over WYLIE_aVOWEL - isLastVowel = false; - glyphList.add(dcs[i]); - } else if (TibetanMachineWeb.isWylieVowel(wylie)) { - if (isLastVowel) { - int len = wylieBuffer.length(); - int A_len = A_VOWEL.length(); - - if (wylieBuffer.substring(len-A_len).equals(A_VOWEL)) { - try { - if (wylie.equals(i_VOWEL)) { - wylieBuffer.delete(len-A_len, len); - wylieBuffer.append(I_VOWEL); - isLastVowel = false; - break process_block; - } else if (wylie.equals(reverse_i_VOWEL)) { - wylieBuffer.delete(len-A_len, len); - wylieBuffer.append(reverse_I_VOWEL); - isLastVowel = false; - break process_block; - } - } - catch (StringIndexOutOfBoundsException se) { - ThdlDebug.noteIffyCode(); - } - - wylieBuffer.append(wylie); //append current vowel - isLastVowel = false; - } else - wylieBuffer.append(wylie); //append current vowel - } else { - int glyphCount = glyphList.size(); - boolean insertDisAmbig = false; - - if (0 != glyphCount) { - DuffCode top_dc = (DuffCode)glyphList.get(glyphCount-1); - String top_wylie = TibetanMachineWeb.getWylieForGlyph(top_dc, noSuchWylie); - - if (top_wylie.equals(ACHEN)) { - glyphList.remove(glyphCount-1); - - if (glyphCount-1 == 0) { - top_dc = null; - } else { - insertDisAmbig = true; - top_dc = (DuffCode)glyphList.get(glyphCount-2); - } - } - - if (top_dc == null || !TibetanMachineWeb.getWylieForGlyph(top_dc, noSuchWylie).equals(ACHUNG)) { - String thisPart = withoutA(glyphList, true, noSuchWylie); - wylieBuffer.append(thisPart); //append consonants in glyphList - } else { - glyphCount = glyphList.size(); - 
glyphList.remove(glyphCount-1); - - if (glyphCount-1 != 0) { - String thisPart = withA(glyphList, noSuchWylie); - wylieBuffer.append(thisPart); - } - - wylieBuffer.append(ACHUNG); - } - } - - if (insertDisAmbig) - wylieBuffer.append(WYLIE_DISAMBIGUATING_KEY); - - wylieBuffer.append(wylie); //append vowel - - glyphList.clear(); - isLastVowel = true; - needsVowel = false; - } - } else { //must be a stack - isLastVowel = false; - glyphList.add(dcs[i]); + String wylie = TibetanMachineWeb.getWylieForGlyph(dcs[i], noSuchWylie); + if (TibetanMachineWeb.isWyliePunc(wylie) + && !TibetanMachineWeb.isWylieAdornment(wylie)) { + if (!glyphList.isEmpty()) { + getTshegBarWylie(glyphList, noSuchWylie, + warnings, wylieBuffer); + glyphList.clear(); } - } - - if (containsBindu) { - isLastVowel = false; - wylieBuffer.append(withoutA(glyphList, false, noSuchWylie)); - wylieBuffer.append(BINDU); //append the bindu - glyphList.clear(); + wylieBuffer.append(wylie); //append the punctuation + } else { + glyphList.add(dcs[i]); } } } - //replace TMW with Wylie + // replace remaining TMW with Wylie if (!glyphList.isEmpty()) { - String thisPart; - if (needsVowel) - thisPart = withA(glyphList, noSuchWylie); - else - thisPart = withoutA(glyphList, false, noSuchWylie); - wylieBuffer.append(thisPart); + getTshegBarWylie(glyphList, noSuchWylie, warnings, wylieBuffer); + // glyphList.clear() if we weren't about to exit... + if (null != warnings) + warnings.append("The stretch of Tibetan ended without final punctuation."); } if (wylieBuffer.length() > 0) return wylieBuffer.toString(); else return null; - } + } +} + +/** An ordered pair consisting of a Tibetan grapheme cluster's {@link + org.thdl.tib.text.tshegbar#UnicodeGraphemeCluster see + UnicodeGraphemeCluster for a definition of the term} + classification and its context-insensitive THDL Extended Wylie + representation. 
*/ +class TGCPair { + static final int OTHER = 1; + // a standalone achen would fall into this category: + static final int CONSONANTAL_WITHOUT_VOWEL = 2; + static final int CONSONANTAL_WITH_VOWEL = 3; + static final int LONE_VOWEL = 4; + static final int SANSKRIT_WITHOUT_VOWEL = 5; + static final int SANSKRIT_WITH_VOWEL = 6; + + String wylie; + int classification; + TGCPair(String wylie, int classification) { + this.wylie = wylie; + this.classification = classification; + } + public String toString() { + return ""; + } } diff --git a/source/org/thdl/tib/text/TibetanMachineWeb.java b/source/org/thdl/tib/text/TibetanMachineWeb.java index 0914265..5cbf04b 100644 --- a/source/org/thdl/tib/text/TibetanMachineWeb.java +++ b/source/org/thdl/tib/text/TibetanMachineWeb.java @@ -60,6 +60,9 @@ public class TibetanMachineWeb implements THDLWylieConstants { private static TibetanKeyboard keyboard = null; private static Set charSet = null; + private static Set tibSet = null; + private static Set sanskritStackSet = null; + private static Set numberSet = null; private static Set vowelSet = null; private static Set puncSet = null; private static Set topSet = null; @@ -346,26 +349,64 @@ public class TibetanMachineWeb implements THDLWylieConstants { } String line; boolean hashOn = false; - boolean isSanskrit = false; //FIXME: this is never read. + + // is this a Tibetan consonant or consonant stack? + boolean isTibetan = false; + + // is this a Sanskrit consonant stack? 
+ boolean isSanskrit = false; + boolean ignore = false; + tibSet = new HashSet(); + sanskritStackSet = new HashSet(); + while ((line = in.readLine()) != null) { if (line.startsWith("")) { isSanskrit = false; + isTibetan = true; hashOn = false; + ignore = false; line = in.readLine(); - charSet = new HashSet(); + if (null == charSet) charSet = new HashSet(); StringTokenizer st = new StringTokenizer(line,","); while (st.hasMoreTokens()) { String ntk; charSet.add(ntk = st.nextToken()); + tibSet.add(ntk); validInputSequences.put(ntk, anyOldObjectWillDo); } } + else if (line.equalsIgnoreCase("")) { + // FIXME: for historical reasons, numbers go + // in both charSet and numberSet. + isSanskrit = false; + isTibetan = false; + hashOn = false; + ignore = false; + line = in.readLine(); + if (null == charSet) charSet = new HashSet(); + numberSet = new HashSet(); + StringTokenizer st = new StringTokenizer(line,","); + while (st.hasMoreTokens()) { + String ntk; + // DLC FIXME: don't add it to numberSet + // and charSet here; do it in + // so that Jskad has the + // same TMW->Wylie conversion regardless + // of whether or not it chooses to support + // inputting numbers. 
+ numberSet.add(ntk = st.nextToken()); + charSet.add(ntk); + validInputSequences.put(ntk, anyOldObjectWillDo); + } + } else if (line.equalsIgnoreCase("")) { isSanskrit = false; + isTibetan = false; hashOn = false; + ignore = false; line = in.readLine(); vowelSet = new HashSet(); StringTokenizer st = new StringTokenizer(line,","); @@ -377,7 +418,9 @@ public class TibetanMachineWeb implements THDLWylieConstants { } else if (line.equalsIgnoreCase("")) { isSanskrit = false; + isTibetan = false; hashOn = false; + ignore = false; line = in.readLine(); puncSet = new HashSet(); StringTokenizer st = new StringTokenizer(line,","); @@ -389,29 +432,47 @@ public class TibetanMachineWeb implements THDLWylieConstants { } else if (line.equalsIgnoreCase("") - || line.equalsIgnoreCase("") - || line.equalsIgnoreCase("")) { + || line.equalsIgnoreCase("")) { isSanskrit = false; + isTibetan = false; + hashOn = true; + ignore = false; + } + else if (line.equalsIgnoreCase("")) { + isSanskrit = false; + isTibetan = true; + hashOn = true; + ignore = false; + } + else if (line.equalsIgnoreCase("")) { + isSanskrit = false; + isTibetan = false; hashOn = true; ignore = false; } else if (line.equalsIgnoreCase("")) { isSanskrit = true; + isTibetan = false; hashOn = true; ignore = false; } else if (line.equalsIgnoreCase("")) { isSanskrit = false; + isTibetan = false; hashOn = false; ignore = false; } - else if (line.equalsIgnoreCase("")) + else if (line.equalsIgnoreCase("")) { + isSanskrit = false; ignore = true; + } } - else if (line.startsWith("//")) //comment + else if (line.startsWith("//")) { //comment ; - else if (line.equals("")) //empty string + } + else if (line.equals("")) {//empty string ; + } else { StringTokenizer st = new StringTokenizer(line,DELIMITER,true); @@ -559,6 +620,21 @@ public class TibetanMachineWeb implements THDLWylieConstants { if (hashOn) { tibHash.put(wylie, duffCodes); } + if (isTibetan) { + // Delete the dashes: + StringBuffer wylieWithoutDashes = new 
StringBuffer(wylie); + for (int wl = 0; wl < wylieWithoutDashes.length(); wl++) { + if (wylieWithoutDashes.charAt(wl) == '-') { + wylieWithoutDashes.deleteCharAt(wl); + --wl; + } + } + tibSet.add(wylieWithoutDashes.toString()); + } + + if (isSanskrit) { + sanskritStackSet.add(wylie); + } if (null == duffCodes[TMW]) throw new Error(fileName @@ -726,13 +802,13 @@ public static boolean isFormatting(char c) { } /** -* Checks to see if the passed string -* is a character in the installed keyboard. +* Checks to see if the passed string is a character (a single +* [possibly Sanskrit or va or fa] consonant or a number [possibly +* super- or subscribed]) in the installed keyboard. * * @param s the string you want to check -* @return true if s is a character in the current keyboard, -* false if not -*/ +* @return true if s is a character in the current keyboard, false if +* not */ public static boolean isChar(String s) { if (currentKeyboardIsExtendedWylie()) return charSet.contains(s); @@ -741,16 +817,58 @@ public static boolean isChar(String s) { } /** -* Checks to see if the passed string -* is a character in Extended Wylie. +* Checks to see if the passed string is a character (a single +* [possibly Sanskrit or va or fa] consonant or a number [possibly +* super- or subscribed]) in Extended Wylie. * @param s the string to be checked -* @return true if s is a character in -* Extended Wylie transliteration, false if not -*/ +* @return true if s is a character in Extended Wylie transliteration, +* false if not */ public static boolean isWylieChar(String s) { return charSet.contains(s); } + +/** +* Checks to see if the passed string is a consonant or unadorned +* consonant stack in Extended Wylie. 
+* @param s the string to be checked +* @return true if s is such in Extended Wylie transliteration, false +* if not */ +public static boolean isWylieTibetanConsonantOrConsonantStack(String s) { + return tibSet.contains(s); +} + +/** +* Returns true if and only if s is the THDL Extended Wylie for a +* Sanskrit multi-consonant stack. +*/ +public static boolean isWylieSanskritConsonantStack(String s) { + return sanskritStackSet.contains(s); +} + +/** Returns true if and only if s is the THDL Extended Wylie + representation of a legal tsheg-bar appendage 'i, 'e, 'u, 'o, 'am, + or 'ang. The word le'u (chapter) contains such an appendage, + e.g. */ +public static boolean isWylieAchungAppendage(String s) { + return (s.equals("'e") + || s.equals("'i") + || s.equals("'o") + || s.equals("'u") + || s.equals("'ang") + || s.equals("'am")); +} + +/** +* Checks to see if the passed string is a number [possibly super- or +* subscribed]) in Extended Wylie. +* @param s the string to be checked +* @return true if s is a number in Extended Wylie transliteration, +* false if not */ +public static boolean isWylieNumber(String s) { + return numberSet.contains(s); +} + /** * Checks to see if the passed string * is punctuation in the installed keyboard. @@ -826,6 +944,32 @@ public static boolean isWylieVowel(String s) { return vowelSet.contains(s); } +/** Returns true if and only if wylie is the THDL Extended Wylie for + an adornment. An adornment is something that is part of a stack + but is not a consonant, such as a Tibetan or Sanskrit vowel or a + bindu. Note that an adornment might be both an adornment and a + vowel, or an adornment and punctuation. 
*/ +public static boolean isWylieAdornment(String wylie) { + return (vowelSet.contains(wylie) + || (wylie.equals("M") /* U+0F7E */ + || wylie.equals("M^") /* U+0F83 */ + || wylie.equals("iM") + || wylie.equals("-iM") + || wylie.equals("eM") + || wylie.equals("aiM") + || wylie.equals("oM") + || wylie.equals("auM"))); +} + +/** Returns true if and only if wylie is the THDL Extended Wylie for + an adornment {@link #isWylieAdornment(String)} that contains a + vowel within it. */ +public static boolean isWylieAdornmentAndContainsVowel(String wylie) { + return (isWylieAdornment(wylie) && + !wylie.equals("M") /* U+0F7E */ + && !wylie.equals("M^") /* U+0F83 */); +} + /** * Returns true iff this Wylie is valid as a leftmost character in a * Tibetan syllable. For example, in the syllable 'brgyad', 'b' is the @@ -839,9 +983,9 @@ public static boolean isWylieLeft(String s) { } /** -* Returns true iff this Wylie is valid as a right (post-vowel) -* character in a Tibetan syllable. For example, in the syllable -* 'lags', 'g' is in the right character position. Valid right +* Returns true iff this Wylie is valid as a suffix (i.e., a right +* (post-vowel) character) in a Tibetan syllable. For example, in the +* syllable 'lags', 'g' is in the right character position. Valid right * characters include g, ng, d, n, b, m, r, l, s, ', and T. * @param s the (Wylie) string to be checked * @return true if s is a possible right character in a Tibetan diff --git a/source/org/thdl/tib/text/tibwn.ini b/source/org/thdl/tib/text/tibwn.ini index eda2b1d..064eb9f 100644 --- a/source/org/thdl/tib/text/tibwn.ini +++ b/source/org/thdl/tib/text/tibwn.ini @@ -7,22 +7,27 @@ // - marks a command // - the commands are: // Consonants - set of consonants in tibetan +// Numbers - set of numbers in tibetan // Vowels - set of vowels -// Other - other characters: numbers, punctuation, etc. +// Other - other characters: punctuation, etc. 
// Input - those codes which serve basis for wylie input method -// subtypes: Input:Punctuation, Input:Vowels, Input:Tibetan, Input:Sanskrit +// subtypes: Input:Punctuation, Input:Vowels, Input:Tibetan, +// Input:Numbers, Input:Sanskrit // ToWylie - codes only needed for duff to wylie conversion, including vowels // Ignore - ignore until another command is reached -k,kh,g,ng,c,ch,j,ny,t,th,d,n,p,ph,b,m,ts,tsh,dz,w,zh,z,',y,r,l,sh,s,h,a,T,Th,D,N,Sh,v,f,Dz,0,1,2,3,4,5,6,7,8,9,>0,>1,>2,>3,>4,>5,>6,>7,>8,>9,<0,<1,<2,<3,<4,<5,<6,<7,<8,<9 +k,kh,g,ng,c,ch,j,ny,t,th,d,n,p,ph,b,m,ts,tsh,dz,w,zh,z,',y,r,l,sh,s,h,a,T,Th,D,N,Sh,v,f,Dz + + +0,1,2,3,4,5,6,7,8,9,>0,>1,>2,>3,>4,>5,>6,>7,>8,>9,<0,<1,<2,<3,<4,<5,<6,<7,<8,<9 a,i,u,e,o,I,U,ai,au,A,-i,-I -_, ,/,|,!,:,;,@,#,$,%,(,),H,M,`,&,@#,?,=,[,],<,>,{,},* -// FIXME: add these etc.: M^,~,~^ +_, ,/,|,!,:,;,@,#,$,%,(,),H,M,`,&,@#,?,=,[,],{,},* +// FIXME: add these etc.: M^,~,~^,<,> (< and > cause ka<7 to quit working) //_~32,1~0,32 @@ -691,6 +696,8 @@ a+y~143,4~~8,63~1,109~8,120~1,123~1,125~8,106~8,113~f68,fb1 a+r~144,4~~8,64~1,109~8,120~1,123~1,125~8,106~8,113~f68,fb2 a+r+y~145,4~~8,65~1,109~8,121~1,123~1,125~8,107~8,114~f68,fb2,fb1 + + //numbers 0~190,1~~10,48~~~~~~~0F20 1~191,1~~10,49~~~~~~~0F21