/* The contents of this file are subject to the THDL Open Community License Version 1.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License on the THDL web site (http://www.thdl.org/). Software distributed under the License is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for the specific terms governing rights and limitations under the License. The Initial Developer of this software is the Tibetan and Himalayan Digital Library (THDL). Portions created by the THDL are Copyright 2001-2003 THDL. All Rights Reserved. Contributor(s): ______________________________________. */ package org.thdl.tib.text; import java.util.*; import javax.swing.*; import javax.swing.text.*; import javax.swing.text.rtf.RTFEditorKit; import java.io.*; import org.thdl.util.ThdlDebug; import org.thdl.tib.text.tshegbar.LegalTshegBar; import org.thdl.tib.text.tshegbar.UnicodeConstants; import org.thdl.tib.text.tshegbar.UnicodeUtils; /** * Provides methods for converting back and forth between Extended * Wylie and Tibetan represented in TibetanMachineWeb glyphs. This * class is not instantiable. * *

* The class provides a variety of static methods for converting * back and forth between Extended Wylie and TibetanMachineWeb. The * Wylie can be accessed as a String, while the TibetanMachineWeb can * be exported as Rich Text Format. * * @author Edward Garrett, Tibetan and Himalayan Digital Library */ public class TibTextUtils implements THDLWylieConstants { /** Change to true to see various things on System.out and System.err. */ private static final boolean debug = false; /** Do not use this contructor. */ private TibTextUtils() { super(); } /** * Converts a list of glyphs into an array of {@link DuffData DuffData}. * The motivation for this is that most processes - for example using * TibetanMachineWeb in HTML - only need to know what * text to output, and when to change fonts. In general, they don't * need to have an explicit indication for each glyph of the font * for that glyph. * @param glyphs the list of TibetanMachineWeb glyphs * you want to convert * @return an array of DuffData corresponding to this * list of glyphs */ public static DuffData[] convertGlyphs(List glyphs) { if (glyphs.size() == 0) return null; List data = new ArrayList(); StringBuffer sb = new StringBuffer(); Iterator iter = glyphs.iterator(); DuffCode dc = (DuffCode)iter.next(); int lastfont = dc.getFontNum(); sb.append(dc.getCharacter()); while (iter.hasNext()) { dc = (DuffCode)iter.next(); if (dc.getFontNum() == lastfont) sb.append(dc.getCharacter()); else { data.add(new DuffData(sb.toString(), lastfont)); lastfont = dc.getFontNum(); sb = new StringBuffer(); sb.append(dc.getCharacter()); } } data.add(new DuffData(sb.toString(), lastfont)); DuffData[] dd = new DuffData[0]; dd = (DuffData[])data.toArray(dd); return dd; } /** * Figures out how to arrange a list of characters into glyphs. For * example, if the user types 'bsgr' using the Extended Wylie keyboard, * this method figures out that this should be represented as a 'b' * glyph followed by a 's-g-r' glyph. If you know that the characters * do not contain Sanskrit stacks, or do not contain Tibetan stacks, * then you can specify this to speed the process up. Otherwise, the * method will first check to see if the characters correspond to any * Tibetan stacks, and if not, then it will check for Sanskrit stacks. * @param chars the list of Tibetan characters you want to find glyphs * for * @param areStacksOnRight whether stacking should try to maximize from * right to left (true) or from left to right (false). In the Extended * Wylie keyboard, you try to stack from right to left. Thus, the * character sequence r-g-r would be stacked as r followed by gr, * rather than rg followed by r. In the Sambhota and TCC keyboards, the * stack direction is reversed. * @param definitelyTibetan should be true if the characters are known * to be Tibetan and not Sanskrit * @param definitelySanskrit should be true if the characters are known * to be Sanskrit and not Tibetan */ public static List getGlyphs(List chars, boolean areStacksOnRight, boolean definitelyTibetan, boolean definitelySanskrit) { StringBuffer tibBuffer, sanBuffer; String tibCluster, sanCluster; boolean checkTibetan, checkSanskrit; if (!(definitelyTibetan || definitelySanskrit)) { checkTibetan = true; checkSanskrit = true; } else { checkTibetan = definitelyTibetan; checkSanskrit = definitelySanskrit; } int length = chars.size(); List glyphs = new ArrayList(); glyphs.clear(); if (areStacksOnRight) { for (int i=0; i-1; i--) { tibBuffer = new StringBuffer(); tibCluster = null; sanBuffer = new StringBuffer(); sanCluster = null; Iterator iter = chars.iterator(); for (int k=0; k 1) { dc = (DuffCode)glyphs.get(glyphs.size()-1); if (!TibetanMachineWeb.isWyliePunc(TibetanMachineWeb.getWylieForGlyph(dc, weDoNotCareIfThereIsCorrespondingWylieOrNot))) { DuffCode dc_2 = (DuffCode)glyphs.removeLast(); DuffCode dc_1 = (DuffCode)glyphs.removeLast(); glyphs.addAll(getVowel(dc_1, dc_2, next)); break vowel_block; } } DuffCode[] dc_array = (DuffCode[])TibetanMachineWeb.getTibHash().get(ACHEN); dc = dc_array[TibetanMachineWeb.TMW]; glyphs.addAll(getVowel(dc, next)); } chars.clear(); } isSanskrit = false; } else if (TibetanMachineWeb.isWylieChar(next)) { if (!isSanskrit) //add char to list - it is not sanskrit chars.add(next); else if (wasLastSanskritStackingKey) { //add char to list - it is still part of sanskrit stack chars.add(next); wasLastSanskritStackingKey = false; } else { //char is no longer part of sanskrit stack, therefore compute and add previous stack glyphs.addAll(getGlyphs(chars, true, !isSanskrit, isSanskrit)); chars.clear(); chars.add(next); isSanskrit = false; wasLastSanskritStackingKey = false; } } else if (next.equals(String.valueOf(WYLIE_DISAMBIGUATING_KEY))) { if (!chars.isEmpty()) glyphs.addAll(getGlyphs(chars, true, !isSanskrit, isSanskrit)); chars.clear(); isSanskrit = false; } else if (next.equals(String.valueOf(WYLIE_SANSKRIT_STACKING_KEY))) { if (!isSanskrit) { //begin sanskrit stack switch (chars.size()) { case 0: break; //'+' is not "pre-stacking" key case 1: isSanskrit = true; wasLastSanskritStackingKey = true; break; default: String top_char = (String)chars.get(chars.size()-1); chars.remove(chars.size()-1); glyphs.addAll(getGlyphs(chars, true, !isSanskrit, isSanskrit)); chars.clear(); chars.add(top_char); isSanskrit = true; wasLastSanskritStackingKey = true; break; } } } else if (TibetanMachineWeb.isFormatting(next.charAt(0))) { if (!chars.isEmpty()) glyphs.addAll(getGlyphs(chars, true, !isSanskrit, isSanskrit)); dc = new DuffCode(1,next.charAt(0)); glyphs.add(dc); chars.clear(); isSanskrit = false; } if (next != null) start += next.length(); } if (!chars.isEmpty()) { glyphs.addAll(getGlyphs(chars, true, !isSanskrit, isSanskrit)); chars.clear(); } DuffData[] dd = convertGlyphs(glyphs); return dd; } /** * Gets the bindu sequence for a given context. * In the TibetanMachineWeb fonts, bindu (anusvara) is realized * differently depending on which vowel it attaches to. Although * the default bindu glyph is affixed to consonants and subscript vowels, * for superscript vowels (i, e, o, etc), there is a single glyph * which merges the bindu and that vowel together. When you pass this * method a glyph context, it will return a List of glyphs which * will either consist of the original glyph followed by the default * bindu glyph, or a composite vowel+bindu glyph. * Note that there is only one glyph in the context. This means that * bindus will not affix properly if superscript vowels are allowed to directly * precede subscript vowels (e.g. pou). * @param dc the DuffCode of the glyph you * want to attach a bindu to * @return a List of DuffCode glyphs that include the * original dc, as well as a bindu */ public static List getBindu(DuffCode dc) { List bindus = new ArrayList(); if (null == dc) { bindus.add(TibetanMachineWeb.getGlyph(String.valueOf(BINDU))); return bindus; } if (!TibetanMachineWeb.getBinduMap().containsKey(dc)) { bindus.add(dc); bindus.add(TibetanMachineWeb.getGlyph(String.valueOf(BINDU))); return bindus; } bindus.add((DuffCode)TibetanMachineWeb.getBinduMap().get(dc)); return bindus; } /** * Gets the vowel sequence for a given vowel in a given context. * Given a context, this method affixes a vowel and returns the * context plus the vowel. Generally, it is enough to provide just * one glyph for context. * @param context the glyph preceding the vowel you want to affix * @param vowel the vowel you want to affix, in Wylie * @return a List of glyphs equal to the vowel in context */ public static List getVowel(DuffCode context, String vowel) { return getVowel(null, context, vowel); } /** * Gets the vowel sequence for a given vowel in a given context. * Given a context, this method affixes a vowel and returns the context plus the vowel. * Since the choice of vowel glyph depends on the consonant to which it is attached, * generally it is enough to provide just the immediately preceding context. However, * in some cases, double vowels are allowed - for example 'buo'. To find the correct * glyph for 'o', we need 'b' in this case, not 'u'. Note also that some Extended * Wylie vowels correspond to multiple glyphs in TibetanMachineWeb. For example, * the vowel I consists of both an achung and a reverse gigu. All required glyphs * are part of the returned List. * @param context_1 the glyph occurring two glyphs before the vowel you want to affix * @param context_2 the glyph immediately before the vowel you want to affix * @param vowel the vowel you want to affix, in Wylie * @return a List of glyphs equal to the vowel in context */ public static List getVowel(DuffCode context_1, DuffCode context_2, String vowel) { List vowels = new ArrayList(); //this vowel doesn't correspond to a glyph - //so you just return the original context if ( vowel.equals(WYLIE_aVOWEL) || TibetanMachineWeb.isTopVowel(context_2)) { if (context_1 != null) vowels.add(context_1); vowels.add(context_2); return vowels; } //first, the three easiest cases: ai, au, and 0) System.out.println("DEBUG: warnings in TMW->Wylie: " + warnings); return ans; } /** True for and only for ma and nga because 'am and 'ang are appendages. */ private static final boolean isAppendageNonVowelWylie(String wylie) { return (MA.equals(wylie) || NGA.equals(wylie)); } // DLC FIXME: {H}, U+0F7F, is part of a grapheme cluster! // David Chapman and I both need a comprehensive list of these // guys. /** Scans the glyphs in glyphList and creates the returned list of grapheme clusters based on them. A grapheme cluster is a consonant or consonant stack with optional adornment or a number (possibly super- or subscribed) or some other glyph alone. */ private static TGCList breakTshegBarIntoGraphemeClusters(java.util.List glyphList, boolean noSuchWylie[]) { // Definition: adornment means vowels and achungs and bindus. // It should be this, though (FIXME): any combining // characters. int sz = glyphList.size(); ThdlDebug.verify(sz > 0); // A list of grapheme clusters (see UnicodeGraphemeCluster). // sz is an overestimate (speeds us up, wastes some memory). TMWGCList gcs = new TMWGCList(sz); StringBuffer buildingUpVowel = new StringBuffer(); // for {cui}, we append to this guy twice. String nonVowelWylie = null; // for the "c" in {cui} int pairType = TGCPair.TYPE_OTHER; for (int i = 0; i < sz; i++) { DuffCode dc = (DuffCode)glyphList.get(i); String wylie = TibetanMachineWeb.getWylieForGlyph(dc, noSuchWylie); boolean buildingUpSanskritNext = false; if ((buildingUpSanskritNext = TibetanMachineWeb.isWylieSanskritConsonantStack(wylie)) || TibetanMachineWeb.isWylieTibetanConsonantOrConsonantStack(wylie)) { if (buildingUpVowel.length() > 0 || null != nonVowelWylie) { gcs.add(new TGCPair(nonVowelWylie, buildingUpVowel.toString(), pairType)); buildingUpVowel.delete(0, buildingUpVowel.length()); } // We want {p-y}, not {py}. nonVowelWylie = TibetanMachineWeb.getHashKeyForGlyph(dc.getFontNum(), dc.getCharNum()); pairType = (buildingUpSanskritNext ? TGCPair.TYPE_SANSKRIT : TGCPair.TYPE_TIBETAN); } else if (TibetanMachineWeb.isWylieAdornmentAndContainsVowel(wylie) || TibetanMachineWeb.isWylieAdornment(wylie)) { buildingUpVowel.append(wylie); } else { // number or weird thing: if (buildingUpVowel.length() > 0 || null != nonVowelWylie) { gcs.add(new TGCPair(nonVowelWylie, buildingUpVowel.toString(), pairType)); buildingUpVowel.delete(0, buildingUpVowel.length()); nonVowelWylie = null; } gcs.add(new TGCPair(wylie, null, TGCPair.TYPE_OTHER)); pairType = TGCPair.TYPE_OTHER; } } if (buildingUpVowel.length() > 0 || null != nonVowelWylie) { gcs.add(new TGCPair(nonVowelWylie, buildingUpVowel.toString(), pairType)); } return gcs; } /** Returns a string that classifies gcs as a legal Tibetan tsheg * bar, a single Sanskrit grapheme cluster * ("single-sanskrit-gc"), or invalid ("invalid"). If * noPrefixTests is true, then ggyi will be seen as a * "prefix-root", even though gya doesn't take a ga prefix. */ public static String getClassificationOfTshegBar(TGCList gcs, // DLC the warnings are Wylie-specific StringBuffer warnings, boolean noPrefixTests) { String candidateType = null; // Now that we have grapheme clusters, see if they match any // of the "legal tsheg bars": int sz = gcs.size(); if (sz == 1) { TGCPair tp = gcs.get(0); int cls = tp.classification; if (TGCPair.SANSKRIT_WITHOUT_VOWEL == cls || TGCPair.SANSKRIT_WITH_VOWEL == cls) return "single-sanskrit-gc"; } TGCPair lastPair = null; for (int i = 0; i < sz; i++) { TGCPair tp = gcs.get(i); int cls = tp.classification; String wylie = tp.getWylie(); if (TGCPair.OTHER == cls) { if (TibetanMachineWeb.isWylieNumber(wylie)) { if (null == candidateType) { candidateType = "number"; } else { if ("number" != candidateType) { if (null != warnings) warnings.append("Found something odd; the wylie is " + wylie + "\n"); candidateType = "invalid"; break; } } } else { if (null != warnings) warnings.append("Found something odd; the wylie is " + wylie + "\n"); candidateType = "invalid"; break; } } else if (TGCPair.SANSKRIT_WITHOUT_VOWEL == cls || TGCPair.SANSKRIT_WITH_VOWEL == cls) { candidateType = "invalid"; break; } else if (TGCPair.CONSONANTAL_WITHOUT_VOWEL == cls || TGCPair.CONSONANTAL_WITH_VOWEL == cls) { if (null == candidateType) { if (TibetanMachineWeb.isWylieLeft(wylie)) { candidateType = "prefix/root"; } else { candidateType = "root"; } } else { if ("prefix/root" == candidateType) { if (ACHUNG.equals(wylie)) { // peek ahead to distinguish between ba's, // ba'ala and ba'am: TGCPair nexttp = (i+1 < sz) ? gcs.get(i+1) : null; String nextwylie = (nexttp == null) ? "" : nexttp.getWylie(); if (isAppendageNonVowelWylie(nextwylie)) { candidateType = "maybe-appendaged-prefix/root"; } else { if (noPrefixTests || isLegalPrefixRootCombo(lastPair.getConsonantWylie(), tp.getConsonantWylie())) candidateType = "prefix/root-root/suffix"; else candidateType = "root-suffix"; } } else if (TibetanMachineWeb.isWylieRight(wylie)) { if (noPrefixTests || isLegalPrefixRootCombo(lastPair.getConsonantWylie(), tp.getConsonantWylie())) candidateType = "prefix/root-root/suffix"; else candidateType = "root-suffix"; } else if (TibetanMachineWeb.isWylieAchungAppendage(wylie)) { candidateType = "appendaged-prefix/root"; } else { if (noPrefixTests || isLegalPrefixRootCombo(lastPair.getConsonantWylie(), tp.getConsonantWylie())) candidateType = "prefix-root"; else { if (null != warnings) warnings.append("Found what would be a prefix-root combo, but the root stack with wylie " + wylie + " does not take the prefix with wylie " + lastPair.getConsonantWylie()); candidateType = "invalid"; break; } } } else if ("root" == candidateType) { if (ACHUNG.equals(wylie)) { // peek ahead to distinguish between pa's, // pa'ala and pa'am: TGCPair nexttp = (i+1 < sz) ? gcs.get(i+1) : null; String nextwylie = (nexttp == null) ? "" : nexttp.getWylie(); if (isAppendageNonVowelWylie(nextwylie)) { candidateType = "maybe-appendaged-root"; } else { candidateType = "root-suffix"; } } else if (TibetanMachineWeb.isWylieRight(wylie)) { candidateType = "root-suffix"; } else if (TibetanMachineWeb.isWylieAchungAppendage(wylie)) { candidateType = "appendaged-root"; } else { if (null != warnings) warnings.append("Found a non-prefix consonant or consonant stack followed by a consonant or consonant stack that is not simply a suffix; that thing's wylie is " + wylie + "\n"); candidateType = "invalid"; break; } } else if ("prefix-root" == candidateType) { if (ACHUNG.equals(wylie)) { // peek ahead to distinguish between bpa's, // bpa'ala and bpa'am: TGCPair nexttp = (i+1 < sz) ? gcs.get(i+1) : null; String nextwylie = (nexttp == null) ? "" : nexttp.getWylie(); if (isAppendageNonVowelWylie(nextwylie)) { candidateType = "maybe-appendaged-prefix-root"; } else { candidateType = "prefix-root-suffix"; } } else if (TibetanMachineWeb.isWylieRight(wylie)) { candidateType = "prefix-root-suffix"; } else if (TibetanMachineWeb.isWylieAchungAppendage(wylie)) { candidateType = "appendaged-prefix-root"; } else { if (null != warnings) warnings.append("Found a prefix plus a root stack plus a non-suffix consonant or consonant stack whose wylie is " + wylie + "\n"); candidateType = "invalid"; break; } } else if ("prefix/root-root/suffix" == candidateType) { // this has no peekahead, gag'am works. if (ACHUNG.equals(wylie)) { // peek ahead to distinguish between // gga'am and gaga'ala: TGCPair nexttp = (i+1 < sz) ? gcs.get(i+1) : null; String nextwylie = (nexttp == null) ? "" : nexttp.getWylie(); if (isAppendageNonVowelWylie(nextwylie)) { candidateType = "maybe-appendaged-prefix/root-root/suffix"; } else { candidateType = "prefix-root-suffix"; } } else if (TibetanMachineWeb.isWylieFarRight(wylie)) { candidateType = "prefix/root-root/suffix-suffix/postsuffix"; } else if (TibetanMachineWeb.isWylieRight(wylie)) { candidateType = "prefix-root-suffix"; } else if (TibetanMachineWeb.isWylieAchungAppendage(wylie)) { candidateType = "appendaged-prefix/root-root/suffix"; } else { if (null != warnings) warnings.append("Found a prefix/root stack plus a suffix/root stack plus a non-suffix, non-postsuffix consonant or consonant stack whose wylie is " + wylie + "\n"); candidateType = "invalid"; break; } } else if ("root-suffix" == candidateType) { // This has no peekahead w.r.t. 'am and 'ang, // but it needs none because we peeked to be // sure that this was root-suffix and not // maybe-appendaged-root. if (TibetanMachineWeb.isWylieFarRight(wylie)) { candidateType = "root-suffix-postsuffix"; } else if (TibetanMachineWeb.isWylieAchungAppendage(wylie)) { candidateType = "appendaged-root-suffix"; } else if (ACHUNG.equals(wylie)) { candidateType = "maybe-appendaged-root-suffix"; } else { if (null != warnings) warnings.append("Found a root stack plus a suffix plus a non-postsuffix consonant or consonant stack whose wylie is " + wylie + "\n"); candidateType = "invalid"; break; } } else if ("prefix/root-root/suffix-suffix/postsuffix" == candidateType || "prefix-root-suffix" == candidateType) { // this has no peekahead and needs none. if (TibetanMachineWeb.isWylieFarRight(wylie)) { candidateType = "prefix-root-suffix-postsuffix"; } else if (TibetanMachineWeb.isWylieAchungAppendage(wylie)) { // if we simply prepended to // candidateType, we wouldn't get interned // strings. candidateType = ("appendaged-" + candidateType).intern(); } else if (ACHUNG.equals(wylie)) { candidateType = ("maybe-appendaged-" + candidateType).intern(); } else { if (null != warnings) warnings.append("Found a prefix/root stack plus a suffix/root stack plus a suffix/postsuffix plus a non-postsuffix consonant or consonant stack whose wylie is " + wylie + "\n"); candidateType = "invalid"; break; } } else if ("prefix-root-suffix-postsuffix" == candidateType) { // this has no peekahead and needs none. if (TibetanMachineWeb.isWylieAchungAppendage(wylie)) { candidateType = "appendaged-prefix-root-suffix-postsuffix"; } else if (ACHUNG.equals(wylie)) { candidateType = "maybe-appendaged-prefix-root-suffix-postsuffix"; } else { if (null != warnings) warnings.append("Found a prefix plus root stack plus suffix plus postsuffix; then found yet another consonant or consonant stack whose wylie is " + wylie + "\n"); candidateType = "invalid"; break; } } else if ("root-suffix-postsuffix" == candidateType) { // this has no peekahead and needs none. if (TibetanMachineWeb.isWylieAchungAppendage(wylie)) { candidateType = "appendaged-root-suffix-postsuffix"; } else if (ACHUNG.equals(wylie)) { candidateType = "maybe-appendaged-root-suffix-postsuffix"; } else { if (null != warnings) warnings.append("Found a root stack plus suffix plus postsuffix; then found yet another consonant or consonant stack whose wylie is " + wylie + "\n"); candidateType = "invalid"; break; } } else if (candidateType.startsWith("maybe-appendaged-")) { if (isAppendageNonVowelWylie(wylie)) { candidateType = candidateType.substring("maybe-".length()).intern(); // So that we get 'am, not 'm; 'ang, not 'ng: // FIXME: cludge: weird place to do this. // pa'am, not pa'm is what we want, sure, // but doing this here is ugly. tp.setWylie(WYLIE_aVOWEL + tp.getWylie()); } else { if (null != warnings) warnings.append("Found a tsheg bar that has an achung (" + ACHUNG + ") tacked on, followed by some other thing whose wylie is " + wylie + "\n"); candidateType = "invalid"; break; } } else if (candidateType.startsWith("appendaged-")) { if (TibetanMachineWeb.isWylieAchungAppendage(wylie)) { // candidateType stays what it is. } else if (ACHUNG.equals(wylie)) { candidateType = ("maybe-" + candidateType).intern(); } else { if (null != warnings) warnings.append("Found a tsheg bar that has a 'i, 'e, 'o, 'u, or 'ang 'am appendage already and then found yet another consonant or consonant stack whose wylie is " + wylie + "\n"); candidateType = "invalid"; break; } } else { if ("invalid" == candidateType) throw new Error("forgot to break out of the for loop after diagnosing invalidity."); if ("number" != candidateType) throw new Error("missed a case; case is " + candidateType); if (null != warnings) warnings.append("Found a consonant or consonant stack after something odd; the consonantish thing has wylie " + wylie + "\n"); candidateType = "invalid"; break; } } } else if (TGCPair.LONE_VOWEL == cls) { if (null != warnings) warnings.append("Found a vowel that did not follow either a Tibetan consonant or consonant stack or another vowel."); candidateType = "invalid"; break; } else { throw new Error("bad cls"); } lastPair = tp; } if (candidateType.startsWith("maybe-appendaged-")) { if (null != warnings) warnings.append("Found a tsheg bar that has an extra achung (" + ACHUNG + ") tacked on\n"); candidateType = "invalid"; } return candidateType; } /** Appends to translitBuffer the EWTS/ACIP for the glyph list glyphList (which should be an ArrayList for speed). This will be very user-friendly for "legal tsheg bars" and will be valid, but possibly ugly (interspersed with disambiguators or extra vowels, etc.) Wylie/ACIP for other things, such as Sanskrit transliteration. Updates warnings and noSuch like the caller does.

What constitutes a legal, non-punctuation, non-whitespace tsheg bar? The following are the only such:

A "tyllable" is, by definition, one of the following:

When there are three unadorned consonant stacks in a tyllable, a hard-coded list of valid Tibetan tsheg bars is relied upon to determine if the 'a'/'A' vowel comes after the first or the second consonant.

*/ private static void getTshegBarTranslit(boolean EWTSNotACIP, java.util.List glyphList, boolean noSuch[], StringBuffer warnings, StringBuffer translitBuffer) { TGCList gcs = breakTshegBarIntoGraphemeClusters(glyphList, noSuch); String candidateType = getClassificationOfTshegBar(gcs, warnings, false); int sz = gcs.size(); if (candidateType == "invalid" || candidateType == "single-sanskrit-gc") { // Forget beauty and succintness -- just be sure to // generate transliteration that can be converted // unambiguously into Tibetan. Use a disambiguator or // vowel after each grapheme cluster. // // If we truly didn't care about beauty, we'd just lump // SANSKRIT_WITHOUT_VOWEL and SANSKRIT_WITH_VOWEL into // OTHER. for (int i = 0; i < sz; i++) { TGCPair tp = (TGCPair)gcs.get(i); int cls = tp.classification; String wylie = tp.getWylie(); String translit = (EWTSNotACIP) ? wylie : tp.getACIP(); translitBuffer.append(translit); if (TibetanMachineWeb.isWylieTibetanConsonantOrConsonantStack(wylie) || TibetanMachineWeb.isWylieSanskritConsonantStack(wylie)) { translitBuffer.append(aVowelToUseAfter(EWTSNotACIP, wylie)); } else { if (TGCPair.CONSONANTAL_WITH_VOWEL != cls && TGCPair.SANSKRIT_WITH_VOWEL != cls) translitBuffer.append(EWTSNotACIP ? WYLIE_DISAMBIGUATING_KEY : '-'); } } } else { // Generate perfect, beautiful transliteration, using the // minimum number of vowels and disambiguators. int leftover = sz + 1; // Appendaged vs. not appendaged? it affects nothing at // this stage. candidateType = getCandidateTypeModuloAppendage(candidateType); if ("prefix/root-root/suffix-suffix/postsuffix" == candidateType) { /* Yes, this is ambiguous. How do we handle it? See * this from Andres: * * I'm posting this upon David Chandler's * request. According to Lobsang Thonden in Modern * Tibetan Grammar Language (page 42), with regards to * identifying the root letter in 3 lettered words * there are only 23 ambiguous cases. He writes: * * If the last letter is 'sa' and the first two * letters are affixes, then the SECOND ONE is the * root letter in the following 9 WORDS ONLY: * * gdas gnas gsas dgas dmas bdas mdas 'gas 'das [NOTE: * Andres later came across 'bad, so we handle it this * way also] * * And the FIRST is the root letter in the following * 14 WORDS ONLY: * * rags lags nags bags bangs gangs rangs langs nangs * sangs babs rabs rams nams * * As I mentioned before, I think that the best * solution for now is to hard-wire these cases. Even * if the list is not exhaustive, at least we'll have * most cases covered. */ leftover = 3; /* FIXME: these constants are hard-wired here, rather * than in TibetanMachineWeb, because I'm lazy. */ String wylie1 = ((TGCPair)gcs.get(0)).getWylie(); String wylie2 = ((TGCPair)gcs.get(1)).getWylie(); String wylie3 = ((TGCPair)gcs.get(2)).getWylie(); String acip1 = (EWTSNotACIP) ? null : ((TGCPair)gcs.get(0)).getACIP(); String acip2 = (EWTSNotACIP) ? null : ((TGCPair)gcs.get(1)).getACIP(); String acip3 = (EWTSNotACIP) ? null : ((TGCPair)gcs.get(2)).getACIP(); if ((wylie1.equals("g") && (wylie2.equals("d") || wylie2.equals("n") || wylie2.equals("s"))) || (wylie1.equals("d") && (wylie2.equals("g") || wylie2.equals("m"))) || (wylie1.equals("b") && wylie2.equals("d")) || (wylie1.equals("m") && wylie2.equals("d")) || (wylie1.equals("'") && (wylie2.equals("g") || wylie2.equals("d") || wylie2.equals("b")))) { if (TibetanMachineWeb.isAmbiguousWylie(wylie1, wylie2)) if (EWTSNotACIP) translitBuffer.append(wylie1 + WYLIE_DISAMBIGUATING_KEY + wylie2); else translitBuffer.append(acip1 + '-' + acip2); else if (EWTSNotACIP) translitBuffer.append(wylie1 + wylie2); else translitBuffer.append(acip1 + acip2); translitBuffer.append(aVowelToUseAfter(EWTSNotACIP, wylie2) + (EWTSNotACIP ? wylie3 : acip3)); } else { if (EWTSNotACIP) translitBuffer.append(wylie1 + aVowelToUseAfter(EWTSNotACIP, wylie1) + unambiguousPostAVowelTranslit(EWTSNotACIP, wylie2, wylie3, acip2, acip3)); else translitBuffer.append(acip1 + aVowelToUseAfter(EWTSNotACIP, wylie1) + unambiguousPostAVowelTranslit(EWTSNotACIP, wylie2, wylie3, acip2, acip3)); } } else if ("root" == candidateType || "prefix/root-root/suffix" == candidateType || "prefix/root" == candidateType || "root-suffix-postsuffix" == candidateType || "root-suffix" == candidateType) { String wylie1 = ((TGCPair)gcs.get(0)).getWylie(); String acip1 = (EWTSNotACIP) ? null : ((TGCPair)gcs.get(0)).getACIP(); leftover = 1; translitBuffer.append((EWTSNotACIP) ? wylie1 : acip1); if (((TGCPair)gcs.get(0)).classification != TGCPair.CONSONANTAL_WITH_VOWEL) { ThdlDebug.verify(TGCPair.CONSONANTAL_WITHOUT_VOWEL == ((TGCPair)gcs.get(0)).classification); translitBuffer.append(aVowelToUseAfter(EWTSNotACIP, wylie1)); if (debug) System.out.println("DEBUG: appending vowel"); } else { if (debug) System.out.println("DEBUG: already has vowel 2"); } if ("root-suffix-postsuffix" == candidateType) { leftover = 3; String wylie2 = ((TGCPair)gcs.get(1)).getWylie(); String wylie3 = ((TGCPair)gcs.get(2)).getWylie(); String acip2 = (EWTSNotACIP) ? null : ((TGCPair)gcs.get(1)).getACIP(); String acip3 = (EWTSNotACIP) ? null : ((TGCPair)gcs.get(2)).getACIP(); translitBuffer.append(unambiguousPostAVowelTranslit(EWTSNotACIP, wylie2, wylie3, acip2, acip3)); } } else if ("prefix-root-suffix" == candidateType || "prefix-root" == candidateType || "prefix-root-suffix-postsuffix" == candidateType) { String wylie1 = ((TGCPair)gcs.get(0)).getWylie(); String wylie2 = ((TGCPair)gcs.get(1)).getWylie(); String acip1 = (EWTSNotACIP) ? null : ((TGCPair)gcs.get(0)).getACIP(); String acip2 = (EWTSNotACIP) ? null : ((TGCPair)gcs.get(1)).getACIP(); leftover = 2; if (TibetanMachineWeb.isAmbiguousWylie(wylie1, wylie2)) if (EWTSNotACIP) translitBuffer.append(wylie1 + WYLIE_DISAMBIGUATING_KEY + wylie2); else translitBuffer.append(acip1 + '-' + acip2); else if (EWTSNotACIP) translitBuffer.append(wylie1 + wylie2); else translitBuffer.append(acip1 + acip2); if (((TGCPair)gcs.get(1)).classification != TGCPair.CONSONANTAL_WITH_VOWEL) { ThdlDebug.verify(TGCPair.CONSONANTAL_WITHOUT_VOWEL == ((TGCPair)gcs.get(1)).classification); if (debug) System.out.println("DEBUG: appending vowel"); translitBuffer.append(aVowelToUseAfter(EWTSNotACIP, wylie2)); } else { if (debug) System.out.println("DEBUG: already has vowel 1"); } if ("prefix-root-suffix-postsuffix" == candidateType) { leftover = 4; String wylie3 = ((TGCPair)gcs.get(2)).getWylie(); String wylie4 = ((TGCPair)gcs.get(3)).getWylie(); String acip3 = (EWTSNotACIP) ? null : ((TGCPair)gcs.get(2)).getACIP(); String acip4 = (EWTSNotACIP) ? null : ((TGCPair)gcs.get(3)).getACIP(); translitBuffer.append(unambiguousPostAVowelTranslit(EWTSNotACIP, wylie3, wylie4, acip3, acip4)); } } else if ("number" == candidateType) { leftover = 0; } else { throw new Error("missed a case down here"); } // append the wylie/ACIP left over: for (int i = leftover; i < sz; i++) { TGCPair tp = (TGCPair)gcs.get(i); translitBuffer.append(EWTSNotACIP ? tp.getWylie() : tp.getACIP()); } } } /** * Gets the Extended Wylie/ACIP for a sequence of glyphs. This works * as follows: * *

We run along until we hit whitespace or punctuation. We take * everything before that and we see if it's a legal Tibetan tsheg bar, * either a number or a word fragment. If it is, we insert only one * vowel in the correct place. If not, then we throw a disambiguating * key or a vowel after each stack. * * @param EWTSNotACIP true if you want THDL Extended Wylie, false if * you want ACIP * @param dcs an array of glyphs * @param noSuch an array which will not be touched if this is * successful; however, if there is no THDL Extended Wylie/ACIP * corresponding to these glyphs, then noSuch[0] will be set to true * @param warnings either null or a buffer to which will be appended * warnings about illegal tsheg bars * @return the Extended Wylie/ACIP corresponding to these glyphs, or * null */ private static String getTranslitImplementation(boolean EWTSNotACIP, DuffCode[] dcs, boolean noSuch[], StringBuffer warnings) { if (dcs.length == 0) return null; ArrayList glyphList = new ArrayList(); StringBuffer translitBuffer = new StringBuffer(); for (int i=0; i 0) return translitBuffer.toString(); else return null; } /** Returns "root" instead of "appendaged-root", for example. */ private static final String getCandidateTypeModuloAppendage(String candidateType) { if (candidateType.startsWith("appendaged-")) { candidateType = candidateType.substring("appendaged-".length()).intern(); } return candidateType; } /** Returns an array of size 2 that lists all the possible indices * of the root stack given the chosen candidate type. A negative * number appears if there are not that many possible positions * for the root. (You'll get two negative numbers if there is no * root stack.) */ public static final int[] getIndicesOfRootForCandidateType(String candidateType) { // Appendaged vs. not appendaged? it affects nothing. candidateType = getCandidateTypeModuloAppendage(candidateType); int[] rv = new int[] { -1, -1 }; if (candidateType == "prefix/root" || candidateType.startsWith("root")) { rv[0] = 0; } else if (candidateType.startsWith("prefix/root-")) { rv[0] = 0; rv[1] = 1; } else if (candidateType.startsWith("prefix-root")) { rv[0] = 1; } return rv; } /** Returns true if and only if the stack with Wylie root * can take the prefix prefix. */ private static boolean isLegalPrefixRootCombo(String prefix, String root) { // This will be decomposed enough. If you can decompose it, // then it doesn't take a prefix! if (!TibetanMachineWeb.isKnownHashKey(root)) { root = root.replace('+', '-'); if (!TibetanMachineWeb.isKnownHashKey(root)) { throw new Error("root is, now, " + root); // FIXME: make this an assertion } } String ru = TibetanMachineWeb.getUnicodeForWylieForGlyph(root); // ru may be for (head, root, sub), (head, root), (root), or // (root, sub). Try all possibilities that are possible with // a String of length ru. If there's a wa-zur, then we say // (FIXME: do we say correctly?) that a stack with wa-zur can // take a prefix if and only if the stack without can take a // prefix. if (ru == null) throw new Error("how? root is " + root); // FIXME: make this an assertion int rl = ru.length(); if (ru.charAt(rl - 1) == UnicodeConstants.EWSUB_wa_zur) --rl; // forget about wa-zur: see above. if (rl == 2) { char ch0 = ru.charAt(0); char ch1 = UnicodeUtils.getNominalRepresentationOfSubscribedConsonant(ru.charAt(1)); // (head, root) and (root, sub) are possibilities. if (ACHUNG.equals(prefix)) { return LegalTshegBar.takesAchungPrefix(ch0, ch1, UnicodeConstants.EW_ABSENT) || LegalTshegBar.takesAchungPrefix(UnicodeConstants.EW_ABSENT, ch0, ch1); } else if ("b".equals(prefix)) { return LegalTshegBar.takesBao(ch0, ch1, UnicodeConstants.EW_ABSENT) || LegalTshegBar.takesBao(UnicodeConstants.EW_ABSENT, ch0, ch1); } else if ("m".equals(prefix)) { return LegalTshegBar.takesMao(ch0, ch1, UnicodeConstants.EW_ABSENT) || LegalTshegBar.takesMao(UnicodeConstants.EW_ABSENT, ch0, ch1); } else if ("g".equals(prefix)) { return LegalTshegBar.takesGao(ch0, ch1, UnicodeConstants.EW_ABSENT) || LegalTshegBar.takesGao(UnicodeConstants.EW_ABSENT, ch0, ch1); } else if ("d".equals(prefix)) { return LegalTshegBar.takesDao(ch0, ch1, UnicodeConstants.EW_ABSENT) || LegalTshegBar.takesDao(UnicodeConstants.EW_ABSENT, ch0, ch1); } else { throw new IllegalArgumentException("prefix is " + prefix); } } else if (rl == 1) { char ch0 = ru.charAt(0); // (root) is the only choice. if (ACHUNG.equals(prefix)) { return LegalTshegBar.takesAchungPrefix(UnicodeConstants.EW_ABSENT, ch0, UnicodeConstants.EW_ABSENT); } else if ("b".equals(prefix)) { return LegalTshegBar.takesBao(UnicodeConstants.EW_ABSENT, ch0, UnicodeConstants.EW_ABSENT); } else if ("m".equals(prefix)) { return LegalTshegBar.takesMao(UnicodeConstants.EW_ABSENT, ch0, UnicodeConstants.EW_ABSENT); } else if ("g".equals(prefix)) { return LegalTshegBar.takesGao(UnicodeConstants.EW_ABSENT, ch0, UnicodeConstants.EW_ABSENT); } else if ("d".equals(prefix)) { return LegalTshegBar.takesDao(UnicodeConstants.EW_ABSENT, ch0, UnicodeConstants.EW_ABSENT); } else { throw new IllegalArgumentException("prefix is " + prefix); } } else if (rl == 3) { char ch0 = ru.charAt(0); char ch1 = UnicodeUtils.getNominalRepresentationOfSubscribedConsonant(ru.charAt(1)); char ch2 = UnicodeUtils.getNominalRepresentationOfSubscribedConsonant(ru.charAt(2)); // (head, root, sub) is the only choice. if (ACHUNG.equals(prefix)) { return LegalTshegBar.takesAchungPrefix(ch0, ch1, ch2); } else if ("b".equals(prefix)) { return LegalTshegBar.takesBao(ch0, ch1, ch2); } else if ("m".equals(prefix)) { return LegalTshegBar.takesMao(ch0, ch1, ch2); } else if ("g".equals(prefix)) { return LegalTshegBar.takesGao(ch0, ch1, ch2); } else if ("d".equals(prefix)) { return LegalTshegBar.takesDao(ch0, ch1, ch2); } else { throw new IllegalArgumentException("prefix is " + prefix); } } else { return false; } } }