/*
The contents of this file are subject to the THDL Open Community License
Version 1.0 (the "License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License on the THDL web site
(http://www.thdl.org/).

Software distributed under the License is distributed on an "AS IS" basis,
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
License for the specific terms governing rights and limitations under the
License.

The Initial Developer of this software is the Tibetan and Himalayan Digital
Library (THDL). Portions created by the THDL are Copyright 2001-2003 THDL.
All Rights Reserved.

Contributor(s): ______________________________________.
*/

package org.thdl.tib.text;

import java.util.*;
import javax.swing.*;
import javax.swing.text.*;
import javax.swing.text.rtf.RTFEditorKit;
import java.io.*;

import org.thdl.util.ThdlDebug;

/**
* Provides methods for converting back and forth between Extended
* Wylie and Tibetan represented in TibetanMachineWeb glyphs. This
* class is not instantiable.
*
* The class provides a variety of static methods for converting
* back and forth between Extended Wylie and TibetanMachineWeb. The
* Wylie can be accessed as a String, while the TibetanMachineWeb can
* be exported as Rich Text Format.
*
* NOTE(review): several places in this file are flagged below where
* code text appears to be missing (loop headers that are not valid
* Java, code that uses locals with no visible declaration). Those
* spots must be restored from a pristine copy of the file; nothing
* has been guessed at or filled in here.
*
* @author Edward Garrett, Tibetan and Himalayan Digital Library
*/
public class TibTextUtils implements THDLWylieConstants {
    /** Do not use this constructor. */
    private TibTextUtils() { super(); }

    /**
    * Converts a list of glyphs into an array of {@link DuffData DuffData}.
    * The motivation for this is that most processes - for example using
    * TibetanMachineWeb in HTML - only need to know what
    * text to output, and when to change fonts. In general, they don't
    * need to have an explicit indication for each glyph of the font
    * for that glyph.
    * @param glyphs the list of TibetanMachineWeb glyphs
    * you want to convert
    * @return an array of DuffData corresponding to this
    * list of glyphs, or null if the list is empty
    */
    public static DuffData[] convertGlyphs(List glyphs) {
        if (glyphs.size() == 0)
            return null;
        List data = new ArrayList();
        StringBuffer sb = new StringBuffer();
        Iterator iter = glyphs.iterator();
        // seed the first run with the first glyph and its font number
        DuffCode dc = (DuffCode)iter.next();
        int lastfont = dc.fontNum;
        sb.append(dc.character);

        while (iter.hasNext()) {
            dc = (DuffCode)iter.next();
            if (dc.fontNum == lastfont)
                sb.append(dc.character);
            else {
                // font number changed: emit the run collected so far
                // and start a new run with this glyph
                data.add(new DuffData(sb.toString(), lastfont));
                lastfont = dc.fontNum;
                sb = new StringBuffer();
                sb.append(dc.character);
            }
        }

        // emit the final run
        data.add(new DuffData(sb.toString(), lastfont));

        DuffData[] dd = new DuffData[0];
        dd = (DuffData[])data.toArray(dd);
        return dd;
    }

    /**
    * Figures out how to arrange a list of characters into glyphs. For
    * example, if the user types 'bsgr' using the Extended Wylie keyboard,
    * this method figures out that this should be represented as a 'b'
    * glyph followed by a 's-g-r' glyph. If you know that the characters
    * do not contain Sanskrit stacks, or do not contain Tibetan stacks,
    * then you can specify this to speed the process up. Otherwise, the
    * method will first check to see if the characters correspond to any
    * Tibetan stacks, and if not, then it will check for Sanskrit stacks.
    * @param chars the list of Tibetan characters you want to find glyphs
    * for
    * @param areStacksOnRight whether stacking should try to maximize from
    * right to left (true) or from left to right (false). In the Extended
    * Wylie keyboard, you try to stack from right to left. Thus, the
    * character sequence r-g-r would be stacked as r followed by gr,
    * rather than rg followed by r. In the Sambhota and TCC keyboards, the
    * stack direction is reversed.
    * @param definitelyTibetan should be true if the characters are known
    * to be Tibetan and not Sanskrit
    * @param definitelySanskrit should be true if the characters are known
    * to be Sanskrit and not Tibetan
    */
    public static List getGlyphs(List chars, boolean areStacksOnRight, boolean definitelyTibetan, boolean definitelySanskrit) {
        StringBuffer tibBuffer, sanBuffer;
        String tibCluster, sanCluster;

        boolean checkTibetan, checkSanskrit;

        // when the caller asserts neither script, both kinds of stack are checked
        if (!(definitelyTibetan || definitelySanskrit)) {
            checkTibetan = true;
            checkSanskrit = true;
        }
        else {
            checkTibetan = definitelyTibetan;
            checkSanskrit = definitelySanskrit;
        }

        int length = chars.size();

        List glyphs = new ArrayList();
        glyphs.clear();

        if (areStacksOnRight) {
            // NOTE(review): the loop header below is not valid Java as
            // written ("i-1" is an int, not a boolean condition, and an
            // index starting at 0 is decremented). Text appears to be
            // missing here -- possibly a span running from a '<' to a
            // later '>' was stripped. Restore from a pristine copy.
            for (int i=0; i-1; i--) {
                tibBuffer = new StringBuffer();
                tibCluster = null;

                sanBuffer = new StringBuffer();
                sanCluster = null;

                Iterator iter = chars.iterator();

                // NOTE(review): this header is also truncated ("k 1").
                // From this point to the "return dd;" further down, the
                // code uses names with no visible declaration (dc, next,
                // start, isSanskrit, wasLastSanskritStackingKey, the
                // label vowel_block) and ends by returning DuffData[],
                // not List. It presumably belongs to a different method
                // whose header, and the remainder of getGlyphs, were
                // lost -- TODO confirm against a pristine copy.
                for (int k=0; k 1) {
                        dc = (DuffCode)glyphs.get(glyphs.size()-1);
                        if (!TibetanMachineWeb.isWyliePunc(TibetanMachineWeb.getWylieForGlyph(dc))) {
                            // last glyph is not punctuation: pop the last two
                            // glyphs and use both as context for the vowel
                            DuffCode dc_2 = (DuffCode)glyphs.removeLast();
                            DuffCode dc_1 = (DuffCode)glyphs.removeLast();
                            glyphs.addAll(getVowel(dc_1, dc_2, next));
                            break vowel_block;
                        }
                    }

                    // otherwise attach the vowel to the ACHEN glyph
                    DuffCode[] dc_array = (DuffCode[])TibetanMachineWeb.getTibHash().get(ACHEN);
                    dc = dc_array[TibetanMachineWeb.TMW];
                    glyphs.addAll(getVowel(dc, next));
                }

                chars.clear();
            }

            isSanskrit = false;
        }

        else if (TibetanMachineWeb.isWylieChar(next)) {
            if (!isSanskrit) //add char to list - it is not sanskrit
                chars.add(next);

            else if (wasLastSanskritStackingKey) { //add char to list - it is still part of sanskrit stack
                chars.add(next);
                wasLastSanskritStackingKey = false;
            }

            else { //char is no longer part of sanskrit stack, therefore compute and add previous stack
                glyphs.addAll(getGlyphs(chars, true, !isSanskrit, isSanskrit));
                chars.clear();
                chars.add(next);
                isSanskrit = false;
                wasLastSanskritStackingKey = false;
            }
        }

        else if (next.equals(String.valueOf(WYLIE_DISAMBIGUATING_KEY))) {
            // disambiguating key: flush the pending characters as glyphs
            if (!chars.isEmpty())
                glyphs.addAll(getGlyphs(chars, true, !isSanskrit, isSanskrit));

            chars.clear();
            isSanskrit = false;
        }

        else if (next.equals(String.valueOf(WYLIE_SANSKRIT_STACKING_KEY))) {
            if (!isSanskrit) { //begin sanskrit stack
                switch (chars.size()) {
                    case 0:
                        break; //'+' is not "pre-stacking" key

                    case 1:
                        isSanskrit = true;
                        wasLastSanskritStackingKey = true;
                        break;

                    default:
                        // keep only the last pending character as the top of
                        // the new stack; everything before it is flushed
                        String top_char = (String)chars.get(chars.size()-1);
                        chars.remove(chars.size()-1);
                        glyphs.addAll(getGlyphs(chars, true, !isSanskrit, isSanskrit));
                        chars.clear();
                        chars.add(top_char);
                        isSanskrit = true;
                        wasLastSanskritStackingKey = true;
                        break;
                }
            }
        }

        else if (TibetanMachineWeb.isFormatting(next.charAt(0))) {
            if (!chars.isEmpty())
                glyphs.addAll(getGlyphs(chars, true, !isSanskrit, isSanskrit));

            // formatting characters are added as a glyph with font number 1
            dc = new DuffCode(1,next.charAt(0));
            glyphs.add(dc);
            chars.clear();
            isSanskrit = false;
        }

        if (next != null)
            start += next.length();
        }

        // flush whatever characters are still pending
        if (!chars.isEmpty()) {
            glyphs.addAll(getGlyphs(chars, true, !isSanskrit, isSanskrit));
            chars.clear();
        }

        DuffData[] dd = convertGlyphs(glyphs);
        return dd;
    }

    /**
    * Gets the bindu sequence for a given context.
    * In the TibetanMachineWeb fonts, bindu (anusvara) is realized
    * differently depending on which vowel it attaches to. Although
    * the default bindu glyph is affixed to consonants and subscript vowels,
    * for superscript vowels (i, e, o, etc), there is a single glyph
    * which merges the bindu and that vowel together. When you pass this
    * method a glyph context, it will return a List of glyphs which
    * will either consist of the original glyph followed by the default
    * bindu glyph, or a composite vowel+bindu glyph.
    * Note that there is only one glyph in the context. This means that
    * bindus will not affix properly if superscript vowels are allowed to directly
    * precede subscript vowels (e.g. pou).
    * @param dc the DuffCode of the glyph you
    * want to attach a bindu to
    * @return a List of DuffCode glyphs that include the
    * original dc, as well as a bindu
    */
    public static List getBindu(DuffCode dc) {
        List bindus = new ArrayList();

        // no context glyph: return just the bindu glyph
        if (null == dc) {
            bindus.add(TibetanMachineWeb.getGlyph(String.valueOf(BINDU)));
            return bindus;
        }

        // glyph has no entry in the bindu map: keep it and append the bindu glyph
        if (!TibetanMachineWeb.getBinduMap().containsKey(dc)) {
            bindus.add(dc);
            bindus.add(TibetanMachineWeb.getGlyph(String.valueOf(BINDU)));
            return bindus;
        }

        // otherwise return the single glyph the bindu map holds for dc
        bindus.add((DuffCode)TibetanMachineWeb.getBinduMap().get(dc));
        return bindus;
    }

    /**
    * Gets the vowel sequence for a given vowel in a given context.
    * Given a context, this method affixes a vowel and returns the
    * context plus the vowel. Generally, it is enough to provide just
    * one glyph for context.
    * @param context the glyph preceding the vowel you want to affix
    * @param vowel the vowel you want to affix, in Wylie
    * @return a List of glyphs equal to the vowel in context
    */
    public static List getVowel(DuffCode context, String vowel) {
        // delegates to the three-argument form with no earlier context glyph
        return getVowel(null, context, vowel);
    }

    /**
    * Gets the vowel sequence for a given vowel in a given context.
    * Given a context, this method affixes a vowel and returns the context plus the vowel.
    * Since the choice of vowel glyph depends on the consonant to which it is attached,
    * generally it is enough to provide just the immediately preceding context. However,
    * in some cases, double vowels are allowed - for example 'buo'. To find the correct
    * glyph for 'o', we need 'b' in this case, not 'u'. Note also that some Extended
    * Wylie vowels correspond to multiple glyphs in TibetanMachineWeb. For example,
    * the vowel I consists of both an achung and a reverse gigu. All required glyphs
    * are part of the returned List.
    * @param context_1 the glyph occurring two glyphs before the vowel you want to affix
    * @param context_2 the glyph immediately before the vowel you want to affix
    * @param vowel the vowel you want to affix, in Wylie
    * @return a List of glyphs equal to the vowel in context
    */
    public static List getVowel(DuffCode context_1, DuffCode context_2, String vowel) {
        List vowels = new ArrayList();

        //this vowel doesn't correspond to a glyph -
        //so you just return the original context

        if ( vowel.equals(WYLIE_aVOWEL) ||
                TibetanMachineWeb.isTopVowel(context_2)) {
            if (context_1 != null)
                vowels.add(context_1);

            vowels.add(context_2);
            return vowels;
        }

        //first, the three easiest cases: ai, au, and
        // NOTE(review): the comment above breaks off mid-sentence and
        // the code that follows starts in the middle of a condition
        // ("= 0 && ..."). It uses names with no visible declaration
        // (glyphList, effectiveSize, tailEndWylie, size, sb, wylie,
        // lastWylie, makeIllegalTibetanGoEndToEnd) and calls withA(...)
        // on a sub-list, so it presumably is the tail of withA. The
        // rest of getVowel and the header of that method appear to be
        // lost -- TODO restore from a pristine copy.
        = 0 && TibetanMachineWeb.getWylieForGlyph((DuffCode)glyphList.get(effectiveSize)).equals(ACHUNG)) {
                if (null == tailEndWylie) tailEndWylie = new StringBuffer();
                // prepend:
                tailEndWylie.insert(0,
                                    ACHUNG
                                    + aVowelToUseAfter(ACHUNG)
                                    + TibetanMachineWeb.getWylieForGlyph((DuffCode)glyphList.get(effectiveSize + 1)));
                effectiveSize -= 2;
            }
            if (null != tailEndWylie) {
                return (withA(glyphList.subList(0, effectiveSize + 2))
                        + tailEndWylie.toString());
            }
        }

        if (makeIllegalTibetanGoEndToEnd
            && (size > 4 // this is too many glyphs to be legal
                // this is illegal because it doesn't begin
                // with a prefix:
                || (size == 4 && (!TibetanMachineWeb.isWylieLeft(TibetanMachineWeb.getWylieForGlyph((DuffCode)glyphList.get(0)))
                                  // this is illegal because it doesn't have a
                                  // suffix in the proper place, e.g. mjskad:
                                  || !TibetanMachineWeb.isWylieRight(TibetanMachineWeb.getWylieForGlyph((DuffCode)glyphList.get(size - 2)))
                                  // this is illegal because it doesn't have a
                                  // postsuffix in the proper place,
                                  // e.g. 'lan.g, which would otherwise become
                                  // 'lang (with nga, not na and then ga):
                                  || !TibetanMachineWeb.isWylieFarRight(TibetanMachineWeb.getWylieForGlyph((DuffCode)glyphList.get(size - 1))))))) {
            // illegal sequence in end-to-end mode: every glyph gets its own "a" vowel
            for (int i = 0; i < size; i++) {
                wylie = TibetanMachineWeb.getWylieForGlyph((DuffCode)glyphList.get(i));
                if (TibetanMachineWeb.isAmbiguousWylie(lastWylie, wylie)
                    || (i != 0 && wylie.equals(ACHEN)))
                    sb.append(WYLIE_DISAMBIGUATING_KEY);
                sb.append(wylie + aVowelToUseAfter(wylie));
                lastWylie = wylie;
            }
            return sb.toString();
        }

        /* Else, chew up all the glyphs except for the last two. Then decide. */
        int i = 0;
        while (i+2 < size) {
            wylie = TibetanMachineWeb.getWylieForGlyph((DuffCode)glyphList.get(i));
            if (TibetanMachineWeb.isAmbiguousWylie(lastWylie, wylie)
                || (i != 0 && wylie.equals(ACHEN)))
                sb.append(WYLIE_DISAMBIGUATING_KEY);
            sb.append(wylie);
            lastWylie = wylie;
            i++;
        }

        String wylie1 = TibetanMachineWeb.getWylieForGlyph((DuffCode)glyphList.get(i));
        String wylie2 = TibetanMachineWeb.getWylieForGlyph((DuffCode)glyphList.get(i + 1));
        if (size == 3) {
            String wylie0 = lastWylie;
            // Let's see if wylie0+wylie1+wylie2 is ambiguous
            // -- if wylie0 could be a prefix and if wylie1
            // could be a suffix, and if wylie2 is "s". If
            // it's ambiguous, let's look up
            // wylie0+wylie1+wylie2 in our magic table.
            // Otherwise, see if we have a prefix, and if we
            // do, the "a" vowel comes after wylie1. Else the
            // "a" vowel comes after wylie0.
            if (TibetanMachineWeb.isWylieLeft(wylie0)) {
                /* is it ambiguous? */
                if (TibetanMachineWeb.isWylieRight(wylie1)
                    && SA.equals(wylie2) /* isWylieFarRight would
                                          * work, but the list of
                                          * 9 words doesn't have
                                          * any ending with d --
                                          * all end with s. */) {
                    /* Yes, this is ambiguous. How do we handle
                     * it? See this from Andres:
                     *
                     * I'm posting this upon David Chandler's
                     * request. According to Lobsang Thonden in
                     * Modern Tibetan Grammar Language (page 42),
                     * with regards to identifying the root letter
                     * in 3 lettered words there are only 23
                     * ambiguous cases. He writes:
                     *
                     * If the last letter is 'sa' and the first
                     * two letters are affixes, then the SECOND
                     * ONE is the root letter in the following 9
                     * WORDS ONLY:
                     *
                     * gdas gnas gsas dgas dmas bdas mdas 'gas
                     * 'das
                     *
                     * And the FIRST is the root letter in the
                     * following 14 WORDS ONLY:
                     *
                     * rags lags nags bags bangs gangs rangs langs
                     * nangs sangs babs rabs rams nams
                     *
                     * As I mentioned before, I think that the
                     * best solution for now is to hard-wire these
                     * cases. Even if the list is not exhaustive,
                     * at least we'll have most cases covered.
                     */

                    /* FIXME: these constants are hard-wired here,
                     * rather than in TibetanMachineWeb, because
                     * I'm lazy. */
                    // the condition below matches the prefix+root pairs
                    // of the 9 words in which the SECOND letter is the root
                    if ((wylie0.equals("g") && (wylie1.equals("d") || wylie1.equals("n") || wylie1.equals("s")))
                        || (wylie0.equals("d") && (wylie1.equals("g") || wylie1.equals("m")))
                        || (wylie0.equals("b") && wylie1.equals("d"))
                        || (wylie0.equals("m") && wylie1.equals("d"))
                        || (wylie0.equals("'") && (wylie1.equals("g") || wylie1.equals("d")))) {
                        sb.append(wylie1
                                  + aVowelToUseAfter(wylie1)
                                  + wylie2);
                    } else {
                        sb.append(aVowelToUseAfter(wylie0)
                                  + unambiguousPostAVowelWylie(wylie1, wylie2));
                    }
                } else {
                    /* no ambiguity. the "a" vowel comes after
                     * wylie1. */
                    if (TibetanMachineWeb.isAmbiguousWylie(wylie0, wylie1))
                        sb.append(WYLIE_DISAMBIGUATING_KEY);
                    sb.append(wylie1
                              + aVowelToUseAfter(wylie1)
                              + wylie2);
                }
            } else {
                if (makeIllegalTibetanGoEndToEnd
                    && !(TibetanMachineWeb.isWylieRight(wylie1)
                         && TibetanMachineWeb.isWylieFarRight(wylie2))) {
                    /* handle skaskaska, e.g. */
                    sb.append(aVowelToUseAfter(wylie0)
                              + wylie1
                              + aVowelToUseAfter(wylie1)
                              + wylie2
                              + aVowelToUseAfter(wylie2));
                } else {
                    /* no ambiguity. the "a" vowel comes after
                     * wylie0. */
                    sb.append(aVowelToUseAfter(wylie0)
                              + unambiguousPostAVowelWylie(wylie1, wylie2));
                }
            }
        } else {
            /* If size==4, then we assume this is legal. If
             * size==5, anything will do! So assume we have a
             * prefix, a root letter, a suffix, and a postsuffix.
             * The "a" vowel comes after the root letter. */
            sb.append(aVowelToUseAfter(lastWylie)
                      + unambiguousPostAVowelWylie(wylie1, wylie2));
        }
        return sb.toString();
        }
    }

    /**
    * Gets the Extended Wylie for a list of glyphs.
    * Passed a list of TibetanMachineWeb glyphs that constitute a partial
    * or complete syllable, this method scans the list, and then returns a
    * string of Wylie corresponding to this sequence. No 'a' vowel is
    * inserted because it is assumed that the glyph list already contains
    * some other vowel. If the glyph list does not already contain a vowel,
    * then this method should not be called.
    *
    * @param glyphList a list of TibetanMachineWeb glyphs, i.e. {@link org.thdl.tib.text.DuffCode DuffCodes}
    * @return the Wylie string corresponding to this glyph list
    */
    public static String withoutA(java.util.ArrayList glyphList) {
        StringBuffer sb = new StringBuffer();
        Iterator iter = glyphList.iterator();
        DuffCode dc;
        String currWylie;
        String lastWylie = new String();

        while (iter.hasNext()) {
            dc = (DuffCode)iter.next();
            currWylie = TibetanMachineWeb.getWylieForGlyph(dc);

            // insert the disambiguating key between an ambiguous pair,
            // and before any ACHEN that is not the first glyph
            if (TibetanMachineWeb.isAmbiguousWylie(lastWylie, currWylie)
                || (!lastWylie.equals("") && currWylie.equals(ACHEN)))
                sb.append(WYLIE_DISAMBIGUATING_KEY);

            /* le'ang, not le'ng, to be consistent w.r.t. pa'am
             * vs. pa'm: */
            if (lastWylie.equals(ACHUNG))
                sb.append(WYLIE_aVOWEL);

            sb.append(currWylie);

            lastWylie = currWylie;
        }

        // DLC FIXME: type jeskada, convert Tibetan->Wylie. You get
        // the wrong thing in makeIllegalTibetanGoEndToEnd mode. Fix
        // it here.

        return sb.toString();
    }

    /**
    * Gets the Extended Wylie for a sequence of glyphs.
    * @param dcs an array of glyphs
    * @return the Extended Wylie corresponding to these glyphs, or null
    * if dcs is empty or no Wylie was produced
    */
    public static String getWylie(DuffCode[] dcs) {
        if (dcs.length == 0)
            return null;

        char ch;
        String wylie;

        // glyphs collected since the last flush into wylieBuffer
        ArrayList glyphList = new ArrayList();
        // true while the pending glyphs have no explicit vowel, in which
        // case they are flushed with withA rather than withoutA
        boolean needsVowel = true;
        boolean isLastVowel = false;
        int start = 0;
        StringBuffer wylieBuffer = new StringBuffer();

        // NOTE(review): the loop header below is truncated ("i 0 || ...")
        // and "ch" is used further down without any visible assignment.
        // The loop condition, the increment and the start of the loop
        // body (including the test that opens this if/else) appear to be
        // missing -- TODO restore from a pristine copy.
        for (int i=start; i 0 || !glyphList.isEmpty()) {
                    String thisPart;
                    if (needsVowel)
                        thisPart = withA(glyphList);
                    else
                        thisPart = withoutA(glyphList);
                    wylieBuffer.append(thisPart);

                    glyphList.clear();
                    needsVowel = true;
                    isLastVowel = false;
                }

                wylieBuffer.append(ch);
            }
            else {
                wylie = TibetanMachineWeb.getWylieForGlyph(dcs[i]);

                // a trailing BINDU is split off here and re-appended
                // after the rest of the glyph has been processed
                boolean containsBindu = false;
                if (wylie.length() > 1 && wylie.charAt(wylie.length()-1) == BINDU) {
                    char[] cArray = wylie.toCharArray();
                    wylie = new String(cArray, 0, wylie.length()-1);
                    containsBindu = true;
                }

                process_block: {
                    if (TibetanMachineWeb.isWyliePunc(wylie)) {
                        isLastVowel = false;

                        if (glyphList.isEmpty()) {
                            wylieBuffer.append(wylie);
                        }
                        else {
                            String thisPart;
                            if (needsVowel)
                                thisPart = withA(glyphList);
                            else
                                thisPart = withoutA(glyphList);
                            wylieBuffer.append(thisPart);

                            wylieBuffer.append(wylie); //append the punctuation

                            glyphList.clear();
                        }
                        needsVowel = true; //next consonants are syllable onset, so we are awaiting vowel
                    }

                    //isChar must come before isVowel because ACHEN has priority over WYLIE_aVOWEL
                    else if (TibetanMachineWeb.isWylieChar(wylie)) {
                        isLastVowel = false;
                        glyphList.add(dcs[i]);
                    }

                    else if (TibetanMachineWeb.isWylieVowel(wylie)) {
                        if (isLastVowel) {
                            int len = wylieBuffer.length();
                            int A_len = A_VOWEL.length();

                            // if the buffer currently ends in A_VOWEL and this
                            // vowel is i / reverse i, the two are merged into
                            // I_VOWEL / reverse_I_VOWEL
                            if (wylieBuffer.substring(len-A_len).equals(A_VOWEL)) {
                                try {
                                    if (wylie.equals(i_VOWEL)) {
                                        wylieBuffer.delete(len-A_len, len);
                                        wylieBuffer.append(I_VOWEL);
                                        isLastVowel = false;
                                        break process_block;
                                    }
                                    else if (wylie.equals(reverse_i_VOWEL)) {
                                        wylieBuffer.delete(len-A_len, len);
                                        wylieBuffer.append(reverse_I_VOWEL);
                                        isLastVowel = false;
                                        break process_block;
                                    }
                                }
                                catch (StringIndexOutOfBoundsException se) {
                                    ThdlDebug.noteIffyCode();
                                }

                                wylieBuffer.append(wylie); //append current vowel
                                isLastVowel = false;
                            }
                            else
                                wylieBuffer.append(wylie); //append current vowel
                        }
                        else {
                            int glyphCount = glyphList.size();
                            boolean insertDisAmbig = false;

                            if (0 != glyphCount) {
                                DuffCode top_dc = (DuffCode)glyphList.get(glyphCount-1);
                                String top_wylie = TibetanMachineWeb.getWylieForGlyph(top_dc);

                                // a trailing ACHEN is dropped from the pending
                                // glyphs; if other glyphs precede it, the
                                // disambiguating key is emitted before the vowel
                                if (top_wylie.equals(ACHEN)) {
                                    glyphList.remove(glyphCount-1);

                                    if (glyphCount-1 == 0) {
                                        top_dc = null;
                                    } else {
                                        insertDisAmbig = true;
                                        top_dc = (DuffCode)glyphList.get(glyphCount-2);
                                    }
                                }

                                if (top_dc == null || !TibetanMachineWeb.getWylieForGlyph(top_dc).equals(ACHUNG)) {
                                    String thisPart = withoutA(glyphList);
                                    wylieBuffer.append(thisPart); //append consonants in glyphList
                                } else {
                                    // last pending glyph is ACHUNG: flush what
                                    // precedes it with withA, then append ACHUNG
                                    glyphCount = glyphList.size();
                                    glyphList.remove(glyphCount-1);

                                    if (glyphCount-1 != 0) {
                                        String thisPart = withA(glyphList);
                                        wylieBuffer.append(thisPart);
                                    }

                                    wylieBuffer.append(ACHUNG);
                                }
                            }

                            if (insertDisAmbig)
                                wylieBuffer.append(WYLIE_DISAMBIGUATING_KEY);

                            wylieBuffer.append(wylie); //append vowel

                            glyphList.clear();
                            isLastVowel = true;
                            needsVowel = false;
                        }
                    }
                    else { //must be a stack
                        isLastVowel = false;
                        glyphList.add(dcs[i]);
                    }
                }

                if (containsBindu) {
                    isLastVowel = false;
                    wylieBuffer.append(withoutA(glyphList));
                    wylieBuffer.append(BINDU); //append the bindu
                    glyphList.clear();
                }
            }
        }

        //replace TMW with Wylie

        if (!glyphList.isEmpty()) {
            String thisPart;
            if (needsVowel)
                thisPart = withA(glyphList);
            else
                thisPart = withoutA(glyphList);
            wylieBuffer.append(thisPart);
        }

        if (wylieBuffer.length() > 0)
            return wylieBuffer.toString();
        else
            return null;
    }
}