/*
The contents of this file are subject to the THDL Open Community License
Version 1.0 (the "License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License on the THDL web site
(http://www.thdl.org/).
Software distributed under the License is distributed on an "AS IS" basis,
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
License for the specific terms governing rights and limitations under the
License.
The Initial Developer of this software is the Tibetan and Himalayan Digital
Library (THDL). Portions created by the THDL are Copyright 2001-2003 THDL.
All Rights Reserved.
Contributor(s): ______________________________________.
*/
package org.thdl.tib.text;
import java.util.*;
import javax.swing.*;
import javax.swing.text.*;
import javax.swing.text.rtf.RTFEditorKit;
import java.io.*;
import org.thdl.util.ThdlDebug;
/**
* Provides methods for converting back and forth between Extended
* Wylie and TibetanMachineWeb. This class is not instantiable.
*
*
* The class provides a variety of static methods for converting
* back and forth between Extended Wylie and TibetanMachineWeb. The
* Wylie can be accessed as a String, while the TibetanMachineWeb can
* be exported as Rich Text Format.
*
* @author Edward Garrett, Tibetan and Himalayan Digital Library */
public class TibTextUtils {
/** Do not use this contructor. */
private TibTextUtils() { super(); }
/**
 * Collapses a list of glyphs into runs of text that share one font.
 * Consumers such as HTML output only need to know which text to emit
 * and where the font changes, not the font of every single glyph, so
 * consecutive glyphs with the same font number are merged into one
 * {@link DuffData DuffData}.
 * @param glyphs the list of TibetanMachineWeb glyphs (DuffCodes)
 * you want to convert
 * @return one DuffData per run of same-font glyphs, in order, or
 * null if the list is empty
 */
public static DuffData[] convertGlyphs(List glyphs) {
    if (glyphs.isEmpty()) {
        return null;
    }

    List runs = new ArrayList();
    StringBuffer runText = new StringBuffer();
    int runFont = 0;
    boolean atFirstGlyph = true;

    for (Iterator it = glyphs.iterator(); it.hasNext(); ) {
        DuffCode glyph = (DuffCode) it.next();
        // A font change closes the run collected so far and opens a new one.
        if (!atFirstGlyph && glyph.fontNum != runFont) {
            runs.add(new DuffData(runText.toString(), runFont));
            runText = new StringBuffer();
        }
        runFont = glyph.fontNum;
        runText.append(glyph.character);
        atFirstGlyph = false;
    }

    // The last run is still open when the glyphs are exhausted.
    runs.add(new DuffData(runText.toString(), runFont));

    return (DuffData[]) runs.toArray(new DuffData[runs.size()]);
}
/**
* Figures out how to arrange a list of characters into glyphs. For example, if the user types 'bsgr'
* using the Extended Wylie keyboard, this method figures out that this should be represented
* as a 'b' glyph followed by a 's-g-r' glyph. If you know that the characters do not
* contain Sanskrit stacks, or do not contain Tibetan stacks, then you can specify this
* to speed the process up. Otherwise, the method will first check to see if the characters
* correspond to any Tibetan stacks, and if not, then it will check for Sanskrit stacks.
* @param chars the list of Tibetan characters you want to find glyphs for
* @param areStacksOnRight whether stacking should try to maximize from right to left (true)
* or from left to right (false). In the Extended Wylie keyboard, you try to stack from
* right to left. Thus, the character sequence r-g-r would be stacked as r followed by gr,
* rather than rg followed by r. In the Sambhota and TCC keyboards, the stack direction
* is reversed.
* @param definitelyTibetan should be true if the characters are known to be Tibetan and
* not Sanskrit
* @param definitelySanskrit should be true if the characters are known to be Sanskrit and
* not Tibetan
* @return presumably the list of glyphs found for chars; the return statement of this
* method is not visible in this copy of the file -- TODO confirm
*/
public static List getGlyphs(List chars, boolean areStacksOnRight, boolean definitelyTibetan, boolean definitelySanskrit) {
StringBuffer tibBuffer, sanBuffer;
String tibCluster, sanCluster;
boolean checkTibetan, checkSanskrit;
// With neither hint given, look for both Tibetan and Sanskrit stacks;
// otherwise look only for the kind(s) the caller vouched for.
if (!(definitelyTibetan || definitelySanskrit)) {
checkTibetan = true;
checkSanskrit = true;
}
else {
checkTibetan = definitelyTibetan;
checkSanskrit = definitelySanskrit;
}
int length = chars.size();
List glyphs = new ArrayList();
// (no-op: the list was created empty on the previous line)
glyphs.clear();
if (areStacksOnRight) {
// NOTE(review): the loop header below is not valid Java as written; part of
// its text appears to have been lost. Restore it from version control
// before changing anything in this method.
for (int i=0; i-1; i--) {
tibBuffer = new StringBuffer();
tibCluster = null;
sanBuffer = new StringBuffer();
sanCluster = null;
Iterator iter = chars.iterator();
// NOTE(review): the next line fuses a loop header with an unrelated
// "... 1) {" condition. Everything after it uses names that are never
// declared in the visible text (dc, next, start, isSanskrit,
// wasLastSanskritStackingKey, the label vowel_block) and ends by returning a
// DuffData[] although this method is declared to return List. Presumably the
// rest of getGlyphs and the head of a different, DuffData[]-returning method
// are missing here -- TODO confirm and restore.
for (int k=0; k 1) {
dc = (DuffCode)glyphs.get(glyphs.size()-1);
if (!TibetanMachineWeb.isWyliePunc(TibetanMachineWeb.getWylieForGlyph(dc))) {
// The last two glyphs are taken back off the list and re-added together
// with the vowel, so the vowel is chosen with two glyphs of context.
DuffCode dc_2 = (DuffCode)glyphs.removeLast();
DuffCode dc_1 = (DuffCode)glyphs.removeLast();
glyphs.addAll(getVowel(dc_1, dc_2, next));
break vowel_block;
}
}
// Reached when the branch above did not break out of vowel_block: the vowel
// is attached to the glyph stored at index TMW of the ACHEN entry of the
// Tibetan hash.
DuffCode[] dc_array = (DuffCode[])TibetanMachineWeb.getTibHash().get(TibetanMachineWeb.ACHEN);
dc = dc_array[TibetanMachineWeb.TMW];
glyphs.addAll(getVowel(dc, next));
}
chars.clear();
}
isSanskrit = false;
}
else if (TibetanMachineWeb.isWylieChar(next)) {
if (!isSanskrit) //add char to list - it is not sanskrit
chars.add(next);
else if (wasLastSanskritStackingKey) { //add char to list - it is still part of sanskrit stack
chars.add(next);
wasLastSanskritStackingKey = false;
}
else { //char is no longer part of sanskrit stack, therefore compute and add previous stack
glyphs.addAll(getGlyphs(chars, true, !isSanskrit, isSanskrit));
chars.clear();
chars.add(next);
isSanskrit = false;
wasLastSanskritStackingKey = false;
}
}
else if (next.equals(String.valueOf(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY))) {
// The disambiguating key ends the current cluster: convert what has been
// collected so far and start over with an empty character list.
if (!chars.isEmpty())
glyphs.addAll(getGlyphs(chars, true, !isSanskrit, isSanskrit));
chars.clear();
isSanskrit = false;
}
else if (next.equals(String.valueOf(TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY))) {
if (!isSanskrit) { //begin sanskrit stack
switch (chars.size()) {
case 0:
break; //'+' is not "pre-stacking" key
case 1:
// the single pending character becomes the first member of the stack
isSanskrit = true;
wasLastSanskritStackingKey = true;
break;
default:
// Only the most recent character joins the stack; the characters before
// it are converted to glyphs first.
String top_char = (String)chars.get(chars.size()-1);
chars.remove(chars.size()-1);
glyphs.addAll(getGlyphs(chars, true, !isSanskrit, isSanskrit));
chars.clear();
chars.add(top_char);
isSanskrit = true;
wasLastSanskritStackingKey = true;
break;
}
}
}
else if (TibetanMachineWeb.isFormatting(next.charAt(0))) {
if (!chars.isEmpty())
glyphs.addAll(getGlyphs(chars, true, !isSanskrit, isSanskrit));
// The formatting character is passed through as its own glyph; the literal 1
// is presumably the font number -- confirm against DuffCode's constructor.
dc = new DuffCode(1,next.charAt(0));
glyphs.add(dc);
chars.clear();
isSanskrit = false;
}
// advance past the piece of Wylie that was just handled
if (next != null)
start += next.length();
}
// Convert whatever characters are still pending once the input is used up.
if (!chars.isEmpty()) {
glyphs.addAll(getGlyphs(chars, true, !isSanskrit, isSanskrit));
chars.clear();
}
// Merge the glyphs into runs of same-font text.
DuffData[] dd = convertGlyphs(glyphs);
return dd;
}
/**
 * Gets the bindu sequence for a given context.
 * In the TibetanMachineWeb fonts, bindu (anusvara) is drawn differently
 * depending on what it attaches to. Consonants and subscript vowels take
 * the default bindu glyph; superscript vowels (i, e, o, etc.) have a
 * single glyph that merges the vowel and the bindu. Only one glyph of
 * context is considered, so bindus will not affix properly if a
 * superscript vowel is allowed to directly precede a subscript vowel
 * (e.g. pou).
 * @param dc the DuffCode of the glyph you want to attach a bindu to;
 * may be null, in which case only the default bindu is returned
 * @return a List of DuffCode glyphs: either the merged glyph that the
 * bindu map holds for dc, or dc (when non-null) followed by the default
 * bindu glyph
 */
public static List getBindu(DuffCode dc) {
    List result = new ArrayList();

    if (dc != null && TibetanMachineWeb.getBinduMap().containsKey(dc)) {
        // The bindu map supplies a single glyph that replaces dc.
        result.add((DuffCode) TibetanMachineWeb.getBinduMap().get(dc));
    } else {
        // No merged form: keep the context glyph, if any, and append the
        // default bindu after it.
        if (dc != null) {
            result.add(dc);
        }
        result.add(TibetanMachineWeb.getGlyph(String.valueOf(TibetanMachineWeb.BINDU)));
    }

    return result;
}
/**
 * Gets the vowel sequence for a given vowel when only the immediately
 * preceding glyph is known.
 * This is a convenience form of the three-argument getVowel, called
 * with null as the glyph two positions back.
 * @param context the glyph preceding the vowel you want to affix
 * @param vowel the vowel you want to affix, in Wylie
 * @return a List of glyphs equal to the vowel in context
 */
public static List getVowel(DuffCode context, String vowel) {
    // Nothing is known about the glyph before 'context'.
    DuffCode noEarlierGlyph = null;
    return getVowel(noEarlierGlyph, context, vowel);
}
/**
* Gets the vowel sequence for a given vowel in a given context.
* Given a context, this method affixes a vowel and returns the context plus the vowel.
* Since the choice of vowel glyph depends on the consonant to which it is attached,
* generally it is enough to provide just the immediately preceding context. However,
* in some cases, double vowels are allowed - for example 'buo'. To find the correct
* glyph for 'o', we need 'b' in this case, not 'u'. Note also that some Extended
* Wylie vowels correspond to multiple glyphs in TibetanMachineWeb. For example,
* the vowel I consists of both an achung and a reverse gigu. All required glyphs
* are part of the returned List.
* @param context_1 the glyph occurring two glyphs before the vowel you want to affix
* (may be null; it is then left out of the result)
* @param context_2 the glyph immediately before the vowel you want to affix
* @param vowel the vowel you want to affix, in Wylie
* @return a List of glyphs equal to the vowel in context
*/
public static List getVowel(DuffCode context_1, DuffCode context_2, String vowel) {
List vowels = new ArrayList();
//this vowel doesn't correspond to a glyph -
//so you just return the original context
// (the same happens when context_2 is itself a top vowel)
if ( vowel.equals(TibetanMachineWeb.WYLIE_aVOWEL) ||
TibetanMachineWeb.isTopVowel(context_2)) {
if (context_1 != null)
vowels.add(context_1);
vowels.add(context_2);
return vowels;
}
// NOTE(review): the comment line below and the line after it are fused remnants:
// the rest of getVowel and the head of another method are missing between them.
// The code that follows calls itself as withA(...) and uses glyphList,
// effectiveSize, tailEndWylie, size, sb, wylie, lastWylie and
// makeIllegalTibetanGoEndToEnd, none of which are declared in the visible
// text. Presumably it is the tail of withA -- TODO restore from version control
// before editing.
//first, the three easiest cases: ai, au, and = 0
&& TibetanMachineWeb.getWylieForGlyph((DuffCode)glyphList.get(effectiveSize)).equals(TibetanMachineWeb.ACHUNG)) {
if (null == tailEndWylie) tailEndWylie = new StringBuffer();
// prepend:
// (the achung, its vowel and the glyph after it go in front of whatever
// tail has been collected so far; the two glyphs are then dropped from
// the part still to be converted)
tailEndWylie.insert(0,
TibetanMachineWeb.ACHUNG
+ aVowelToUseAfter(TibetanMachineWeb.ACHUNG)
+ TibetanMachineWeb.getWylieForGlyph((DuffCode)glyphList.get(effectiveSize + 1)));
effectiveSize -= 2;
}
// If a tail was split off, convert the remaining front part recursively and
// append the tail to it.
if (null != tailEndWylie) {
return (withA(glyphList.subList(0, effectiveSize + 2))
+ tailEndWylie.toString());
}
}
// When the flag is set and the glyph sequence cannot be a legal syllable,
// write every glyph followed by its own vowel.
if (makeIllegalTibetanGoEndToEnd
&& (size > 4 // this is too many glyphs to be legal
// this is illegal because it doesn't begin
// with a prefix:
|| (size == 4
&& (!TibetanMachineWeb.isWylieLeft(TibetanMachineWeb.getWylieForGlyph((DuffCode)glyphList.get(0)))
// this is illegal because it doesn't have a
// suffix in the proper place, e.g. mjskad:
|| !TibetanMachineWeb.isWylieRight(TibetanMachineWeb.getWylieForGlyph((DuffCode)glyphList.get(size - 2)))
// this is illegal because it doesn't have a
// postsuffix in the proper place,
// e.g. 'lan.g, which would otherwise become
// 'lang (with nga, not na and then ga):
|| !TibetanMachineWeb.isWylieFarRight(TibetanMachineWeb.getWylieForGlyph((DuffCode)glyphList.get(size - 1))))))) {
for (int i = 0; i < size; i++) {
wylie = TibetanMachineWeb.getWylieForGlyph((DuffCode)glyphList.get(i));
// a disambiguating key goes between "g" and "y", and before any
// achen that is not the first glyph
if ((lastWylie.equals("g") && wylie.equals("y"))
|| (i != 0 && wylie.equals(TibetanMachineWeb.ACHEN)))
sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY);
sb.append(wylie + aVowelToUseAfter(wylie));
lastWylie = wylie;
}
return sb.toString();
}
/* Else, chew up all the glyphs except for the last two. Then decide. */
int i = 0;
while (i+2 < size) {
wylie = TibetanMachineWeb.getWylieForGlyph((DuffCode)glyphList.get(i));
if ((lastWylie.equals("g") && wylie.equals("y"))
|| (i != 0 && wylie.equals(TibetanMachineWeb.ACHEN)))
sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY);
sb.append(wylie);
lastWylie = wylie;
i++;
}
// wylie1 and wylie2 are the Wylie of the last two glyphs; lastWylie is the
// Wylie of the glyph before them (already appended to sb by the loop above).
String wylie1
= TibetanMachineWeb.getWylieForGlyph((DuffCode)glyphList.get(i));
String wylie2
= TibetanMachineWeb.getWylieForGlyph((DuffCode)glyphList.get(i + 1));
if (size == 3) {
String wylie0 = lastWylie;
// Let's see if wylie0+wylie1+wylie2 is ambiguous
// -- if wylie0 could be a prefix and if wylie1
// could be a suffix, and if wylie2 is "s". If
// it's ambiguous, let's look up
// wylie0+wylie1+wylie2 in our magic table.
// Otherwise, see if we have a prefix, and if we
// do, the "a" vowel comes after wylie1. Else the
// "a" vowel comes after wylie0.
if (TibetanMachineWeb.isWylieLeft(wylie0)) {
/* is it ambiguous? */
if (TibetanMachineWeb.isWylieRight(wylie1)
&& TibetanMachineWeb.SA.equals(wylie2)) {
/* Yes, this is ambiguous. How do we handle it? See this from Andres:
I'm posting this upon David Chandler's request. According to Lobsang
Thonden in Modern Tibetan Grammar Language (page 42), with regards to
identifying the root letter in 3 lettered words there are only 23
ambiguous cases. He writes:
If the last letter is 'sa' and the first two letters are affixes, then
the SECOND ONE is the root letter in the following 9 WORDS ONLY:
gdas gnas gsas dgas dmas bdas mdas 'gas 'das
And the FIRST is the root letter in the following 14 WORDS ONLY:
rags lags nags bags bangs gangs rangs langs nangs sangs
babs rabs rams nams
As I mentioned before, I think that the best solution for now is to
hard-wire these cases. Even if the list is not exhaustive, at least
we'll have most cases covered.
*/
/* FIXME: these constants are hard-wired here,
* rather than in TibetanMachineWeb, because
* I'm lazy. */
// The condition below encodes the nine "second letter is the root" words
// listed above; every other ambiguous combination falls to the else branch,
// where the vowel follows the first letter.
if ((wylie0.equals("g") && (wylie1.equals("d") || wylie1.equals("n") || wylie1.equals("s")))
|| (wylie0.equals("d") && (wylie1.equals("g") || wylie1.equals("m")))
|| (wylie0.equals("b") && wylie1.equals("d"))
|| (wylie0.equals("m") && wylie1.equals("d"))
|| (wylie0.equals("'") && (wylie1.equals("g") || wylie1.equals("d")))) {
sb.append(wylie1
+ aVowelToUseAfter(wylie1)
+ wylie2);
} else {
sb.append(aVowelToUseAfter(wylie0)
+ unambiguousPostAVowelWylie(wylie1,
wylie2));
}
// DLC FIXME: what about ambiguity between
// wa-zur and wa? dwa vs. d.wa, e.g.?
// DLC FIXME: disambiguators are needed for
// this case too, as b.lag vs. blag
// illustrates. Use something based on this,
// from LegalTshegBar.java:
//
// boolean disambiguatorNeeded = false;
// char prefix = getPrefix();
// sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(prefix));
// if (!hasHeadLetter()) {
// if (EWC_ya == rootLetter) {
// if (isConsonantThatTakesYaBtags(prefix))
// disambiguatorNeeded = true;
// } else if (EWC_ra == rootLetter) {
// if (isConsonantThatTakesRaBtags(prefix))
// disambiguatorNeeded = true;
// } else if (EWC_la == rootLetter) {
// if (isConsonantThatTakesLaBtags(prefix))
// disambiguatorNeeded = true;
// } else if (EWC_wa == rootLetter) {
// if (isConsonantThatTakesWaZur(prefix))
// disambiguatorNeeded = true;
// }
// }
// if (disambiguatorNeeded)
// sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY);
} else {
/* no ambiguity. the "a" vowel comes after
* wylie1. */
sb.append(wylie1
+ aVowelToUseAfter(wylie1)
+ wylie2);
}
} else {
if (makeIllegalTibetanGoEndToEnd
&& !(TibetanMachineWeb.isWylieRight(wylie1)
&& TibetanMachineWeb.isWylieFarRight(wylie2))) {
/* handle skaskaska, e.g. */
sb.append(aVowelToUseAfter(wylie0)
+ wylie1
+ aVowelToUseAfter(wylie1)
+ wylie2
+ aVowelToUseAfter(wylie2));
} else {
/* no ambiguity. the "a" vowel comes after
* wylie0. */
sb.append(aVowelToUseAfter(wylie0)
+ unambiguousPostAVowelWylie(wylie1,
wylie2));
}
}
} else {
/* If size==4, then we assume this is legal. If
* size==5, anything will do! So assume we have a
* prefix, a root letter, a suffix, and a postsuffix.
* The "a" vowel comes after the root letter. */
sb.append(aVowelToUseAfter(lastWylie)
+ unambiguousPostAVowelWylie(wylie1,
wylie2));
}
return sb.toString();
}
}
/**
 * Gets the Extended Wylie for a list of glyphs without adding an 'a' vowel.
 * The glyphs, which make up a partial or complete syllable, are
 * transliterated one after another and concatenated. No 'a' vowel is
 * inserted because the glyph list is assumed to contain some other
 * vowel already; if it does not, this method should not be called.
 * The disambiguating key is written between a "g" and a following "y",
 * and before an achen that follows a glyph with non-empty Wylie.
 *
 * @param glyphList a list of TibetanMachineWeb glyphs, i.e. {@link org.thdl.tib.text.DuffCode DuffCodes}
 * @return the Wylie string corresponding to this glyph list
 */
public static String withoutA(java.util.ArrayList glyphList) {
    StringBuffer out = new StringBuffer();
    String previous = "";
    final int count = glyphList.size();

    for (int pos = 0; pos < count; pos++) {
        String current = TibetanMachineWeb.getWylieForGlyph((DuffCode) glyphList.get(pos));

        //note: "g" and "y" should not be hard-coded
        // instead, TibetanMachineWeb should introduce relevant sets
        boolean needsDisambiguator =
            (previous.equals("g") && current.equals("y"))
            || (previous.length() != 0 && current.equals(TibetanMachineWeb.ACHEN));

        if (needsDisambiguator) {
            out.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY);
        }
        out.append(current);
        previous = current;
    }

    return out.toString();
}
/**
* Gets the Extended Wylie for a sequence of glyphs.
* Consonant and stack glyphs are collected in a pending list until a vowel,
* punctuation mark or bindu shows how the syllable is to be written; the
* pending glyphs are then converted with withA (no explicit vowel was seen)
* or withoutA (a vowel was seen) and appended to the output.
* @param dcs an array of glyphs
* @return the Extended Wylie corresponding to these glyphs, or null if
* dcs is empty or no Wylie was produced
*/
public static String getWylie(DuffCode[] dcs) {
if (dcs.length == 0)
return null;
char ch;
String wylie;
// glyphs of the current syllable that have not been written out yet
ArrayList glyphList = new ArrayList();
// true while the pending glyphs have not been followed by an explicit vowel
boolean needsVowel = true;
// true right after a vowel has been appended to the output
boolean isLastVowel = false;
int start = 0;
StringBuffer wylieBuffer = new StringBuffer();
// NOTE(review): the loop header below is damaged -- its condition, its increment
// and the start of the loop body (including the assignment of ch and the test
// that opens the branch closed by "} else {" further down) are missing.
// Restore from version control before editing this method.
for (int i=start; i 0 || !glyphList.isEmpty()) {
String thisPart;
if (needsVowel)
thisPart = withA(glyphList);
else
thisPart = withoutA(glyphList);
wylieBuffer.append(thisPart);
glyphList.clear();
needsVowel = true;
isLastVowel = false;
}
wylieBuffer.append(ch);
} else {
wylie = TibetanMachineWeb.getWylieForGlyph(dcs[i]);
boolean containsBindu = false;
// A trailing bindu on a multi-character Wylie string is split off here and
// written out after the rest of the glyph has been handled (see the
// containsBindu block at the end of this branch).
if (wylie.length() > 1 && wylie.charAt(wylie.length()-1) == TibetanMachineWeb.BINDU) {
char[] cArray = wylie.toCharArray();
wylie = new String(cArray, 0, wylie.length()-1);
containsBindu = true;
}
process_block: {
if (TibetanMachineWeb.isWyliePunc(wylie)) {
// Punctuation ends the syllable: write out the pending glyphs, then the
// punctuation itself.
isLastVowel = false;
if (glyphList.isEmpty()) {
wylieBuffer.append(wylie);
} else {
String thisPart;
if (needsVowel)
thisPart = withA(glyphList);
else
thisPart = withoutA(glyphList);
wylieBuffer.append(thisPart);
wylieBuffer.append(wylie); //append the punctuation
glyphList.clear();
}
needsVowel = true; //next consonants are syllable onset, so we are awaiting vowel
} else if (TibetanMachineWeb.isWylieChar(wylie)) {
//isChar must come before isVowel because ACHEN has priority over WYLIE_aVOWEL
isLastVowel = false;
glyphList.add(dcs[i]);
} else if (TibetanMachineWeb.isWylieVowel(wylie)) {
if (isLastVowel) {
// Two vowels in a row. If the output currently ends in A_VOWEL and this
// vowel is i_VOWEL or reverse_i_VOWEL, the pair is rewritten as I_VOWEL or
// reverse_I_VOWEL; otherwise the vowel is simply appended.
int len = wylieBuffer.length();
int A_len = TibetanMachineWeb.A_VOWEL.length();
// NOTE(review): this substring call sits outside the try block below, so the
// catch cannot guard it; an output shorter than A_VOWEL would throw from here.
if (wylieBuffer.substring(len-A_len).equals(TibetanMachineWeb.A_VOWEL)) {
try {
if (wylie.equals(TibetanMachineWeb.i_VOWEL)) {
wylieBuffer.delete(len-A_len, len);
wylieBuffer.append(TibetanMachineWeb.I_VOWEL);
isLastVowel = false;
break process_block;
} else if (wylie.equals(TibetanMachineWeb.reverse_i_VOWEL)) {
wylieBuffer.delete(len-A_len, len);
wylieBuffer.append(TibetanMachineWeb.reverse_I_VOWEL);
isLastVowel = false;
break process_block;
}
}
catch (StringIndexOutOfBoundsException se) {
ThdlDebug.noteIffyCode();
}
wylieBuffer.append(wylie); //append current vowel
isLastVowel = false;
} else
wylieBuffer.append(wylie); //append current vowel
} else {
int glyphCount = glyphList.size();
boolean insertDisAmbig = false;
if (0 != glyphCount) {
DuffCode top_dc = (DuffCode)glyphList.get(glyphCount-1);
String top_wylie = TibetanMachineWeb.getWylieForGlyph(top_dc);
// An achen directly before the vowel is removed from the pending glyphs
// (presumably it only carries the vowel -- confirm). If other glyphs come
// before it, a disambiguating key is written ahead of the vowel.
if (top_wylie.equals(TibetanMachineWeb.ACHEN)) {
glyphList.remove(glyphCount-1);
if (glyphCount-1 == 0) {
top_dc = null;
} else {
insertDisAmbig = true;
top_dc = (DuffCode)glyphList.get(glyphCount-2);
}
}
if (top_dc == null || !TibetanMachineWeb.getWylieForGlyph(top_dc).equals(TibetanMachineWeb.ACHUNG)) {
String thisPart = withoutA(glyphList);
wylieBuffer.append(thisPart); //append consonants in glyphList
} else {
// The last pending glyph is an achung: the glyphs before it (if any) are
// written with withA, then the achung itself.
glyphCount = glyphList.size();
glyphList.remove(glyphCount-1);
if (glyphCount-1 != 0) {
String thisPart = withA(glyphList);
wylieBuffer.append(thisPart);
}
wylieBuffer.append(TibetanMachineWeb.ACHUNG);
}
}
if (insertDisAmbig)
wylieBuffer.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY);
wylieBuffer.append(wylie); //append vowel
glyphList.clear();
isLastVowel = true;
needsVowel = false;
}
} else { //must be a stack
isLastVowel = false;
glyphList.add(dcs[i]);
}
}
// Write the bindu that was split off above, after flushing any glyphs
// that are still pending.
if (containsBindu) {
isLastVowel = false;
wylieBuffer.append(withoutA(glyphList));
wylieBuffer.append(TibetanMachineWeb.BINDU); //append the bindu
glyphList.clear();
}
}
}
//replace TMW with Wylie
// (glyphs still pending once the input is used up are written out here)
if (!glyphList.isEmpty()) {
String thisPart;
if (needsVowel)
thisPart = withA(glyphList);
else
thisPart = withoutA(glyphList);
wylieBuffer.append(thisPart);
}
if (wylieBuffer.length() > 0)
return wylieBuffer.toString();
else
return null;
}
}