Revamped the "Tools>Convert Tibetan To Wylie" feature that

converts TibetanMachineWeb glyphs to THDL Wylie.  Three-glyph and
four-glyph sequences with implicit "a" vowels are now handled
correctly, except for disambiguation w.r.t. things like b-la-g
vs. bla-g and d-wa vs. dwa.

pa'am, pa'ang etc. now work too.

Illegal Tibetan sequences now become very ugly, but "correct" Wylie.
Correct in the sense that converting it back to glyphs should get you
the glyphs you started with.

I also made a change to TibetanMachineWeb.java that I hope will clear
up problems with this feature when keyboards other than "Extended
Wylie" are selected.

Took nga out of the farRightSet [postsuffixes]; only da and sa belong
there, right?

I tried to get the system in a state such that I could run automated
tests of this stuff, but I ran into difficulties.  I have some manual
test cases; ask if you're interested.
This commit is contained in:
dchandler 2003-03-30 02:31:16 +00:00
parent 2b81020b0e
commit 58f7371e66
2 changed files with 459 additions and 290 deletions

View file

@ -10,7 +10,7 @@ License for the specific terms governing rights and limitations under the
License. License.
The Initial Developer of this software is the Tibetan and Himalayan Digital The Initial Developer of this software is the Tibetan and Himalayan Digital
Library (THDL). Portions created by the THDL are Copyright 2001 THDL. Library (THDL). Portions created by the THDL are Copyright 2001-2003 THDL.
All Rights Reserved. All Rights Reserved.
Contributor(s): ______________________________________. Contributor(s): ______________________________________.
@ -319,7 +319,7 @@ public class TibTextUtils {
} }
else { //could not convert - throw exception else { //could not convert - throw exception
if (start+5 < wylie.length()) if (start+5 < wylie.length())
System.out.println("Bad wylie: "+wylie.substring(start,5)); System.out.println("Bad wylie: "+wylie.substring(start,5)); // FIXME: we're printing to stdout!
else else
System.out.println("Bad wylie: "+wylie.substring(start)); System.out.println("Bad wylie: "+wylie.substring(start));
throw new InvalidWylieException(wylie, start); throw new InvalidWylieException(wylie, start);
@ -752,6 +752,39 @@ public class TibTextUtils {
return null; return null;
} }
/**
* True if you want TibetanMachineWeb-to-Extended-Wylie conversion
* to produce Wylie that, if typed, will produce the same sequence
* of TibetanMachineWeb glyphs. Without it, converting the glyphs
* you get from typing jskad, skaska, skaskaska, skaskaskaska,
* etc. will not give you Wylie, that, if typed in again, will
* produce the original glyphs. Hence, if this is true, then you
* get working, end-to-end Wylie for syntactically illegal
* sequences of glyphs. */
private static final boolean makeIllegalTibetanGoEndToEnd = true;
/**
 * Chooses the implicit-vowel text to emit right after a piece of Wylie.
 *
 * @param wylie the Wylie that the vowel would follow
 * @return the empty string if wylie equals TibetanMachineWeb.ACHEN
 * (it already is the "a"), otherwise TibetanMachineWeb.WYLIE_aVOWEL
 */
private static String aVowelToUseAfter(String wylie) {
    boolean alreadyAchen = wylie.equals(TibetanMachineWeb.ACHEN);
    return alreadyAchen ? "" : TibetanMachineWeb.WYLIE_aVOWEL;
}
/**
 * Concatenates the two pieces of Wylie that come after the "a" vowel,
 * putting the disambiguating key between them when they would
 * otherwise be misread.
 *
 * Type "lard" vs. "lar.d", and you'll see the need for this
 * disambiguation of suffix and postsuffix.  sa doesn't take any head
 * letters, so only da needs to be considered.
 *
 * @param wylie1 the Wylie directly after the vowel
 * @param wylie2 the Wylie after wylie1
 * @return wylie1 followed by wylie2, with
 * TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY in between iff wylie1
 * is a head letter (per TibetanMachineWeb.isWylieTop) and wylie2 is "d"
 */
private static String unambiguousPostAVowelWylie(String wylie1,
                                                 String wylie2) {
    if (TibetanMachineWeb.isWylieTop(wylie1)
        && wylie2.equals(/* FIXME: hard-coded */ "d")) {
        // Go through a char local so the key is converted to text
        // exactly as a one-element char array would convert it.
        final char key = TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY;
        return wylie1 + String.valueOf(key) + wylie2;
    }
    // No head-letter/da clash: nothing goes between the two pieces.
    return wylie1 + wylie2;
}
/** /**
* Scans a list of glyphs and returns an Extended Wylie string with 'a' inserted. * Scans a list of glyphs and returns an Extended Wylie string with 'a' inserted.
* Passed a list of TibetanMachineWeb glyphs that constitute a partial * Passed a list of TibetanMachineWeb glyphs that constitute a partial
@ -760,126 +793,256 @@ public class TibTextUtils {
* of Wylie corresponding to this sequence. This method is used * of Wylie corresponding to this sequence. This method is used
* heavily during TibetanMachineWeb to Extended Wylie conversion, * heavily during TibetanMachineWeb to Extended Wylie conversion,
* since there is no glyph corresponding to the Extended Wylie 'a' vowel. * since there is no glyph corresponding to the Extended Wylie 'a' vowel.
* @param glyphList a list of TibetanMachine glyphs, i.e. {@link org.thdl.tib.text.DuffCode DuffCodes}. * @param glyphList a list of TibetanMachineWeb glyphs, i.e. {@link
* org.thdl.tib.text.DuffCode DuffCodes}. Pass in an ArrayList if you
* care at all for speed.
* @return the Wylie string corresponding to this glyph list, with 'a' inserted. * @return the Wylie string corresponding to this glyph list, with 'a' inserted.
*/ */
public static String withA(java.util.List glyphList) { public static String withA(java.util.List glyphList) {
StringBuffer sb = new StringBuffer(); StringBuffer sb = new StringBuffer();
Iterator iter = glyphList.iterator();
int size = glyphList.size(); int size = glyphList.size();
DuffCode dc;
String wylie; String wylie;
String lastWylie = new String(); String lastWylie = "";
switch (size) { switch (size) {
case 0: case 0:
return ""; return "";
case 1: //only one character: 'a' goes after it case 1: //only one glyph: 'a' goes after it
dc = (DuffCode)iter.next(); wylie = TibetanMachineWeb.getWylieForGlyph((DuffCode)glyphList.get(0));
wylie = TibetanMachineWeb.getWylieForGlyph(dc); sb.append(wylie);
sb.append(wylie); sb.append(aVowelToUseAfter(wylie));
if (!wylie.equals(TibetanMachineWeb.ACHEN))
sb.append(TibetanMachineWeb.WYLIE_aVOWEL);
return sb.toString(); return sb.toString();
case 2: //two characters: 'a' either goes after first or after both case 2: //two glyphs: 'a' either goes after first or after both
dc = (DuffCode)iter.next(); lastWylie = TibetanMachineWeb.getWylieForGlyph((DuffCode)glyphList.get(0));
lastWylie = TibetanMachineWeb.getWylieForGlyph(dc); sb.append(lastWylie);
sb.append(lastWylie); wylie = TibetanMachineWeb.getWylieForGlyph((DuffCode)glyphList.get(1));
dc = (DuffCode)iter.next(); if (TibetanMachineWeb.isWylieRight(wylie)) {
wylie = TibetanMachineWeb.getWylieForGlyph(dc); sb.append(aVowelToUseAfter(lastWylie));
if (TibetanMachineWeb.isWylieRight(wylie)) { sb.append(wylie);
if (!lastWylie.equals(TibetanMachineWeb.ACHEN)) } else {
sb.append(TibetanMachineWeb.WYLIE_aVOWEL); /* handle illegal two-glyph combinations,
* e.g., skaska */
if (makeIllegalTibetanGoEndToEnd
&& !TibetanMachineWeb.isWylieLeft(lastWylie)) {
sb.append(aVowelToUseAfter(lastWylie));
}
sb.append(wylie); // FIXME: "g" and "y" should not be hard-coded here.
} // Instead, TibetanMachineWeb should introduce relevant sets
else {
//note: "g" and "y" should not be hard-coded in DuffPane if (lastWylie.equals("g") && wylie.equals("y"))
// instead, TibetanMachineWeb should introduce relevant sets sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY);
if (lastWylie.equals("g") && wylie.equals("y")) if (!wylie.equals(TibetanMachineWeb.ACHEN)) {
sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY); sb.append(wylie);
sb.append(TibetanMachineWeb.WYLIE_aVOWEL);
} else {
sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY);
sb.append(wylie);
}
}
return sb.toString();
if (!wylie.equals(TibetanMachineWeb.ACHEN)) { default:
sb.append(wylie); /* Three or more characters: 'a' goes before last two,
sb.append(TibetanMachineWeb.WYLIE_aVOWEL); * between last two, or in final position, unless we have
} * something like pa'am, in which case the vowel comes
else { * before the first ACHEN. */
sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY);
sb.append(wylie);
}
}
return sb.toString();
default: //three or more characters: 'a' goes before last two, between last two, or in final position /* First, allow for pa'am, and even pa'am'ang, and
int i = 0; * even bskyars'am'ang. 'i, 'o, 'i, 'u, etc. will not
* occur because this is a call to withA, so vowels
* aren't in the glyphList. We will look at the end
* of the glyphList (and no, with an ArrayList, this
* is not O(glyphList.size()), it is O(1)) and work
* our way backward, building up tailEndWylie as we
* go. */
{
StringBuffer tailEndWylie = null;
int effectiveSize = size - 2;
while (effectiveSize >= 0
&& TibetanMachineWeb.getWylieForGlyph((DuffCode)glyphList.get(effectiveSize)).equals(TibetanMachineWeb.ACHUNG)) {
if (null == tailEndWylie) tailEndWylie = new StringBuffer();
// prepend:
tailEndWylie.insert(0,
TibetanMachineWeb.ACHUNG
+ aVowelToUseAfter(TibetanMachineWeb.ACHUNG)
+ TibetanMachineWeb.getWylieForGlyph((DuffCode)glyphList.get(effectiveSize + 1)));
effectiveSize -= 2;
}
if (null != tailEndWylie) {
return (withA(glyphList.subList(0, effectiveSize + 2))
+ tailEndWylie.toString());
}
}
while (iter.hasNext() && i+2 < size) { if (makeIllegalTibetanGoEndToEnd
dc = (DuffCode)iter.next(); && (size > 4 // this is too many glyphs to be legal
wylie = TibetanMachineWeb.getWylieForGlyph(dc); // this is illegal because it doesn't begin
// with a prefix:
|| (size == 4
&& (!TibetanMachineWeb.isWylieLeft(TibetanMachineWeb.getWylieForGlyph((DuffCode)glyphList.get(0)))
// this is illegal because it doesn't have a
// suffix in the proper place, e.g. mjskad:
|| !TibetanMachineWeb.isWylieRight(TibetanMachineWeb.getWylieForGlyph((DuffCode)glyphList.get(size - 2)))
// this is illegal because it doesn't have a
// postsuffix in the proper place,
// e.g. 'lan.g, which would otherwise become
// 'lang (with nga, not na and then ga):
|| !TibetanMachineWeb.isWylieFarRight(TibetanMachineWeb.getWylieForGlyph((DuffCode)glyphList.get(size - 1))))))) {
for (int i = 0; i < size; i++) {
wylie = TibetanMachineWeb.getWylieForGlyph((DuffCode)glyphList.get(i));
if ((lastWylie.equals("g") && wylie.equals("y"))
|| (i != 0 && wylie.equals(TibetanMachineWeb.ACHEN)))
sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY);
if (lastWylie.equals("g") && wylie.equals("y") sb.append(wylie + aVowelToUseAfter(wylie));
|| !lastWylie.equals("") && wylie.equals(TibetanMachineWeb.ACHEN)) lastWylie = wylie;
sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY); }
return sb.toString();
}
sb.append(wylie); /* Else, chew up all the glyphs except for the last two. Then decide. */
lastWylie = wylie; int i = 0;
i++; while (i+2 < size) {
} wylie = TibetanMachineWeb.getWylieForGlyph((DuffCode)glyphList.get(i));
if ((lastWylie.equals("g") && wylie.equals("y"))
|| (i != 0 && wylie.equals(TibetanMachineWeb.ACHEN)))
sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY);
DuffCode dc1, dc2; sb.append(wylie);
String wylie1, wylie2; lastWylie = wylie;
i++;
}
dc1 = (DuffCode)iter.next(); String wylie1
wylie1 = TibetanMachineWeb.getWylieForGlyph(dc1); = TibetanMachineWeb.getWylieForGlyph((DuffCode)glyphList.get(i));
String wylie2
dc2 = (DuffCode)iter.next(); = TibetanMachineWeb.getWylieForGlyph((DuffCode)glyphList.get(i + 1));
wylie2 = TibetanMachineWeb.getWylieForGlyph(dc2);
if (TibetanMachineWeb.isWylieLeft(lastWylie) && TibetanMachineWeb.isWylieRight(wylie2)) { if (size == 3) {
if (lastWylie.equals("g") && wylie1.equals("y")) String wylie0 = lastWylie;
sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY); // Let's see if wylie0+wylie1+wylie2 is ambiguous
// -- if wylie0 could be a prefix and if wylie1
// could be a suffix, and if wylie2 is "s". If
// it's ambiguous, let's look up
// wylie0+wylie1+wylie2 in our magic table.
// Otherwise, see if we have a prefix, and if we
// do, the "a" vowel comes after wylie1. Else the
// "a" vowel comes after wylie0.
if (TibetanMachineWeb.isWylieLeft(wylie0)) {
/* is it ambiguous? */
if (TibetanMachineWeb.isWylieRight(wylie1)
&& TibetanMachineWeb.SA.equals(wylie2)) {
/* Yes, this is ambiguous. How do we handle it? See this from Andres:
if (!wylie1.equals(TibetanMachineWeb.ACHEN)) { I'm posting this upon David Chandler's request. According to Lobsang
sb.append(wylie1); Thonden in Modern Tibetan Grammar Language (page 42), with regards to
sb.append(TibetanMachineWeb.WYLIE_aVOWEL); identifying the root letter in 3 lettered words there are only 23
} ambiguous cases. He writes:
else {
sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY);
sb.append(wylie1);
}
sb.append(wylie2); If the last letter is 'sa' and the first two letters are affixes, then
} the SECOND ONE is the root letter in the following 9 WORDS ONLY:
else if (TibetanMachineWeb.isWylieRight(wylie1) && TibetanMachineWeb.isWylieFarRight(wylie2)) {
if (!lastWylie.equals(TibetanMachineWeb.ACHEN))
sb.append(TibetanMachineWeb.WYLIE_aVOWEL);
sb.append(wylie1); gdas gnas gsas dgas dmas bdas mdas 'gas 'das
sb.append(wylie2);
}
else {
sb.append(wylie1);
if (wylie1.equals("g") && wylie2.equals("y"))
sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY);
if (!wylie2.equals(TibetanMachineWeb.ACHEN)) { And the FIRST is the root letter in the following 14 WORDS ONLY:
sb.append(wylie2);
sb.append(TibetanMachineWeb.WYLIE_aVOWEL);
}
else {
sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY);
sb.append(wylie2);
}
}
return sb.toString(); rags lags nags bags bangs gangs rangs langs nangs sangs
} babs rabs rams nams
As I mentioned before, I think that the best solution for now is to
hard-wire these cases. Even if the list is not exhaustive, at least
we'll have most cases covered.
*/
/* FIXME: these constants are hard-wired here,
* rather than in TibetanMachineWeb, because
* I'm lazy. */
if ((wylie0.equals("g") && (wylie1.equals("d") || wylie1.equals("n") || wylie1.equals("s")))
|| (wylie0.equals("d") && (wylie1.equals("g") || wylie1.equals("m")))
|| (wylie0.equals("b") && wylie1.equals("d"))
|| (wylie0.equals("m") && wylie1.equals("d"))
|| (wylie0.equals("'") && (wylie1.equals("g") || wylie1.equals("d")))) {
sb.append(wylie1
+ aVowelToUseAfter(wylie1)
+ wylie2);
} else {
sb.append(aVowelToUseAfter(wylie0)
+ unambiguousPostAVowelWylie(wylie1,
wylie2));
}
// DLC FIXME: what about ambiguity between
// wa-zur and wa? dwa vs. d.wa, e.g.?
// DLC FIXME: disambiguators are needed for
// this case too, as b.lag vs. blag
// illustrates. Use something based on this,
// from LegalTshegBar.java:
//
// boolean disambiguatorNeeded = false;
// char prefix = getPrefix();
// sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(prefix));
// if (!hasHeadLetter()) {
// if (EWC_ya == rootLetter) {
// if (isConsonantThatTakesYaBtags(prefix))
// disambiguatorNeeded = true;
// } else if (EWC_ra == rootLetter) {
// if (isConsonantThatTakesRaBtags(prefix))
// disambiguatorNeeded = true;
// } else if (EWC_la == rootLetter) {
// if (isConsonantThatTakesLaBtags(prefix))
// disambiguatorNeeded = true;
// } else if (EWC_wa == rootLetter) {
// if (isConsonantThatTakesWaZur(prefix))
// disambiguatorNeeded = true;
// }
// }
// if (disambiguatorNeeded)
// sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY);
} else {
/* no ambiguity. the "a" vowel comes after
* wylie1. */
sb.append(wylie1
+ aVowelToUseAfter(wylie1)
+ wylie2);
}
} else {
if (makeIllegalTibetanGoEndToEnd
&& !(TibetanMachineWeb.isWylieRight(wylie1)
&& TibetanMachineWeb.isWylieFarRight(wylie2))) {
/* handle skaskaska, e.g. */
sb.append(aVowelToUseAfter(wylie0)
+ wylie1
+ aVowelToUseAfter(wylie1)
+ wylie2
+ aVowelToUseAfter(wylie2));
} else {
/* no ambiguity. the "a" vowel comes after
* wylie0. */
sb.append(aVowelToUseAfter(wylie0)
+ unambiguousPostAVowelWylie(wylie1,
wylie2));
}
}
} else {
/* If size==4, then we assume this is legal. If
* size==5, anything will do! So assume we have a
* prefix, a root letter, a suffix, and a postsuffix.
* The "a" vowel comes after the root letter. */
sb.append(aVowelToUseAfter(lastWylie)
+ unambiguousPostAVowelWylie(wylie1,
wylie2));
}
return sb.toString();
}
} }
/** /**
@ -891,10 +1054,10 @@ public class TibTextUtils {
* some other vowel. If the glyph list does not already contain a vowel, * some other vowel. If the glyph list does not already contain a vowel,
* then this method should not be called. * then this method should not be called.
* *
* @param glyphList a list of TibetanMachine glyphs, i.e. {@link org.thdl.tib.text.DuffCode DuffCodes} * @param glyphList a list of TibetanMachineWeb glyphs, i.e. {@link org.thdl.tib.text.DuffCode DuffCodes}
* @return the Wylie string corresponding to this glyph list * @return the Wylie string corresponding to this glyph list
*/ */
public static String withoutA(java.util.List glyphList) { public static String withoutA(java.util.ArrayList glyphList) {
StringBuffer sb = new StringBuffer(); StringBuffer sb = new StringBuffer();
Iterator iter = glyphList.iterator(); Iterator iter = glyphList.iterator();
DuffCode dc; DuffCode dc;
@ -908,9 +1071,10 @@ public class TibTextUtils {
//note: "g" and "y" should not be hard-coded //note: "g" and "y" should not be hard-coded
// instead, TibetanMachineWeb should introduce relevant sets // instead, TibetanMachineWeb should introduce relevant sets
if (lastWylie.equals("g") && currWylie.equals("y") if ((lastWylie.equals("g") && currWylie.equals("y"))
|| !lastWylie.equals("") && currWylie.equals(TibetanMachineWeb.ACHEN)) || (!lastWylie.equals("")
sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY); && currWylie.equals(TibetanMachineWeb.ACHEN)))
sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY);
sb.append(currWylie); sb.append(currWylie);
@ -921,7 +1085,7 @@ public class TibTextUtils {
} }
/** /**
* Gets the Extended Wylie for a set of glyphs. * Gets the Extended Wylie for a sequence of glyphs.
* @param dcs an array of glyphs * @param dcs an array of glyphs
* @return the Extended Wylie corresponding to these glyphs * @return the Extended Wylie corresponding to these glyphs
*/ */
@ -932,166 +1096,165 @@ public class TibTextUtils {
char ch; char ch;
String wylie; String wylie;
List glyphList = new ArrayList(); ArrayList glyphList = new ArrayList();
boolean needsVowel = true; boolean needsVowel = true;
boolean isLastVowel = false; boolean isLastVowel = false;
int start = 0; int start = 0;
StringBuffer wylieBuffer = new StringBuffer(); StringBuffer wylieBuffer = new StringBuffer();
for (int i=start; i<dcs.length; i++) { for (int i=start; i<dcs.length; i++) {
ch = dcs[i].character; ch = dcs[i].character;
int k = dcs[i].charNum; int k = dcs[i].charNum;
// int fontNum = dcs[i].fontNum; // int fontNum = dcs[i].fontNum;
if (k < 32) { if (k < 32) {
if (wylieBuffer.length() > 0 || !glyphList.isEmpty()) { if (wylieBuffer.length() > 0 || !glyphList.isEmpty()) {
if (needsVowel) String thisPart;
wylieBuffer.append(withA(glyphList)); if (needsVowel)
else thisPart = withA(glyphList);
wylieBuffer.append(withoutA(glyphList)); else
thisPart = withoutA(glyphList);
wylieBuffer.append(thisPart);
glyphList.clear(); glyphList.clear();
needsVowel = true; needsVowel = true;
isLastVowel = false; isLastVowel = false;
} }
wylieBuffer.append(ch); wylieBuffer.append(ch);
} } else {
else { wylie = TibetanMachineWeb.getWylieForGlyph(dcs[i]);
wylie = TibetanMachineWeb.getWylieForGlyph(dcs[i]);
boolean containsBindu = false; boolean containsBindu = false;
if (wylie.length() > 1 && wylie.charAt(wylie.length()-1) == TibetanMachineWeb.BINDU) { if (wylie.length() > 1 && wylie.charAt(wylie.length()-1) == TibetanMachineWeb.BINDU) {
char[] cArray = wylie.toCharArray(); char[] cArray = wylie.toCharArray();
wylie = new String(cArray, 0, wylie.length()-1); wylie = new String(cArray, 0, wylie.length()-1);
containsBindu = true; containsBindu = true;
} }
process_block: { process_block: {
if (TibetanMachineWeb.isWyliePunc(wylie)) { if (TibetanMachineWeb.isWyliePunc(wylie)) {
isLastVowel = false; isLastVowel = false;
if (glyphList.isEmpty()) if (glyphList.isEmpty()) {
wylieBuffer.append(wylie); wylieBuffer.append(wylie);
} else {
String thisPart;
if (needsVowel)
thisPart = withA(glyphList);
else
thisPart = withoutA(glyphList);
wylieBuffer.append(thisPart);
else { wylieBuffer.append(wylie); //append the punctuation
if (needsVowel)
wylieBuffer.append(withA(glyphList));
else
wylieBuffer.append(withoutA(glyphList));
wylieBuffer.append(wylie); //append the punctuation
glyphList.clear();
}
needsVowel = true; //next consonants are syllable onset, so we are awaiting vowel
}
glyphList.clear();
}
needsVowel = true; //next consonants are syllable onset, so we are awaiting vowel
} else if (TibetanMachineWeb.isWylieChar(wylie)) {
//isChar must come before isVowel because ACHEN has priority over WYLIE_aVOWEL //isChar must come before isVowel because ACHEN has priority over WYLIE_aVOWEL
else if (TibetanMachineWeb.isWylieChar(wylie)) { isLastVowel = false;
isLastVowel = false; glyphList.add(dcs[i]);
glyphList.add(dcs[i]); } else if (TibetanMachineWeb.isWylieVowel(wylie)) {
} if (isLastVowel) {
int len = wylieBuffer.length();
int A_len = TibetanMachineWeb.A_VOWEL.length();
else if (TibetanMachineWeb.isWylieVowel(wylie)) { if (wylieBuffer.substring(len-A_len).equals(TibetanMachineWeb.A_VOWEL)) {
if (isLastVowel) { try {
int len = wylieBuffer.length(); if (wylie.equals(TibetanMachineWeb.i_VOWEL)) {
int A_len = TibetanMachineWeb.A_VOWEL.length(); wylieBuffer.delete(len-A_len, len);
wylieBuffer.append(TibetanMachineWeb.I_VOWEL);
isLastVowel = false;
break process_block;
} else if (wylie.equals(TibetanMachineWeb.reverse_i_VOWEL)) {
wylieBuffer.delete(len-A_len, len);
wylieBuffer.append(TibetanMachineWeb.reverse_I_VOWEL);
isLastVowel = false;
break process_block;
}
}
catch (StringIndexOutOfBoundsException se) {
ThdlDebug.noteIffyCode();
}
if (wylieBuffer.substring(len-A_len).equals(TibetanMachineWeb.A_VOWEL)) { wylieBuffer.append(wylie); //append current vowel
try { isLastVowel = false;
if (wylie.equals(TibetanMachineWeb.i_VOWEL)) { } else
wylieBuffer.delete(len-A_len, len); wylieBuffer.append(wylie); //append current vowel
wylieBuffer.append(TibetanMachineWeb.I_VOWEL); } else {
isLastVowel = false; int glyphCount = glyphList.size();
break process_block; boolean insertDisAmbig = false;
}
else if (wylie.equals(TibetanMachineWeb.reverse_i_VOWEL)) {
wylieBuffer.delete(len-A_len, len);
wylieBuffer.append(TibetanMachineWeb.reverse_I_VOWEL);
isLastVowel = false;
break process_block;
}
}
catch (StringIndexOutOfBoundsException se) {
ThdlDebug.noteIffyCode();
}
wylieBuffer.append(wylie); //append current vowel if (0 != glyphCount) {
isLastVowel = false; DuffCode top_dc = (DuffCode)glyphList.get(glyphCount-1);
} String top_wylie = TibetanMachineWeb.getWylieForGlyph(top_dc);
else
wylieBuffer.append(wylie); //append current vowel
}
else {
int glyphCount = glyphList.size();
boolean insertDisAmbig = false;
if (0 != glyphCount) { if (top_wylie.equals(TibetanMachineWeb.ACHEN)) {
DuffCode top_dc = (DuffCode)glyphList.get(glyphCount-1); glyphList.remove(glyphCount-1);
String top_wylie = TibetanMachineWeb.getWylieForGlyph(top_dc);
if (top_wylie.equals(TibetanMachineWeb.ACHEN)) {
glyphList.remove(glyphCount-1);
if (glyphCount-1 == 0) if (glyphCount-1 == 0) {
top_dc = null; top_dc = null;
else { } else {
insertDisAmbig = true; insertDisAmbig = true;
top_dc = (DuffCode)glyphList.get(glyphCount-2); top_dc = (DuffCode)glyphList.get(glyphCount-2);
} }
} }
if (top_dc == null || !TibetanMachineWeb.getWylieForGlyph(top_dc).equals(TibetanMachineWeb.ACHUNG)) if (top_dc == null || !TibetanMachineWeb.getWylieForGlyph(top_dc).equals(TibetanMachineWeb.ACHUNG)) {
wylieBuffer.append(withoutA(glyphList)); //append consonants in glyphList String thisPart = withoutA(glyphList);
else { wylieBuffer.append(thisPart); //append consonants in glyphList
glyphCount = glyphList.size(); } else {
glyphList.remove(glyphCount-1); glyphCount = glyphList.size();
glyphList.remove(glyphCount-1);
if (glyphCount-1 != 0) if (glyphCount-1 != 0) {
wylieBuffer.append(withA(glyphList)); String thisPart = withA(glyphList);
wylieBuffer.append(thisPart);
}
wylieBuffer.append(TibetanMachineWeb.ACHUNG); wylieBuffer.append(TibetanMachineWeb.ACHUNG);
} }
} }
if (insertDisAmbig) if (insertDisAmbig)
wylieBuffer.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY); wylieBuffer.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY);
wylieBuffer.append(wylie); //append vowel wylieBuffer.append(wylie); //append vowel
glyphList.clear(); glyphList.clear();
isLastVowel = true; isLastVowel = true;
needsVowel = false; needsVowel = false;
} }
} } else { //must be a stack
else { //must be a stack isLastVowel = false;
isLastVowel = false; glyphList.add(dcs[i]);
glyphList.add(dcs[i]); }
} }
}
if (containsBindu) { if (containsBindu) {
isLastVowel = false; isLastVowel = false;
wylieBuffer.append(withoutA(glyphList)); wylieBuffer.append(withoutA(glyphList));
wylieBuffer.append(TibetanMachineWeb.BINDU); //append the bindu wylieBuffer.append(TibetanMachineWeb.BINDU); //append the bindu
glyphList.clear(); glyphList.clear();
} }
} }
} }
//replace TMW with Wylie //replace TMW with Wylie
if (!glyphList.isEmpty()) { if (!glyphList.isEmpty()) {
if (needsVowel) String thisPart;
wylieBuffer.append(withA(glyphList)); if (needsVowel)
else thisPart = withA(glyphList);
wylieBuffer.append(withoutA(glyphList)); else
} thisPart = withoutA(glyphList);
wylieBuffer.append(thisPart);
}
if (wylieBuffer.length() > 0) if (wylieBuffer.length() > 0)
return wylieBuffer.toString(); return wylieBuffer.toString();
else else
return null; return null;
} }
} }

View file

@ -61,6 +61,7 @@ public class TibetanMachineWeb {
private static Set charSet = null; private static Set charSet = null;
private static Set vowelSet = null; private static Set vowelSet = null;
private static Set puncSet = null; private static Set puncSet = null;
private static Set topSet = null;
private static Set leftSet = null; private static Set leftSet = null;
private static Set rightSet = null; private static Set rightSet = null;
private static Set farRightSet = null; private static Set farRightSet = null;
@ -135,6 +136,10 @@ public class TibetanMachineWeb {
*/ */
public static final String ACHUNG = "'"; public static final String ACHUNG = "'";
/** /**
* the Wylie for the 28th of the 30 consonants, sa:
*/
public static final String SA = "s";
/**
* the Wylie for achen * the Wylie for achen
*/ */
public static final String ACHEN = "a"; public static final String ACHEN = "a";
@ -238,9 +243,14 @@ public class TibetanMachineWeb {
*/ */
public static final int HALF_C = 10; public static final int HALF_C = 10;
/** head letters, superscribed letters */
private static final String tops = "r,s,l";
/** prefixes */
private static final String lefts = "g,d,b,m,'"; private static final String lefts = "g,d,b,m,'";
/** suffixes */
private static final String rights = "g,ng,d,n,b,m,r,l,s,',T"; private static final String rights = "g,ng,d,n,b,m,r,l,s,',T";
private static final String farrights = "d,s,ng"; /** postsuffixes */
private static final String farrights = "d,s"; // DLC FIXME: why was nga here in past revisions?
static { static {
@ -324,10 +334,15 @@ public class TibetanMachineWeb {
} }
StringTokenizer sTok; StringTokenizer sTok;
topSet = new HashSet();
leftSet = new HashSet(); leftSet = new HashSet();
rightSet = new HashSet(); rightSet = new HashSet();
farRightSet = new HashSet(); farRightSet = new HashSet();
sTok = new StringTokenizer(tops, ",");
while (sTok.hasMoreTokens())
topSet.add(sTok.nextToken());
sTok = new StringTokenizer(lefts, ","); sTok = new StringTokenizer(lefts, ",");
while (sTok.hasMoreTokens()) while (sTok.hasMoreTokens())
leftSet.add(sTok.nextToken()); leftSet.add(sTok.nextToken());
@ -634,10 +649,7 @@ public static boolean isChar(String s) {
* Extended Wylie transliteration, false if not * Extended Wylie transliteration, false if not
*/ */
public static boolean isWylieChar(String s) { public static boolean isWylieChar(String s) {
if (charSet.contains(s)) return charSet.contains(s);
return true;
return false;
} }
/** /**
@ -648,17 +660,10 @@ public static boolean isWylieChar(String s) {
* keyboard, false if not * keyboard, false if not
*/ */
public static boolean isPunc(String s) { public static boolean isPunc(String s) {
if (currentKeyboardIsExtendedWylie()) { if (currentKeyboardIsExtendedWylie())
if (puncSet.contains(s)) return puncSet.contains(s);
return true;
else
return false;
}
else else
if (keyboard.isPunc(s)) return keyboard.isPunc(s);
return true;
else
return false;
} }
/** /**
@ -669,10 +674,7 @@ public static boolean isPunc(String s) {
* Extended Wylie transliteration, false if not * Extended Wylie transliteration, false if not
*/ */
public static boolean isWyliePunc(String s) { public static boolean isWyliePunc(String s) {
if (puncSet.contains(s)) return puncSet.contains(s);
return true;
return false;
} }
/** /**
@ -683,17 +685,10 @@ public static boolean isWyliePunc(String s) {
* keyboard, false if not * keyboard, false if not
*/ */
public static boolean isVowel(String s) { public static boolean isVowel(String s) {
if (currentKeyboardIsExtendedWylie()) { if (currentKeyboardIsExtendedWylie())
if (vowelSet.contains(s)) return vowelSet.contains(s);
return true;
else
return false;
}
else else
if (keyboard.isVowel(s)) return keyboard.isVowel(s);
return true;
else
return false;
} }
/** /**
@ -704,28 +699,23 @@ public static boolean isVowel(String s) {
* Extended Wylie transliteration, false if not * Extended Wylie transliteration, false if not
*/ */
public static boolean isWylieVowel(String s) { public static boolean isWylieVowel(String s) {
if (vowelSet.contains(s)) return vowelSet.contains(s);
return true;
return false;
} }
/** /**
* Returns true iff this Wylie is valid as a leftmost character in a * Returns true iff this Wylie is valid as a leftmost character in a
* Tibetan syllable. For example, in the syllable 'brgyad', 'b' is the * Tibetan syllable. For example, in the syllable 'brgyad', 'b' is the
* leftmost character. Valid leftmost characters include g, d, b, and * leftmost character. Valid leftmost characters include g, d, b, ',
* m. * and m.
* @param s the (Wylie) string to be checked * @param s the (Wylie) string to be checked
* @return true if s is a possible leftmost character in a Tibetan * @return true if s is a possible leftmost character in a Tibetan
* syllable, false if not. */ * syllable, false if not. */
public static boolean isWylieLeft(String s) { public static boolean isWylieLeft(String s) {
if (keyboard != null) if (useReallyIffyCode) {
s = keyboard.getWylieForChar(s); if (keyboard != null)
s = keyboard.getWylieForChar(s);
if (leftSet.contains(s)) }
return true; return leftSet.contains(s);
else
return false;
} }
/** /**
@ -737,29 +727,45 @@ public static boolean isWylieLeft(String s) {
* @return true if s is a possible right character in a Tibetan * @return true if s is a possible right character in a Tibetan
* syllable, false if not. */ * syllable, false if not. */
public static boolean isWylieRight(String s) { public static boolean isWylieRight(String s) {
if (keyboard != null) if (useReallyIffyCode) {
s = keyboard.getWylieForChar(s); if (keyboard != null)
s = keyboard.getWylieForChar(s);
if (rightSet.contains(s)) }
return true; return rightSet.contains(s);
else
return false;
} }
/** /**
* Returns true iff this Wylie is valid as a leftmost character in a * Returns true iff this Wylie is valid as a postsuffix in a
* Tibetan syllable. * Tibetan syllable.
* @param s the string to be checked * @param s the string to be checked
* @return true if s is a possible leftmost character in a Tibetan * @return true if s is a possible postsuffix in a Tibetan
* syllable, false if not. */ * syllable, false if not. */
public static boolean isWylieFarRight(String s) { public static boolean isWylieFarRight(String s) {
if (keyboard != null) if (useReallyIffyCode) {
s = keyboard.getWylieForChar(s); if (keyboard != null)
s = keyboard.getWylieForChar(s);
}
return farRightSet.contains(s);
}
if (farRightSet.contains(s)) /** DLC FIXME: what is the point of this code? TibTextUtils
return true; doesn't work for TCC#1 and the like, does it? I bet this
else explains why TMW=>Wylie conversion fails when the Wylie
return false; keyboard isn't in use. */
private static final boolean useReallyIffyCode = false;
/**
* Returns true iff this Wylie is valid as a head letter in a Tibetan
* syllable.
* @param s the string to be checked
* @return true if s is a possible superscribed letter in a Tibetan
* syllable, false if not. */
public static boolean isWylieTop(String s) {
if (useReallyIffyCode) {
if (keyboard != null)
s = keyboard.getWylieForChar(s);
}
return topSet.contains(s);
} }
/** /**