I hope that Revamped the "Tools>Convert Tibetan To Wylie" feature that
converts TibetanMachineWeb glyphs to THDL Wylie. Three-glyph and four-glyph sequences with implicit "a" vowels are now handled correctly, except for disambiguation w.r.t. things like b-la-g vs. bla-g and d-wa vs. dwa. pa'am, pa'ang etc. now work too. Illegal Tibetan sequences now become very ugly, but "correct" Wylie. Correct in the sense that converting it back to glyphs should get you the glyphs you started with. I also made a change to TibetanMachineWeb.java that I hope will clear up problems with this feature when keyboards other than "Extended Wylie" are selected. Took nga out of the farRightSet [postsuffixes]; only da and sa belong there, right? I tried to get the system in a state such that I could run automated tests of this stuff, but I ran into difficulties. I have some manual test cases; ask if you're interested.
This commit is contained in:
parent
2b81020b0e
commit
58f7371e66
2 changed files with 459 additions and 290 deletions
|
@ -10,7 +10,7 @@ License for the specific terms governing rights and limitations under the
|
|||
License.
|
||||
|
||||
The Initial Developer of this software is the Tibetan and Himalayan Digital
|
||||
Library (THDL). Portions created by the THDL are Copyright 2001 THDL.
|
||||
Library (THDL). Portions created by the THDL are Copyright 2001-2003 THDL.
|
||||
All Rights Reserved.
|
||||
|
||||
Contributor(s): ______________________________________.
|
||||
|
@ -319,7 +319,7 @@ public class TibTextUtils {
|
|||
}
|
||||
else { //could not convert - throw exception
|
||||
if (start+5 < wylie.length())
|
||||
System.out.println("Bad wylie: "+wylie.substring(start,5));
|
||||
System.out.println("Bad wylie: "+wylie.substring(start,5)); // FIXME: we're printing to stdout!
|
||||
else
|
||||
System.out.println("Bad wylie: "+wylie.substring(start));
|
||||
throw new InvalidWylieException(wylie, start);
|
||||
|
@ -752,6 +752,39 @@ public class TibTextUtils {
|
|||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* True if you want TibetanMachineWeb-to-Extended-Wylie conversion
|
||||
* to produce Wylie that, if typed, will produce the same sequence
|
||||
* of TibetanMachineWeb glyphs. Without it, converting the glyphs
|
||||
* you get from typing jskad, skaska, skaskaska, skaskaskaska,
|
||||
* etc. will not give you Wylie, that, if typed in again, will
|
||||
* produce the original glyphs. Hence, if this is true, then you
|
||||
* get working, end-to-end Wylie for syntactically illegal
|
||||
* sequences of glyphs. */
|
||||
private static final boolean makeIllegalTibetanGoEndToEnd = true;
|
||||
|
||||
|
||||
/** Returns "a", unless wylie is already "a". */
|
||||
private static String aVowelToUseAfter(String wylie) {
|
||||
if (wylie.equals(TibetanMachineWeb.ACHEN))
|
||||
return "";
|
||||
else
|
||||
return TibetanMachineWeb.WYLIE_aVOWEL;
|
||||
}
|
||||
|
||||
private static String unambiguousPostAVowelWylie(String wylie1,
|
||||
String wylie2) {
|
||||
String disambiguator = "";
|
||||
// type "lard" vs. "lar.d", and you'll see the need for this
|
||||
// disambiguation of suffix and postsuffix. sa doesn't take
|
||||
// any head letters, so only da needs to be considered.
|
||||
if (TibetanMachineWeb.isWylieTop(wylie1)
|
||||
&& wylie2.equals(/* FIXME: hard-coded */ "d"))
|
||||
disambiguator
|
||||
= new String(new char[] { TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY });
|
||||
return wylie1 + disambiguator + wylie2;
|
||||
}
|
||||
|
||||
/**
|
||||
* Scans a list of glyphs and returns an Extended Wylie string with 'a' inserted.
|
||||
* Passed a list of TibetanMachineWeb glyphs that constitute a partial
|
||||
|
@ -760,126 +793,256 @@ public class TibTextUtils {
|
|||
* of Wylie corresponding to this sequence. This method is used
|
||||
* heavily during TibetanMachineWeb to Extended Wylie conversion,
|
||||
* since there is no glyph corresponding to the Extended Wylie 'a' vowel.
|
||||
* @param glyphList a list of TibetanMachine glyphs, i.e. {@link org.thdl.tib.text.DuffCode DuffCodes}.
|
||||
* @param glyphList a list of TibetanMachineWeb glyphs, i.e. {@link
|
||||
* org.thdl.tib.text.DuffCode DuffCodes}. Pass in an ArrayList if you
|
||||
* care at all for speed.
|
||||
* @return the Wylie string corresponding to this glyph list, with 'a' inserted.
|
||||
*/
|
||||
public static String withA(java.util.List glyphList) {
|
||||
StringBuffer sb = new StringBuffer();
|
||||
Iterator iter = glyphList.iterator();
|
||||
int size = glyphList.size();
|
||||
DuffCode dc;
|
||||
String wylie;
|
||||
String lastWylie = new String();
|
||||
String lastWylie = "";
|
||||
|
||||
switch (size) {
|
||||
case 0:
|
||||
return "";
|
||||
case 0:
|
||||
return "";
|
||||
|
||||
case 1: //only one character: 'a' goes after it
|
||||
dc = (DuffCode)iter.next();
|
||||
wylie = TibetanMachineWeb.getWylieForGlyph(dc);
|
||||
sb.append(wylie);
|
||||
if (!wylie.equals(TibetanMachineWeb.ACHEN))
|
||||
sb.append(TibetanMachineWeb.WYLIE_aVOWEL);
|
||||
case 1: //only one glyph: 'a' goes after it
|
||||
wylie = TibetanMachineWeb.getWylieForGlyph((DuffCode)glyphList.get(0));
|
||||
sb.append(wylie);
|
||||
sb.append(aVowelToUseAfter(wylie));
|
||||
|
||||
return sb.toString();
|
||||
return sb.toString();
|
||||
|
||||
case 2: //two characters: 'a' either goes after first or after both
|
||||
dc = (DuffCode)iter.next();
|
||||
lastWylie = TibetanMachineWeb.getWylieForGlyph(dc);
|
||||
sb.append(lastWylie);
|
||||
dc = (DuffCode)iter.next();
|
||||
wylie = TibetanMachineWeb.getWylieForGlyph(dc);
|
||||
if (TibetanMachineWeb.isWylieRight(wylie)) {
|
||||
if (!lastWylie.equals(TibetanMachineWeb.ACHEN))
|
||||
sb.append(TibetanMachineWeb.WYLIE_aVOWEL);
|
||||
case 2: //two glyphs: 'a' either goes after first or after both
|
||||
lastWylie = TibetanMachineWeb.getWylieForGlyph((DuffCode)glyphList.get(0));
|
||||
sb.append(lastWylie);
|
||||
wylie = TibetanMachineWeb.getWylieForGlyph((DuffCode)glyphList.get(1));
|
||||
if (TibetanMachineWeb.isWylieRight(wylie)) {
|
||||
sb.append(aVowelToUseAfter(lastWylie));
|
||||
sb.append(wylie);
|
||||
} else {
|
||||
/* handle illegal two-glyph combinations,
|
||||
* e.g., skaska */
|
||||
if (makeIllegalTibetanGoEndToEnd
|
||||
&& !TibetanMachineWeb.isWylieLeft(lastWylie)) {
|
||||
sb.append(aVowelToUseAfter(lastWylie));
|
||||
}
|
||||
|
||||
sb.append(wylie);
|
||||
}
|
||||
else {
|
||||
// FIXME: "g" and "y" should not be hard-coded here.
|
||||
// Instead, TibetanMachineWeb should introduce relevant sets
|
||||
|
||||
//note: "g" and "y" should not be hard-coded in DuffPane
|
||||
// instead, TibetanMachineWeb should introduce relevant sets
|
||||
if (lastWylie.equals("g") && wylie.equals("y"))
|
||||
sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY);
|
||||
|
||||
if (lastWylie.equals("g") && wylie.equals("y"))
|
||||
sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY);
|
||||
if (!wylie.equals(TibetanMachineWeb.ACHEN)) {
|
||||
sb.append(wylie);
|
||||
sb.append(TibetanMachineWeb.WYLIE_aVOWEL);
|
||||
} else {
|
||||
sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY);
|
||||
sb.append(wylie);
|
||||
}
|
||||
}
|
||||
return sb.toString();
|
||||
|
||||
if (!wylie.equals(TibetanMachineWeb.ACHEN)) {
|
||||
sb.append(wylie);
|
||||
sb.append(TibetanMachineWeb.WYLIE_aVOWEL);
|
||||
}
|
||||
else {
|
||||
sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY);
|
||||
sb.append(wylie);
|
||||
}
|
||||
}
|
||||
return sb.toString();
|
||||
default:
|
||||
/* Three or more characters: 'a' goes before last two,
|
||||
* between last two, or in final position, unless we have
|
||||
* something like pa'am, in which case the vowel comes
|
||||
* before the first ACHEN. */
|
||||
|
||||
default: //three or more characters: 'a' goes before last two, between last two, or in final position
|
||||
int i = 0;
|
||||
/* First, allow for pa'am, and even pa'am'ang, and
|
||||
* even bskyars'am'ang. 'i, 'o, 'i, 'u, etc. will not
|
||||
* occur because this is a call to withA, so vowels
|
||||
* aren't in the glyphList. We will look at the end
|
||||
* of the glyphList (and no, with an ArrayList, this
|
||||
* is not O(glyphList.size()), it is O(1)) and work
|
||||
* our way backward, building up tailEndWylie as we
|
||||
* go. */
|
||||
{
|
||||
StringBuffer tailEndWylie = null;
|
||||
int effectiveSize = size - 2;
|
||||
while (effectiveSize >= 0
|
||||
&& TibetanMachineWeb.getWylieForGlyph((DuffCode)glyphList.get(effectiveSize)).equals(TibetanMachineWeb.ACHUNG)) {
|
||||
if (null == tailEndWylie) tailEndWylie = new StringBuffer();
|
||||
// prepend:
|
||||
tailEndWylie.insert(0,
|
||||
TibetanMachineWeb.ACHUNG
|
||||
+ aVowelToUseAfter(TibetanMachineWeb.ACHUNG)
|
||||
+ TibetanMachineWeb.getWylieForGlyph((DuffCode)glyphList.get(effectiveSize + 1)));
|
||||
effectiveSize -= 2;
|
||||
}
|
||||
if (null != tailEndWylie) {
|
||||
return (withA(glyphList.subList(0, effectiveSize + 2))
|
||||
+ tailEndWylie.toString());
|
||||
}
|
||||
}
|
||||
|
||||
while (iter.hasNext() && i+2 < size) {
|
||||
dc = (DuffCode)iter.next();
|
||||
wylie = TibetanMachineWeb.getWylieForGlyph(dc);
|
||||
if (makeIllegalTibetanGoEndToEnd
|
||||
&& (size > 4 // this is too many glyphs to be legal
|
||||
// this is illegal because it doesn't begin
|
||||
// with a prefix:
|
||||
|| (size == 4
|
||||
&& (!TibetanMachineWeb.isWylieLeft(TibetanMachineWeb.getWylieForGlyph((DuffCode)glyphList.get(0)))
|
||||
// this is illegal because it doesn't have a
|
||||
// suffix in the proper place, e.g. mjskad:
|
||||
|| !TibetanMachineWeb.isWylieRight(TibetanMachineWeb.getWylieForGlyph((DuffCode)glyphList.get(size - 2)))
|
||||
// this is illegal because it doesn't have a
|
||||
// postsuffix in the proper place,
|
||||
// e.g. 'lan.g, which would otherwise become
|
||||
// 'lang (with nga, not na and then ga):
|
||||
|| !TibetanMachineWeb.isWylieFarRight(TibetanMachineWeb.getWylieForGlyph((DuffCode)glyphList.get(size - 1))))))) {
|
||||
for (int i = 0; i < size; i++) {
|
||||
wylie = TibetanMachineWeb.getWylieForGlyph((DuffCode)glyphList.get(i));
|
||||
if ((lastWylie.equals("g") && wylie.equals("y"))
|
||||
|| (i != 0 && wylie.equals(TibetanMachineWeb.ACHEN)))
|
||||
sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY);
|
||||
|
||||
if (lastWylie.equals("g") && wylie.equals("y")
|
||||
|| !lastWylie.equals("") && wylie.equals(TibetanMachineWeb.ACHEN))
|
||||
sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY);
|
||||
sb.append(wylie + aVowelToUseAfter(wylie));
|
||||
lastWylie = wylie;
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
sb.append(wylie);
|
||||
lastWylie = wylie;
|
||||
i++;
|
||||
}
|
||||
/* Else, chew up all the glyphs except for the last two. Then decide. */
|
||||
int i = 0;
|
||||
while (i+2 < size) {
|
||||
wylie = TibetanMachineWeb.getWylieForGlyph((DuffCode)glyphList.get(i));
|
||||
if ((lastWylie.equals("g") && wylie.equals("y"))
|
||||
|| (i != 0 && wylie.equals(TibetanMachineWeb.ACHEN)))
|
||||
sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY);
|
||||
|
||||
DuffCode dc1, dc2;
|
||||
String wylie1, wylie2;
|
||||
sb.append(wylie);
|
||||
lastWylie = wylie;
|
||||
i++;
|
||||
}
|
||||
|
||||
dc1 = (DuffCode)iter.next();
|
||||
wylie1 = TibetanMachineWeb.getWylieForGlyph(dc1);
|
||||
|
||||
dc2 = (DuffCode)iter.next();
|
||||
wylie2 = TibetanMachineWeb.getWylieForGlyph(dc2);
|
||||
String wylie1
|
||||
= TibetanMachineWeb.getWylieForGlyph((DuffCode)glyphList.get(i));
|
||||
String wylie2
|
||||
= TibetanMachineWeb.getWylieForGlyph((DuffCode)glyphList.get(i + 1));
|
||||
|
||||
if (TibetanMachineWeb.isWylieLeft(lastWylie) && TibetanMachineWeb.isWylieRight(wylie2)) {
|
||||
if (lastWylie.equals("g") && wylie1.equals("y"))
|
||||
sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY);
|
||||
if (size == 3) {
|
||||
String wylie0 = lastWylie;
|
||||
// Let's see if wylie0+wylie1+wylie2 is ambiguous
|
||||
// -- if wylie0 could be a prefix and if wylie1
|
||||
// could be a suffix, and if wylie2 is "s". If
|
||||
// it's ambigous, let's look up
|
||||
// wylie0+wylie1+wylie2 in our magic table.
|
||||
// Otherwise, see if we have a prefix, and if we
|
||||
// do, the "a" vowel comes after wylie1. Else the
|
||||
// "a" vowel comes after wylie0.
|
||||
if (TibetanMachineWeb.isWylieLeft(wylie0)) {
|
||||
/* is it ambiguous? */
|
||||
if (TibetanMachineWeb.isWylieRight(wylie1)
|
||||
&& TibetanMachineWeb.SA.equals(wylie2)) {
|
||||
/* Yes, this is ambiguous. How do we handle it? See this from Andres:
|
||||
|
||||
if (!wylie1.equals(TibetanMachineWeb.ACHEN)) {
|
||||
sb.append(wylie1);
|
||||
sb.append(TibetanMachineWeb.WYLIE_aVOWEL);
|
||||
}
|
||||
else {
|
||||
sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY);
|
||||
sb.append(wylie1);
|
||||
}
|
||||
I'm posting this upon David Chandler's request. According to Lobsang
|
||||
Thonden in Modern Tibetan Grammar Language (page 42), with regards to
|
||||
identifying the root letter in 3 lettered words there are only 23
|
||||
ambiguous cases. He writes:
|
||||
|
||||
sb.append(wylie2);
|
||||
}
|
||||
else if (TibetanMachineWeb.isWylieRight(wylie1) && TibetanMachineWeb.isWylieFarRight(wylie2)) {
|
||||
if (!lastWylie.equals(TibetanMachineWeb.ACHEN))
|
||||
sb.append(TibetanMachineWeb.WYLIE_aVOWEL);
|
||||
If the last letter is 'sa' and the first two letters are affixes, then
|
||||
the SECOND ONE is the root letter in the following 9 WORDS ONLY:
|
||||
|
||||
sb.append(wylie1);
|
||||
sb.append(wylie2);
|
||||
}
|
||||
else {
|
||||
sb.append(wylie1);
|
||||
|
||||
if (wylie1.equals("g") && wylie2.equals("y"))
|
||||
sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY);
|
||||
gdas gnas gsas dgas dmas bdas mdas 'gas 'das
|
||||
|
||||
if (!wylie2.equals(TibetanMachineWeb.ACHEN)) {
|
||||
sb.append(wylie2);
|
||||
sb.append(TibetanMachineWeb.WYLIE_aVOWEL);
|
||||
}
|
||||
else {
|
||||
sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY);
|
||||
sb.append(wylie2);
|
||||
}
|
||||
}
|
||||
And the FIRST is the root letter in the following 14 WORDS ONLY:
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
rags lags nags bags bangs gangs rangs langs nangs sangs
|
||||
babs rabs rams nams
|
||||
|
||||
As I mentioned before, I think that the best solution for now is to
|
||||
hard-wire these cases. Even if the list is not exhaustive, at least
|
||||
we'll have most cases covered.
|
||||
|
||||
*/
|
||||
|
||||
/* FIXME: these constants are hard-wired here,
|
||||
* rather than in TibetanMachineWeb, because
|
||||
* I'm lazy. */
|
||||
if ((wylie0.equals("g") && (wylie1.equals("d") || wylie1.equals("n") || wylie1.equals("s")))
|
||||
|| (wylie0.equals("d") && (wylie1.equals("g") || wylie1.equals("m")))
|
||||
|| (wylie0.equals("b") && wylie1.equals("d"))
|
||||
|| (wylie0.equals("m") && wylie1.equals("d"))
|
||||
|| (wylie0.equals("'") && (wylie1.equals("g") || wylie1.equals("d")))) {
|
||||
sb.append(wylie1
|
||||
+ aVowelToUseAfter(wylie1)
|
||||
+ wylie2);
|
||||
} else {
|
||||
sb.append(aVowelToUseAfter(wylie0)
|
||||
+ unambiguousPostAVowelWylie(wylie1,
|
||||
wylie2));
|
||||
}
|
||||
|
||||
// DLC FIXME: what about ambiguity between
|
||||
// wa-zur and wa? dwa vs. d.wa, e.g.?
|
||||
|
||||
// DLC FIXME: disambiguators are needed for
|
||||
// this case too, as b.lag vs. blag
|
||||
// illustrates. Use something based on this,
|
||||
// from LegalTshegBar.java:
|
||||
//
|
||||
// boolean disambiguatorNeeded = false;
|
||||
// char prefix = getPrefix();
|
||||
// sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(prefix));
|
||||
// if (!hasHeadLetter()) {
|
||||
// if (EWC_ya == rootLetter) {
|
||||
// if (isConsonantThatTakesYaBtags(prefix))
|
||||
// disambiguatorNeeded = true;
|
||||
// } else if (EWC_ra == rootLetter) {
|
||||
// if (isConsonantThatTakesRaBtags(prefix))
|
||||
// disambiguatorNeeded = true;
|
||||
// } else if (EWC_la == rootLetter) {
|
||||
// if (isConsonantThatTakesLaBtags(prefix))
|
||||
// disambiguatorNeeded = true;
|
||||
// } else if (EWC_wa == rootLetter) {
|
||||
// if (isConsonantThatTakesWaZur(prefix))
|
||||
// disambiguatorNeeded = true;
|
||||
// }
|
||||
// }
|
||||
// if (disambiguatorNeeded)
|
||||
// sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY);
|
||||
|
||||
|
||||
} else {
|
||||
/* no ambiguity. the "a" vowel comes after
|
||||
* wylie1. */
|
||||
sb.append(wylie1
|
||||
+ aVowelToUseAfter(wylie1)
|
||||
+ wylie2);
|
||||
}
|
||||
} else {
|
||||
if (makeIllegalTibetanGoEndToEnd
|
||||
&& !(TibetanMachineWeb.isWylieRight(wylie1)
|
||||
&& TibetanMachineWeb.isWylieFarRight(wylie2))) {
|
||||
/* handle skaskaska, e.g. */
|
||||
sb.append(aVowelToUseAfter(wylie0)
|
||||
+ wylie1
|
||||
+ aVowelToUseAfter(wylie1)
|
||||
+ wylie2
|
||||
+ aVowelToUseAfter(wylie2));
|
||||
} else {
|
||||
/* no ambiguity. the "a" vowel comes after
|
||||
* wylie0. */
|
||||
sb.append(aVowelToUseAfter(wylie0)
|
||||
+ unambiguousPostAVowelWylie(wylie1,
|
||||
wylie2));
|
||||
}
|
||||
}
|
||||
} else {
|
||||
/* If size==4, then we assume this is legal. If
|
||||
* size==5, anything will do! So assume we have a
|
||||
* prefix, a root letter, a suffix, and a postsuffix.
|
||||
* The "a" vowel comes after the root letter. */
|
||||
sb.append(aVowelToUseAfter(lastWylie)
|
||||
+ unambiguousPostAVowelWylie(wylie1,
|
||||
wylie2));
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -891,10 +1054,10 @@ public class TibTextUtils {
|
|||
* some other vowel. If the glyph list does not already contain a vowel,
|
||||
* then this method should not be called.
|
||||
*
|
||||
* @param glyphList a list of TibetanMachine glyphs, i.e. {@link org.thdl.tib.text.DuffCode DuffCodes}
|
||||
* @param glyphList a list of TibetanMachineWeb glyphs, i.e. {@link org.thdl.tib.text.DuffCode DuffCodes}
|
||||
* @return the Wylie string corresponding to this glyph list
|
||||
*/
|
||||
public static String withoutA(java.util.List glyphList) {
|
||||
public static String withoutA(java.util.ArrayList glyphList) {
|
||||
StringBuffer sb = new StringBuffer();
|
||||
Iterator iter = glyphList.iterator();
|
||||
DuffCode dc;
|
||||
|
@ -908,9 +1071,10 @@ public class TibTextUtils {
|
|||
//note: "g" and "y" should not be hard-coded
|
||||
// instead, TibetanMachineWeb should introduce relevant sets
|
||||
|
||||
if (lastWylie.equals("g") && currWylie.equals("y")
|
||||
|| !lastWylie.equals("") && currWylie.equals(TibetanMachineWeb.ACHEN))
|
||||
sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY);
|
||||
if ((lastWylie.equals("g") && currWylie.equals("y"))
|
||||
|| (!lastWylie.equals("")
|
||||
&& currWylie.equals(TibetanMachineWeb.ACHEN)))
|
||||
sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY);
|
||||
|
||||
sb.append(currWylie);
|
||||
|
||||
|
@ -921,7 +1085,7 @@ public class TibTextUtils {
|
|||
}
|
||||
|
||||
/**
|
||||
* Gets the Extended Wylie for a set of glyphs.
|
||||
* Gets the Extended Wylie for a sequence of glyphs.
|
||||
* @param dcs an array of glyphs
|
||||
* @return the Extended Wylie corresponding to these glyphs
|
||||
*/
|
||||
|
@ -932,166 +1096,165 @@ public class TibTextUtils {
|
|||
char ch;
|
||||
String wylie;
|
||||
|
||||
List glyphList = new ArrayList();
|
||||
ArrayList glyphList = new ArrayList();
|
||||
boolean needsVowel = true;
|
||||
boolean isLastVowel = false;
|
||||
int start = 0;
|
||||
StringBuffer wylieBuffer = new StringBuffer();
|
||||
|
||||
for (int i=start; i<dcs.length; i++) {
|
||||
ch = dcs[i].character;
|
||||
int k = dcs[i].charNum;
|
||||
// int fontNum = dcs[i].fontNum;
|
||||
for (int i=start; i<dcs.length; i++) {
|
||||
ch = dcs[i].character;
|
||||
int k = dcs[i].charNum;
|
||||
// int fontNum = dcs[i].fontNum;
|
||||
|
||||
if (k < 32) {
|
||||
if (wylieBuffer.length() > 0 || !glyphList.isEmpty()) {
|
||||
if (needsVowel)
|
||||
wylieBuffer.append(withA(glyphList));
|
||||
else
|
||||
wylieBuffer.append(withoutA(glyphList));
|
||||
if (k < 32) {
|
||||
if (wylieBuffer.length() > 0 || !glyphList.isEmpty()) {
|
||||
String thisPart;
|
||||
if (needsVowel)
|
||||
thisPart = withA(glyphList);
|
||||
else
|
||||
thisPart = withoutA(glyphList);
|
||||
wylieBuffer.append(thisPart);
|
||||
|
||||
glyphList.clear();
|
||||
needsVowel = true;
|
||||
isLastVowel = false;
|
||||
}
|
||||
glyphList.clear();
|
||||
needsVowel = true;
|
||||
isLastVowel = false;
|
||||
}
|
||||
|
||||
wylieBuffer.append(ch);
|
||||
}
|
||||
else {
|
||||
wylie = TibetanMachineWeb.getWylieForGlyph(dcs[i]);
|
||||
wylieBuffer.append(ch);
|
||||
} else {
|
||||
wylie = TibetanMachineWeb.getWylieForGlyph(dcs[i]);
|
||||
|
||||
boolean containsBindu = false;
|
||||
if (wylie.length() > 1 && wylie.charAt(wylie.length()-1) == TibetanMachineWeb.BINDU) {
|
||||
char[] cArray = wylie.toCharArray();
|
||||
wylie = new String(cArray, 0, wylie.length()-1);
|
||||
containsBindu = true;
|
||||
}
|
||||
boolean containsBindu = false;
|
||||
if (wylie.length() > 1 && wylie.charAt(wylie.length()-1) == TibetanMachineWeb.BINDU) {
|
||||
char[] cArray = wylie.toCharArray();
|
||||
wylie = new String(cArray, 0, wylie.length()-1);
|
||||
containsBindu = true;
|
||||
}
|
||||
|
||||
process_block: {
|
||||
if (TibetanMachineWeb.isWyliePunc(wylie)) {
|
||||
isLastVowel = false;
|
||||
process_block: {
|
||||
if (TibetanMachineWeb.isWyliePunc(wylie)) {
|
||||
isLastVowel = false;
|
||||
|
||||
if (glyphList.isEmpty())
|
||||
wylieBuffer.append(wylie);
|
||||
if (glyphList.isEmpty()) {
|
||||
wylieBuffer.append(wylie);
|
||||
} else {
|
||||
String thisPart;
|
||||
if (needsVowel)
|
||||
thisPart = withA(glyphList);
|
||||
else
|
||||
thisPart = withoutA(glyphList);
|
||||
wylieBuffer.append(thisPart);
|
||||
|
||||
else {
|
||||
if (needsVowel)
|
||||
wylieBuffer.append(withA(glyphList));
|
||||
else
|
||||
wylieBuffer.append(withoutA(glyphList));
|
||||
|
||||
wylieBuffer.append(wylie); //append the punctuation
|
||||
|
||||
glyphList.clear();
|
||||
}
|
||||
needsVowel = true; //next consonants are syllable onset, so we are awaiting vowel
|
||||
}
|
||||
wylieBuffer.append(wylie); //append the punctuation
|
||||
|
||||
glyphList.clear();
|
||||
}
|
||||
needsVowel = true; //next consonants are syllable onset, so we are awaiting vowel
|
||||
} else if (TibetanMachineWeb.isWylieChar(wylie)) {
|
||||
//isChar must come before isVowel because ACHEN has priority over WYLIE_aVOWEL
|
||||
else if (TibetanMachineWeb.isWylieChar(wylie)) {
|
||||
isLastVowel = false;
|
||||
glyphList.add(dcs[i]);
|
||||
}
|
||||
isLastVowel = false;
|
||||
glyphList.add(dcs[i]);
|
||||
} else if (TibetanMachineWeb.isWylieVowel(wylie)) {
|
||||
if (isLastVowel) {
|
||||
int len = wylieBuffer.length();
|
||||
int A_len = TibetanMachineWeb.A_VOWEL.length();
|
||||
|
||||
else if (TibetanMachineWeb.isWylieVowel(wylie)) {
|
||||
if (isLastVowel) {
|
||||
int len = wylieBuffer.length();
|
||||
int A_len = TibetanMachineWeb.A_VOWEL.length();
|
||||
if (wylieBuffer.substring(len-A_len).equals(TibetanMachineWeb.A_VOWEL)) {
|
||||
try {
|
||||
if (wylie.equals(TibetanMachineWeb.i_VOWEL)) {
|
||||
wylieBuffer.delete(len-A_len, len);
|
||||
wylieBuffer.append(TibetanMachineWeb.I_VOWEL);
|
||||
isLastVowel = false;
|
||||
break process_block;
|
||||
} else if (wylie.equals(TibetanMachineWeb.reverse_i_VOWEL)) {
|
||||
wylieBuffer.delete(len-A_len, len);
|
||||
wylieBuffer.append(TibetanMachineWeb.reverse_I_VOWEL);
|
||||
isLastVowel = false;
|
||||
break process_block;
|
||||
}
|
||||
}
|
||||
catch (StringIndexOutOfBoundsException se) {
|
||||
ThdlDebug.noteIffyCode();
|
||||
}
|
||||
|
||||
if (wylieBuffer.substring(len-A_len).equals(TibetanMachineWeb.A_VOWEL)) {
|
||||
try {
|
||||
if (wylie.equals(TibetanMachineWeb.i_VOWEL)) {
|
||||
wylieBuffer.delete(len-A_len, len);
|
||||
wylieBuffer.append(TibetanMachineWeb.I_VOWEL);
|
||||
isLastVowel = false;
|
||||
break process_block;
|
||||
}
|
||||
else if (wylie.equals(TibetanMachineWeb.reverse_i_VOWEL)) {
|
||||
wylieBuffer.delete(len-A_len, len);
|
||||
wylieBuffer.append(TibetanMachineWeb.reverse_I_VOWEL);
|
||||
isLastVowel = false;
|
||||
break process_block;
|
||||
}
|
||||
}
|
||||
catch (StringIndexOutOfBoundsException se) {
|
||||
ThdlDebug.noteIffyCode();
|
||||
}
|
||||
wylieBuffer.append(wylie); //append current vowel
|
||||
isLastVowel = false;
|
||||
} else
|
||||
wylieBuffer.append(wylie); //append current vowel
|
||||
} else {
|
||||
int glyphCount = glyphList.size();
|
||||
boolean insertDisAmbig = false;
|
||||
|
||||
wylieBuffer.append(wylie); //append current vowel
|
||||
isLastVowel = false;
|
||||
}
|
||||
else
|
||||
wylieBuffer.append(wylie); //append current vowel
|
||||
}
|
||||
else {
|
||||
int glyphCount = glyphList.size();
|
||||
boolean insertDisAmbig = false;
|
||||
if (0 != glyphCount) {
|
||||
DuffCode top_dc = (DuffCode)glyphList.get(glyphCount-1);
|
||||
String top_wylie = TibetanMachineWeb.getWylieForGlyph(top_dc);
|
||||
|
||||
if (0 != glyphCount) {
|
||||
DuffCode top_dc = (DuffCode)glyphList.get(glyphCount-1);
|
||||
String top_wylie = TibetanMachineWeb.getWylieForGlyph(top_dc);
|
||||
|
||||
if (top_wylie.equals(TibetanMachineWeb.ACHEN)) {
|
||||
glyphList.remove(glyphCount-1);
|
||||
if (top_wylie.equals(TibetanMachineWeb.ACHEN)) {
|
||||
glyphList.remove(glyphCount-1);
|
||||
|
||||
if (glyphCount-1 == 0)
|
||||
top_dc = null;
|
||||
else {
|
||||
insertDisAmbig = true;
|
||||
top_dc = (DuffCode)glyphList.get(glyphCount-2);
|
||||
}
|
||||
}
|
||||
if (glyphCount-1 == 0) {
|
||||
top_dc = null;
|
||||
} else {
|
||||
insertDisAmbig = true;
|
||||
top_dc = (DuffCode)glyphList.get(glyphCount-2);
|
||||
}
|
||||
}
|
||||
|
||||
if (top_dc == null || !TibetanMachineWeb.getWylieForGlyph(top_dc).equals(TibetanMachineWeb.ACHUNG))
|
||||
wylieBuffer.append(withoutA(glyphList)); //append consonants in glyphList
|
||||
else {
|
||||
glyphCount = glyphList.size();
|
||||
glyphList.remove(glyphCount-1);
|
||||
if (top_dc == null || !TibetanMachineWeb.getWylieForGlyph(top_dc).equals(TibetanMachineWeb.ACHUNG)) {
|
||||
String thisPart = withoutA(glyphList);
|
||||
wylieBuffer.append(thisPart); //append consonants in glyphList
|
||||
} else {
|
||||
glyphCount = glyphList.size();
|
||||
glyphList.remove(glyphCount-1);
|
||||
|
||||
if (glyphCount-1 != 0)
|
||||
wylieBuffer.append(withA(glyphList));
|
||||
if (glyphCount-1 != 0) {
|
||||
String thisPart = withA(glyphList);
|
||||
wylieBuffer.append(thisPart);
|
||||
}
|
||||
|
||||
wylieBuffer.append(TibetanMachineWeb.ACHUNG);
|
||||
}
|
||||
}
|
||||
wylieBuffer.append(TibetanMachineWeb.ACHUNG);
|
||||
}
|
||||
}
|
||||
|
||||
if (insertDisAmbig)
|
||||
wylieBuffer.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY);
|
||||
if (insertDisAmbig)
|
||||
wylieBuffer.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY);
|
||||
|
||||
wylieBuffer.append(wylie); //append vowel
|
||||
wylieBuffer.append(wylie); //append vowel
|
||||
|
||||
glyphList.clear();
|
||||
isLastVowel = true;
|
||||
needsVowel = false;
|
||||
}
|
||||
}
|
||||
else { //must be a stack
|
||||
isLastVowel = false;
|
||||
glyphList.add(dcs[i]);
|
||||
}
|
||||
}
|
||||
glyphList.clear();
|
||||
isLastVowel = true;
|
||||
needsVowel = false;
|
||||
}
|
||||
} else { //must be a stack
|
||||
isLastVowel = false;
|
||||
glyphList.add(dcs[i]);
|
||||
}
|
||||
}
|
||||
|
||||
if (containsBindu) {
|
||||
isLastVowel = false;
|
||||
wylieBuffer.append(withoutA(glyphList));
|
||||
wylieBuffer.append(TibetanMachineWeb.BINDU); //append the bindu
|
||||
glyphList.clear();
|
||||
}
|
||||
}
|
||||
}
|
||||
if (containsBindu) {
|
||||
isLastVowel = false;
|
||||
wylieBuffer.append(withoutA(glyphList));
|
||||
wylieBuffer.append(TibetanMachineWeb.BINDU); //append the bindu
|
||||
glyphList.clear();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//replace TMW with Wylie
|
||||
//replace TMW with Wylie
|
||||
|
||||
if (!glyphList.isEmpty()) {
|
||||
if (needsVowel)
|
||||
wylieBuffer.append(withA(glyphList));
|
||||
else
|
||||
wylieBuffer.append(withoutA(glyphList));
|
||||
}
|
||||
if (!glyphList.isEmpty()) {
|
||||
String thisPart;
|
||||
if (needsVowel)
|
||||
thisPart = withA(glyphList);
|
||||
else
|
||||
thisPart = withoutA(glyphList);
|
||||
wylieBuffer.append(thisPart);
|
||||
}
|
||||
|
||||
if (wylieBuffer.length() > 0)
|
||||
return wylieBuffer.toString();
|
||||
else
|
||||
return null;
|
||||
if (wylieBuffer.length() > 0)
|
||||
return wylieBuffer.toString();
|
||||
else
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -61,6 +61,7 @@ public class TibetanMachineWeb {
|
|||
private static Set charSet = null;
|
||||
private static Set vowelSet = null;
|
||||
private static Set puncSet = null;
|
||||
private static Set topSet = null;
|
||||
private static Set leftSet = null;
|
||||
private static Set rightSet = null;
|
||||
private static Set farRightSet = null;
|
||||
|
@ -135,6 +136,10 @@ public class TibetanMachineWeb {
|
|||
*/
|
||||
public static final String ACHUNG = "'";
|
||||
/**
|
||||
* the Wylie for the 28th of the 30 consonants, sa:
|
||||
*/
|
||||
public static final String SA = "s";
|
||||
/**
|
||||
* the Wylie for achen
|
||||
*/
|
||||
public static final String ACHEN = "a";
|
||||
|
@ -238,9 +243,14 @@ public class TibetanMachineWeb {
|
|||
*/
|
||||
public static final int HALF_C = 10;
|
||||
|
||||
/** head letters, superscribed letters */
|
||||
private static final String tops = "r,s,l";
|
||||
/** prefixes */
|
||||
private static final String lefts = "g,d,b,m,'";
|
||||
/** suffixes */
|
||||
private static final String rights = "g,ng,d,n,b,m,r,l,s,',T";
|
||||
private static final String farrights = "d,s,ng";
|
||||
/** postsuffixes */
|
||||
private static final String farrights = "d,s"; // DLC FIXME: why was nga here in past revisions?
|
||||
|
||||
static {
|
||||
|
||||
|
@ -324,10 +334,15 @@ public class TibetanMachineWeb {
|
|||
}
|
||||
|
||||
StringTokenizer sTok;
|
||||
topSet = new HashSet();
|
||||
leftSet = new HashSet();
|
||||
rightSet = new HashSet();
|
||||
farRightSet = new HashSet();
|
||||
|
||||
sTok = new StringTokenizer(tops, ",");
|
||||
while (sTok.hasMoreTokens())
|
||||
topSet.add(sTok.nextToken());
|
||||
|
||||
sTok = new StringTokenizer(lefts, ",");
|
||||
while (sTok.hasMoreTokens())
|
||||
leftSet.add(sTok.nextToken());
|
||||
|
@ -634,10 +649,7 @@ public static boolean isChar(String s) {
|
|||
* Extended Wylie transliteration, false if not
|
||||
*/
|
||||
public static boolean isWylieChar(String s) {
|
||||
if (charSet.contains(s))
|
||||
return true;
|
||||
|
||||
return false;
|
||||
return charSet.contains(s);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -648,17 +660,10 @@ public static boolean isWylieChar(String s) {
|
|||
* keyboard, false if not
|
||||
*/
|
||||
public static boolean isPunc(String s) {
|
||||
if (currentKeyboardIsExtendedWylie()) {
|
||||
if (puncSet.contains(s))
|
||||
return true;
|
||||
else
|
||||
return false;
|
||||
}
|
||||
if (currentKeyboardIsExtendedWylie())
|
||||
return puncSet.contains(s);
|
||||
else
|
||||
if (keyboard.isPunc(s))
|
||||
return true;
|
||||
else
|
||||
return false;
|
||||
return keyboard.isPunc(s);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -669,10 +674,7 @@ public static boolean isPunc(String s) {
|
|||
* Extended Wylie transliteration, false if not
|
||||
*/
|
||||
public static boolean isWyliePunc(String s) {
|
||||
if (puncSet.contains(s))
|
||||
return true;
|
||||
|
||||
return false;
|
||||
return puncSet.contains(s);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -683,17 +685,10 @@ public static boolean isWyliePunc(String s) {
|
|||
* keyboard, false if not
|
||||
*/
|
||||
public static boolean isVowel(String s) {
|
||||
if (currentKeyboardIsExtendedWylie()) {
|
||||
if (vowelSet.contains(s))
|
||||
return true;
|
||||
else
|
||||
return false;
|
||||
}
|
||||
if (currentKeyboardIsExtendedWylie())
|
||||
return vowelSet.contains(s);
|
||||
else
|
||||
if (keyboard.isVowel(s))
|
||||
return true;
|
||||
else
|
||||
return false;
|
||||
return keyboard.isVowel(s);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -704,28 +699,23 @@ public static boolean isVowel(String s) {
|
|||
* Extended Wylie transliteration, false if not
|
||||
*/
|
||||
public static boolean isWylieVowel(String s) {
|
||||
if (vowelSet.contains(s))
|
||||
return true;
|
||||
|
||||
return false;
|
||||
return vowelSet.contains(s);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true iff this Wylie is valid as a leftmost character in a
|
||||
* Tibetan syllable. For example, in the syllable 'brgyad', 'b' is the
|
||||
* leftmost character. Valid leftmost characters include g, d, b, and
|
||||
* m.
|
||||
* leftmost character. Valid leftmost characters include g, d, b, ',
|
||||
* and m.
|
||||
* @param s the (Wylie) string to be checked
|
||||
* @return true if s is a possible leftmost character in a Tibetan
|
||||
* syllable, false if not. */
|
||||
public static boolean isWylieLeft(String s) {
|
||||
if (keyboard != null)
|
||||
s = keyboard.getWylieForChar(s);
|
||||
|
||||
if (leftSet.contains(s))
|
||||
return true;
|
||||
else
|
||||
return false;
|
||||
if (useReallyIffyCode) {
|
||||
if (keyboard != null)
|
||||
s = keyboard.getWylieForChar(s);
|
||||
}
|
||||
return leftSet.contains(s);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -737,29 +727,45 @@ public static boolean isWylieLeft(String s) {
|
|||
* @return true if s is a possible right character in a Tibetan
|
||||
* syllable, false if not. */
|
||||
public static boolean isWylieRight(String s) {
|
||||
if (keyboard != null)
|
||||
s = keyboard.getWylieForChar(s);
|
||||
|
||||
if (rightSet.contains(s))
|
||||
return true;
|
||||
else
|
||||
return false;
|
||||
if (useReallyIffyCode) {
|
||||
if (keyboard != null)
|
||||
s = keyboard.getWylieForChar(s);
|
||||
}
|
||||
return rightSet.contains(s);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true iff this Wylie is valid as a leftmost character in a
|
||||
* Returns true iff this Wylie is valid as a postsuffix in a
|
||||
* Tibetan syllable.
|
||||
* @param s the string to be checked
|
||||
* @return true if s is a possible leftmost character in a Tibetan
|
||||
* @return true if s is a possible postsuffix in a Tibetan
|
||||
* syllable, false if not. */
|
||||
public static boolean isWylieFarRight(String s) {
|
||||
if (keyboard != null)
|
||||
s = keyboard.getWylieForChar(s);
|
||||
if (useReallyIffyCode) {
|
||||
if (keyboard != null)
|
||||
s = keyboard.getWylieForChar(s);
|
||||
}
|
||||
return farRightSet.contains(s);
|
||||
}
|
||||
|
||||
if (farRightSet.contains(s))
|
||||
return true;
|
||||
else
|
||||
return false;
|
||||
/** DLC FIXME: what is the point of this code? TibTextUtils
|
||||
doesn't work for TCC#1 and the like, does it? I bet this
|
||||
explains why TMW=>Wylie conversion fails when the Wylie
|
||||
keyboard isn't in use. */
|
||||
private static final boolean useReallyIffyCode = false;
|
||||
|
||||
/**
|
||||
* Returns true iff this Wylie is valid as a head letter in a Tibetan
|
||||
* syllable.
|
||||
* @param s the string to be checked
|
||||
* @return true if s is a possible superscribed letter in a Tibetan
|
||||
* syllable, false if not. */
|
||||
public static boolean isWylieTop(String s) {
|
||||
if (useReallyIffyCode) {
|
||||
if (keyboard != null)
|
||||
s = keyboard.getWylieForChar(s);
|
||||
}
|
||||
return topSet.contains(s);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
Loading…
Reference in a new issue