I really hesitate to commit this because I'm not sure what it brings to the

table exactly and I fear that it makes the ACIP->Tibetan converter code
a lot uglier.  The TODO(DLC)[EWTS->Tibetan] comments littered throughout
are part of the ugliness; they point to the ugliness.  If each were addressed,
cleanliness could perhaps be achieved.

I've largely forgotten exactly what this change does, but it attempts to
improve EWTS->Tibetan conversion.  The lexer is probably really, really
primitive.  I concentrate here on converting a single tsheg bar rather than
a whole document.

Eclipse was used during part of my journey here and some imports were
reorganized merely because I could.  :)

(Eclipse was needed when the usual ant build failed to run a new test
EWTSTest.  And I wanted its debugger.)

Next steps: end-to-end EWTS tests should bring many problems to light.  Fix
those.  Triage all the TODO comments.

I don't know that I'll ever really trust the implementation.  The tests are
valuable, though.  A clean implementation of EWTS->Tibetan in Jython
might hold enough interest for me; I'd like to learn Python.
This commit is contained in:
dchandler 2005-06-20 06:18:00 +00:00
parent f64bae8ea6
commit 7198f23361
45 changed files with 1666 additions and 695 deletions

View file

@ -16,10 +16,15 @@ All Rights Reserved.
Contributor(s): ______________________________________.
*/
// TODO(DLC)[EWTS->Tibetan]: TibetanMachineWeb has duplication of much of this!
package org.thdl.tib.text.ttt;
import java.util.ArrayList;
import org.thdl.tib.text.DuffCode;
import org.thdl.tib.text.TibetanMachineWeb;
import org.thdl.util.ThdlDebug;
/** A singleton class that should contain (but due to laziness and
* ignorance probably does not contain) all the traits that make EWTS
@ -46,41 +51,68 @@ public final class EWTSTraits implements TTraits {
/** Returns '.'. */
public char disambiguatorChar() { return '.'; }
// TODO(DLC)[EWTS->Tibetan]: isClearlyIllegal and hasSimpleError are different why?
public boolean hasSimpleError(TPair p) {
return ("a".equals(p.getLeft()) && null == p.getRight()); // TODO(DLC)[EWTS->Tibetan]: (a.e) is bad, one of (.a) or (a.) is bad
if (pairHasBadWowel(p)) return true;
return (("a".equals(p.getLeft()) && null == p.getRight())
|| ("a".equals(p.getLeft())
&& null != p.getRight()
&& TibetanMachineWeb.isWylieVowel(p.getRight()))); // TODO(DLC)[EWTS->Tibetan]: or Unicode wowels? test "a\u0f74" and "a\u0f7e"
// TODO(DLC)[EWTS->Tibetan]: (a.e) is bad, one of (.a) or (a.) is bad
}
/** {tsh}, the longest consonant, has 3 characters, so this is
* three. */
public int maxConsonantLength() { return 3; }
/** {-i~M`}, in a tie for the longest wowel, has 6 characters, so
* this is six. (No, 'l-i' and 'r-i' are not wowels (but '-i'
* is). */
public int maxWowelLength() { return 5; }
/** {-i~M`}, in a tie for the longest wowel, has 5 characters, so
* this is five. (No, 'l-i' and 'r-i' are not wowels (but '-i'
* is). (TODO(DLC)[EWTS->Tibetan]: this is crap! you can put arbitrary wowels
* together using plus signs or Unicode escapes) */
public int maxWowelLength() { return 3; /* a~M` (TODO(DLC)[EWTS->Tibetan]:! why the 'a'?) */}
public boolean isUnicodeConsonant(char ch) {
return ((ch != '\u0f48' && ch >= '\u0f40' && ch <= '\u0f6a')
|| (ch != '\u0f98' && ch >= '\u0f90' && ch <= '\u0fbc'));
}
public boolean isUnicodeWowel(char ch) {
// TODO(DLC)[EWTS->Tibetan]: what about combiners that combine only with digits? TEST
return ((ch >= '\u0f71' && ch <= '\u0f84')
|| isUnicodeWowelThatRequiresAChen(ch));
}
// TODO(DLC)[EWTS->Tibetan]: u,e,i,o? If not, document the special treatment in this function's comment
public boolean isConsonant(String s) {
if (s.length() == 1 && isUnicodeConsonant(s.charAt(0))) return true;
if (aVowel().equals(s)) return false; // In EWTS, "a" is both a consonant and a vowel, but we treat it as just a vowel and insert the implied a-chen if you have a TPair ( . a) (TODO(DLC)[EWTS->Tibetan]: right?)
// TODO(DLC)[EWTS->Tibetan]: numbers are consonants?
// TODO(DLC)[EWTS->Tibetan]: just g for now
return "g".equals(s);
return TibetanMachineWeb.isWylieChar(s);
}
public boolean isWowel(String s) {
return (getUnicodeForWowel(s) != null);
/* TODO(DLC)[EWTS->Tibetan]: test ko+m+e etc.
// TODO(DLC)[EWTS->Tibetan]: all non-consonant combiners? 0f71 0f87 etc.?
if (s.length() == 1 && isUnicodeWowel(s.charAt(0))) return true;
return ("a".equals(s)
|| "e".equals(s)
|| "i".equals(s)
|| "o".equals(s)
|| "u".equals(s)
|| "?".equals(s) // TODO(DLC)[EWTS->Tibetan]: 0f84 virama???
// TODO(DLC)[EWTS->Tibetan]: & ~M` ~M ???
|| "U".equals(s)
|| "I".equals(s)
|| "A".equals(s)
|| "-i".equals(s)
|| "-I".equals(s)
|| "H".equals(s)
|| "M".equals(s)); // TODO(DLC)[EWTS->Tibetan]:???
|| "au".equals(s)
|| "ai".equals(s)
|| isWowelThatRequiresAChen(s));
// TODO(DLC)[EWTS->Tibetan]:???
*/
}
public String aVowel() { return "a"; }
@ -125,5 +157,222 @@ public final class EWTSTraits implements TTraits {
throw new Error("TODO(DLC)[EWTS->Tibetan]");
}
public String getUnicodeFor(String l, boolean subscribed) { throw new Error("TODO(DLC)[EWTS->Tibetan]"); }
public String getUnicodeForWowel(String wowel) {
if ("a".equals(wowel))
return "";
return helpGetUnicodeForWowel(wowel);
}
private String helpGetUnicodeForWowel(String wowel) {
if ("a".equals(wowel))
return null; // ko+a+e is invalid, e.g.
if (wowel.length() == 1 && isUnicodeWowel(wowel.charAt(0)))
return wowel;
// handle o+u, etc.
int i;
if ((i = wowel.indexOf("+")) >= 0) {
// recurse.
// Chris Fynn says \u0f7c\u0f7c is different from \u0f7d.
// So o+o is not the same as au. e+e is not the same as
// ai.
String left = helpGetUnicodeForWowel(wowel.substring(0, i));
String right = helpGetUnicodeForWowel(wowel.substring(i + 1));
if (null != left && null != right)
return left + right;
else
return null;
} else {
// Handle vowels. (TODO(dchandler): tibwn.ini has this
// info, use that instead of duplicating it in this code.)
if ("i".equals(wowel)) return "\u0f72";
if ("u".equals(wowel)) return "\u0f74";
if ("A".equals(wowel)) return "\u0f71";
if ("U".equals(wowel)) return "\u0f71\u0f74"; // \u0f75 is discouraged
if ("e".equals(wowel)) return "\u0f7a";
if ("o".equals(wowel)) return "\u0f7c";
if ("-i".equals(wowel)) return "\u0f80";
if ("ai".equals(wowel)) return "\u0f7b";
if ("au".equals(wowel)) return "\u0f7d";
if ("-I".equals(wowel)) return "\u0f81";
if ("I".equals(wowel)) return "\u0f71\u0f72"; // \u0f73 is discouraged
// TODO(DLC)[EWTS->Tibetan]: fix me!
// DLC say ah if ("aM".equals(wowel)) return "\u0f7e";
if ("M".equals(wowel)) return "\u0f7e";
// DLC say ah if ("aH".equals(wowel)) return "\u0f7f";
if ("H".equals(wowel)) return "\u0f7f";
// DLC say ah if ("a?".equals(wowel)) return "\u0f84";
if ("?".equals(wowel)) return "\u0f84";
// DLC say ah if ("a~M".equals(wowel)) return "\u0f83";
if ("~M".equals(wowel)) return "\u0f83";
// DLC say ah if ("a~M`".equals(wowel)) return "\u0f82";
if ("~M`".equals(wowel)) return "\u0f82";
// DLC say ah if ("aX".equals(wowel)) return "\u0f37";
if ("X".equals(wowel)) return "\u0f37";
// DLC say ah if ("a~X".equals(wowel)) return "\u0f35";
if ("~X".equals(wowel)) return "\u0f35";
return null;
}
}
public String getUnicodeFor(String l, boolean subscribed) {
// First, handle "\u0f71\u0f84\u0f86", "", "\u0f74", etc.
{
boolean already_done = true;
for (int i = 0; i < l.length(); i++) {
if (!(l.charAt(0) >= '\u0f00' && l.charAt(0) <= '\u0fff')) {
already_done = false;
break;
}
}
if (already_done)
return l; // TODO(dchandler): \u0fff etc. are not valid code points, though. Do we handle that well?
}
// TODO(DLC)[EWTS->Tibetan]:: vowels !subscribed could mean (a . i)???? I doubt it but test "i"->"\u0f68\u0f72" etc.
if (subscribed) {
if ("R".equals(l)) return "\u0fbc";
if ("Y".equals(l)) return "\u0fbb";
if ("W".equals(l)) return "\u0fba";
// g+h etc. should not be inputs to this function, but for
// completeness they're here.
if ("k".equals(l)) return "\u0F90";
if ("kh".equals(l)) return "\u0F91";
if ("g".equals(l)) return "\u0F92";
if ("g+h".equals(l)) return "\u0F93";
if ("ng".equals(l)) return "\u0F94";
if ("c".equals(l)) return "\u0F95";
if ("ch".equals(l)) return "\u0F96";
if ("j".equals(l)) return "\u0F97";
if ("ny".equals(l)) return "\u0F99";
if ("T".equals(l)) return "\u0F9A";
if ("Th".equals(l)) return "\u0F9B";
if ("D".equals(l)) return "\u0F9C";
if ("D+h".equals(l)) return "\u0F9D";
if ("N".equals(l)) return "\u0F9E";
if ("t".equals(l)) return "\u0F9F";
if ("th".equals(l)) return "\u0FA0";
if ("d".equals(l)) return "\u0FA1";
if ("d+h".equals(l)) return "\u0FA2";
if ("n".equals(l)) return "\u0FA3";
if ("p".equals(l)) return "\u0FA4";
if ("ph".equals(l)) return "\u0FA5";
if ("b".equals(l)) return "\u0FA6";
if ("b+h".equals(l)) return "\u0FA7";
if ("m".equals(l)) return "\u0FA8";
if ("ts".equals(l)) return "\u0FA9";
if ("tsh".equals(l)) return "\u0FAA";
if ("dz".equals(l)) return "\u0FAB";
if ("dz+h".equals(l)) return "\u0FAC";
if ("w".equals(l)) return "\u0FAD"; // TODO(DLC)[EWTS->Tibetan]:: ???
if ("zh".equals(l)) return "\u0FAE";
if ("z".equals(l)) return "\u0FAF";
if ("'".equals(l)) return "\u0FB0";
if ("y".equals(l)) return "\u0FB1";
if ("r".equals(l)) return "\u0FB2";
if ("l".equals(l)) return "\u0FB3";
if ("sh".equals(l)) return "\u0FB4";
if ("Sh".equals(l)) return "\u0FB5";
if ("s".equals(l)) return "\u0FB6";
if ("h".equals(l)) return "\u0FB7";
if ("a".equals(l)) return "\u0FB8";
if ("k+Sh".equals(l)) return "\u0FB9";
if (false) throw new Error("TODO(DLC)[EWTS->Tibetan]:: subscribed for " + l);
return null;
} else {
if ("R".equals(l)) return "\u0f6a";
if ("Y".equals(l)) return "\u0f61";
if ("W".equals(l)) return "\u0f5d";
if (!TibetanMachineWeb.isKnownHashKey(l)) {
ThdlDebug.noteIffyCode();
return null;
}
String s = TibetanMachineWeb.getUnicodeForWylieForGlyph(l);
if (null == s)
ThdlDebug.noteIffyCode();
return s;
}
}
public String shortTranslitName() { return "EWTS"; }
private boolean pairHasBadWowel(TPair p) {
return (null != p.getRight()
&& !disambiguator().equals(p.getRight())
&& !"+".equals(p.getRight())
&& null == getUnicodeForWowel(p.getRight()));
}
public boolean isClearlyIllegal(TPair p) {
if (pairHasBadWowel(p)) return true;
if (p.getLeft() == null
&& (p.getRight() == null ||
(!disambiguator().equals(p.getRight())
&& !isWowel(p.getRight()))))
return true;
if ("+".equals(p.getLeft()))
return true;
if (p.getLeft() != null && isWowel(p.getLeft())
&& !aVowel().equals(p.getLeft())) // achen
return true;
return false;
}
public TPairList[] breakTshegBarIntoChunks(String tt, boolean sh) {
if (sh) throw new IllegalArgumentException("Don't do that, silly!");
try {
return TPairListFactory.breakEWTSIntoChunks(tt);
} catch (StackOverflowError e) {
throw new IllegalArgumentException("Input too large[1]: " + tt);
} catch (OutOfMemoryError e) {
throw new IllegalArgumentException("Input too large[2]: " + tt);
}
}
public boolean isACIP() { return false; }
public boolean vowelAloneImpliesAChen() { return true; }
public boolean vowelsMayStack() { return true; }
public boolean isWowelThatRequiresAChen(String s) {
// TODO(DLC)[EWTS->Tibetan]: fix me!
return ((s.length() == 1 && (isUnicodeWowelThatRequiresAChen(s.charAt(0))
|| "?MHX".indexOf(s.charAt(0)) >= 0))
// DLC say ah || "aM".equals(s) // DLC funny... (DLC NOW too funny! affects longest wowel length!)
// DLC say ah || "a?".equals(s) // DLC funny...
// DLC say ah || "aH".equals(s) // DLC funny...
// DLC say ah || "aX".equals(s) // DLC funny...
|| "~X".equals(s)
// DLC say ah || "a~X".equals(s) // DLC funny...
|| "~M".equals(s)
// DLC say ah || "a~M".equals(s) // DLC funny...
|| "~M`".equals(s)
// DLC say ah || "a~M`".equals(s) // DLC funny...
);
}
public boolean isUnicodeWowelThatRequiresAChen(char ch) {
// TODO(DLC)[EWTS->Tibetan]: ask if 18 19 3e 3f combine only with digits
return "\u0f35\u0f37\u0f18\u0f19\u0f3e\u0f3f\u0f86\u0f87\u0fc6".indexOf(ch) >= 0;
}
public boolean couldBeValidStack(TPairList pl) {
StringBuffer hashKey = new StringBuffer();
boolean allHavePlus = true;
for (int i = 0; i < pl.size(); i++) {
if (i + 1 < pl.size() && !"+".equals(pl.get(i).getRight()))
allHavePlus = false;
if (0 != hashKey.length())
hashKey.append('-');
hashKey.append(pl.get(i).getLeft());
}
return (allHavePlus
|| TibetanMachineWeb.hasGlyph(hashKey.toString())); // TODO(DLC)[EWTS->Tibetan]: test with smra and tsma and bdgya
}
}