From 6d419fe64186814bfc21fdbf7f43c6939e29ccad Mon Sep 17 00:00:00 2001 From: dchandler Date: Mon, 11 Jul 2005 02:51:06 +0000 Subject: [PATCH] Numerous EWTS->Unicode and especially EWTS->TMW improvements. Fixed ordering of Unicode wowels. [ku+A] gives the correct Unicode now, e.g. EWTS->TMW looks better for some wacky wowels like, I'm guessing here, [ku+A]. EWTS->TMW should now give errors any time the full input isn't used. Previously, wacky wowels like [kai+-i] would lead to some droppage. EWTS->TMW->Unicode testing is now in effect. This found a ton of EWTS->TMW bugs, most or all of which are fixed now. TMW->Unicode is improved/fixed for { \u5350,\u534D,\u0F88+k,\u0F88+kh,U }. (Why U? "\u0f75" is discouraged in favor of "\u0f71\u0f74".) NOTE: TMW_RTF_TO_THDL_WYLIETest is still disabled for the nightly builds' sake, but I ran it in my sandbox and it passed. --- source/org/thdl/tib/input/DuffPane.java | 2 +- ..._RTF_TO_THDL_WYLIETest2ResultACIP.expected | 8 +- ...O_THDL_WYLIETest2ResultConversion.expected | 8 +- .../org/thdl/tib/text/THDLWylieConstants.java | 107 ++- source/org/thdl/tib/text/TibTextUtils.java | 6 +- .../org/thdl/tib/text/TibetanMachineWeb.java | 20 +- source/org/thdl/tib/text/tibwn.ini | 75 +-- .../thdl/tib/text/tshegbar/UnicodeUtils.java | 15 +- source/org/thdl/tib/text/ttt/ACIPTraits.java | 8 +- .../tib/text/ttt/ConversionException.java | 30 + source/org/thdl/tib/text/ttt/EWTSTest.java | 633 +++++++++++------- source/org/thdl/tib/text/ttt/EWTSTraits.java | 410 +++++++----- .../tib/text/ttt/EWTSTshegBarScanner.java | 43 +- source/org/thdl/tib/text/ttt/TConverter.java | 13 +- source/org/thdl/tib/text/ttt/TPairList.java | 83 +-- .../thdl/tib/text/ttt/TPairListFactory.java | 90 ++- source/org/thdl/tib/text/ttt/TString.java | 5 +- source/org/thdl/tib/text/ttt/TTraits.java | 3 +- source/org/thdl/util/VerboseUnicodeDump.java | 2 +- 19 files changed, 1014 insertions(+), 547 deletions(-) create mode 100644 source/org/thdl/tib/text/ttt/ConversionException.java diff --git a/source/org/thdl/tib/input/DuffPane.java b/source/org/thdl/tib/input/DuffPane.java index ce0e9c3..2553cdf 100644 --- a/source/org/thdl/tib/input/DuffPane.java +++ b/source/org/thdl/tib/input/DuffPane.java @@ -1377,7 +1377,7 @@ public void paste(int offset) if (TibetanMachineWeb.isPunc(val)) { //punctuation val = TibetanMachineWeb.getWylieForPunc(val); - if (val.charAt(0) == TibetanMachineWeb.BINDU) + if (val.startsWith(THDLWylieConstants.BINDU)) putBindu(); else { diff --git a/source/org/thdl/tib/input/TMW_RTF_TO_THDL_WYLIETest2ResultACIP.expected b/source/org/thdl/tib/input/TMW_RTF_TO_THDL_WYLIETest2ResultACIP.expected index 960ad06..db9c669 100644 --- a/source/org/thdl/tib/input/TMW_RTF_TO_THDL_WYLIETest2ResultACIP.expected +++ b/source/org/thdl/tib/input/TMW_RTF_TO_THDL_WYLIETest2ResultACIP.expected @@ -242,8 +242,8 @@ \f1\fs144 >\f3 6\f1 >\f2\i0\b0\ul0 K+S+MA\fs28\i0\b0\ul0\cf0 font 2; ord 54\par \f1\fs144 >\f3 7\f1 >\f2\i0\b0\ul0 K+S+YA\fs28\i0\b0\ul0\cf0 font 2; ord 55\par \f1\fs144 >\f3 8\f1 >\f2\i0\b0\ul0 K+S+VA\fs28\i0\b0\ul0\cf0 font 2; ord 56\par -\f1\fs144 >\f3 9\f1 >\f2\i0\b0\ul0 [# JSKAD_TMW_TO_ACIP_ERROR_NO_SUCH_ACIP: Cannot convert glyph with THDL Extended Wylie <<[[JSKAD_TMW_TO_WYLIE_ERROR_NO_SUCH_WYLIE: Cannot convert to THDL Extended Wylie. Please see the documentation for the TM or TMW font and transcribe this yourself.]]>> to ACIP. Please transcribe this yourself.]\fs28\i0\b0\ul0\cf0 font 2; ord 57\par -\f1\fs144 >\f3 :\f1 >\f2\i0\b0\ul0 [# JSKAD_TMW_TO_ACIP_ERROR_NO_SUCH_ACIP: Cannot convert glyph with THDL Extended Wylie <<[[JSKAD_TMW_TO_WYLIE_ERROR_NO_SUCH_WYLIE: Cannot convert to THDL Extended Wylie. Please see the documentation for the TM or TMW font and transcribe this yourself.]]>> to ACIP. Please transcribe this yourself.]\fs28\i0\b0\ul0\cf0 font 2; ord 58\par +\f1\fs144 >\f3 9\f1 >\f2\i0\b0\ul0 [# JSKAD_TMW_TO_ACIP_ERROR_NO_SUCH_ACIP: Cannot convert glyph with THDL Extended Wylie \\u0F88+k to ACIP. Please transcribe this yourself.]\fs28\i0\b0\ul0\cf0 font 2; ord 57\par +\f1\fs144 >\f3 :\f1 >\f2\i0\b0\ul0 [# JSKAD_TMW_TO_ACIP_ERROR_NO_SUCH_ACIP: Cannot convert glyph with THDL Extended Wylie \\u0F88+kh to ACIP. Please transcribe this yourself.]\fs28\i0\b0\ul0\cf0 font 2; ord 58\par \f1\fs144 >\f3 ;\f1 >\f2\i0\b0\ul0 KH+KHA\fs28\i0\b0\ul0\cf0 font 2; ord 59\par \f1\fs144 >\f3 <\f1 >\f2\i0\b0\ul0 KH+NA\fs28\i0\b0\ul0\cf0 font 2; ord 60\par \f1\fs144 >\f3 =\f1 >\f2\i0\b0\ul0 KH+LA\fs28\i0\b0\ul0\cf0 font 2; ord 61\par @@ -812,8 +812,8 @@ \f1\fs144 >\f6 ^\f1 >\f2\i0\b0\ul0 [# JSKAD_TMW_TO_ACIP_ERROR_NO_SUCH_ACIP: Cannot convert glyph with THDL Extended Wylie \\u0F13 to ACIP. Please transcribe this yourself.]\fs28\i0\b0\ul0\cf0 font 5; ord 94\par \f1\fs144 >\f6 _\f1 >\f2\i0\b0\ul0 [# JSKAD_TMW_TO_ACIP_ERROR_NO_SUCH_ACIP: Cannot convert glyph with THDL Extended Wylie < to ACIP. Please transcribe this yourself.]\fs28\i0\b0\ul0\cf0 font 5; ord 95\par \f1\fs144 >\f6 `\f1 >\f2\i0\b0\ul0 [# JSKAD_TMW_TO_ACIP_ERROR_NO_SUCH_ACIP: Cannot convert glyph with THDL Extended Wylie > to ACIP. Please transcribe this yourself.]\fs28\i0\b0\ul0\cf0 font 5; ord 96\par -\f1\fs144 >\f6 a\f1 >\f2\i0\b0\ul0 [# JSKAD_TMW_TO_ACIP_ERROR_NO_SUCH_ACIP: Cannot convert glyph with THDL Extended Wylie <<[[JSKAD_TMW_TO_WYLIE_ERROR_NO_SUCH_WYLIE: Cannot convert to THDL Extended Wylie. Please see the documentation for the TM or TMW font and transcribe this yourself.]]>> to ACIP. Please transcribe this yourself.]\fs28\i0\b0\ul0\cf0 font 5; ord 97\par -\f1\fs144 >\f6 b\f1 >\f2\i0\b0\ul0 [# JSKAD_TMW_TO_ACIP_ERROR_NO_SUCH_ACIP: Cannot convert glyph with THDL Extended Wylie <<[[JSKAD_TMW_TO_WYLIE_ERROR_NO_SUCH_WYLIE: Cannot convert to THDL Extended Wylie. Please see the documentation for the TM or TMW font and transcribe this yourself.]]>> to ACIP. Please transcribe this yourself.]\fs28\i0\b0\ul0\cf0 font 5; ord 98\par +\f1\fs144 >\f6 a\f1 >\f2\i0\b0\ul0 [# JSKAD_TMW_TO_ACIP_ERROR_NO_SUCH_ACIP: Cannot convert glyph with THDL Extended Wylie \\u5350 to ACIP. Please transcribe this yourself.]\fs28\i0\b0\ul0\cf0 font 5; ord 97\par +\f1\fs144 >\f6 b\f1 >\f2\i0\b0\ul0 [# JSKAD_TMW_TO_ACIP_ERROR_NO_SUCH_ACIP: Cannot convert glyph with THDL Extended Wylie \\u534D to ACIP. Please transcribe this yourself.]\fs28\i0\b0\ul0\cf0 font 5; ord 98\par \f1\fs144 >\f6 c\f1 >\f2\i0\b0\ul0 [# JSKAD_TMW_TO_ACIP_ERROR_NO_SUCH_ACIP: Cannot convert glyph with THDL Extended Wylie \\uF038 to ACIP. Please transcribe this yourself.]\fs28\i0\b0\ul0\cf0 font 5; ord 99\par \f1\fs144 >\f6 d\f1 >\f2\i0\b0\ul0 [# JSKAD_TMW_TO_ACIP_ERROR_NO_SUCH_ACIP: Cannot convert glyph with THDL Extended Wylie \\uF037 to ACIP. Please transcribe this yourself.]\fs28\i0\b0\ul0\cf0 font 5; ord 100\par \f1\fs144 >\f6 e\f1 >\f2\i0\b0\ul0 o\fs28\i0\b0\ul0\cf0 font 5; ord 101\par diff --git a/source/org/thdl/tib/input/TMW_RTF_TO_THDL_WYLIETest2ResultConversion.expected b/source/org/thdl/tib/input/TMW_RTF_TO_THDL_WYLIETest2ResultConversion.expected index 583cdde..e076001 100644 --- a/source/org/thdl/tib/input/TMW_RTF_TO_THDL_WYLIETest2ResultConversion.expected +++ b/source/org/thdl/tib/input/TMW_RTF_TO_THDL_WYLIETest2ResultConversion.expected @@ -242,8 +242,8 @@ \f1\fs144 >\f3 6\f1 >\f2\i0\b0\ul0 k+s+ma\fs28\i0\b0\ul0\cf0 font 2; ord 54\par \f1\fs144 >\f3 7\f1 >\f2\i0\b0\ul0 k+s+ya\fs28\i0\b0\ul0\cf0 font 2; ord 55\par \f1\fs144 >\f3 8\f1 >\f2\i0\b0\ul0 k+s+wa\fs28\i0\b0\ul0\cf0 font 2; ord 56\par -\f1\fs144 >\f3 9\f1 >\f2\i0\b0\ul0 <<[[JSKAD_TMW_TO_WYLIE_ERROR_NO_SUCH_WYLIE: Cannot convert to THDL Extended Wylie. Please see the documentation for the TM or TMW font and transcribe this yourself.]]>>\fs28\i0\b0\ul0\cf0 font 2; ord 57\par -\f1\fs144 >\f3 :\f1 >\f2\i0\b0\ul0 <<[[JSKAD_TMW_TO_WYLIE_ERROR_NO_SUCH_WYLIE: Cannot convert to THDL Extended Wylie. Please see the documentation for the TM or TMW font and transcribe this yourself.]]>>\fs28\i0\b0\ul0\cf0 font 2; ord 58\par +\f1\fs144 >\f3 9\f1 >\f2\i0\b0\ul0\\u0F88+k\fs28\i0\b0\ul0\cf0 font 2; ord 57\par +\f1\fs144 >\f3 :\f1 >\f2\i0\b0\ul0\\u0F88+kh\fs28\i0\b0\ul0\cf0 font 2; ord 58\par \f1\fs144 >\f3 ;\f1 >\f2\i0\b0\ul0 kh+kha\fs28\i0\b0\ul0\cf0 font 2; ord 59\par \f1\fs144 >\f3 <\f1 >\f2\i0\b0\ul0 kh+na\fs28\i0\b0\ul0\cf0 font 2; ord 60\par \f1\fs144 >\f3 =\f1 >\f2\i0\b0\ul0 kh+la\fs28\i0\b0\ul0\cf0 font 2; ord 61\par @@ -812,8 +812,8 @@ \f1\fs144 >\f6 ^\f1 >\f2\i0\b0\ul0\\u0F13\fs28\i0\b0\ul0\cf0 font 5; ord 94\par \f1\fs144 >\f6 _\f1 >\f2\i0\b0\ul0 <\fs28\i0\b0\ul0\cf0 font 5; ord 95\par \f1\fs144 >\f6 `\f1 >\f2\i0\b0\ul0 >\fs28\i0\b0\ul0\cf0 font 5; ord 96\par -\f1\fs144 >\f6 a\f1 >\f2\i0\b0\ul0 <<[[JSKAD_TMW_TO_WYLIE_ERROR_NO_SUCH_WYLIE: Cannot convert to THDL Extended Wylie. Please see the documentation for the TM or TMW font and transcribe this yourself.]]>>\fs28\i0\b0\ul0\cf0 font 5; ord 97\par -\f1\fs144 >\f6 b\f1 >\f2\i0\b0\ul0 <<[[JSKAD_TMW_TO_WYLIE_ERROR_NO_SUCH_WYLIE: Cannot convert to THDL Extended Wylie. Please see the documentation for the TM or TMW font and transcribe this yourself.]]>>\fs28\i0\b0\ul0\cf0 font 5; ord 98\par +\f1\fs144 >\f6 a\f1 >\f2\i0\b0\ul0\\u5350\fs28\i0\b0\ul0\cf0 font 5; ord 97\par +\f1\fs144 >\f6 b\f1 >\f2\i0\b0\ul0\\u534D\fs28\i0\b0\ul0\cf0 font 5; ord 98\par \f1\fs144 >\f6 c\f1 >\f2\i0\b0\ul0\\uF038\fs28\i0\b0\ul0\cf0 font 5; ord 99\par \f1\fs144 >\f6 d\f1 >\f2\i0\b0\ul0\\uF037\fs28\i0\b0\ul0\cf0 font 5; ord 100\par \f1\fs144 >\f6 e\f1 >\f2\i0\b0\ul0 X\fs28\i0\b0\ul0\cf0 font 5; ord 101\par diff --git a/source/org/thdl/tib/text/THDLWylieConstants.java b/source/org/thdl/tib/text/THDLWylieConstants.java index 9c355a0..a0dbfaa 100644 --- a/source/org/thdl/tib/text/THDLWylieConstants.java +++ b/source/org/thdl/tib/text/THDLWylieConstants.java @@ -18,9 +18,80 @@ Contributor(s): ______________________________________. package org.thdl.tib.text; -/** This is where basic, static knowledge of THDL's Extended Wylie is housed. +/** This is where basic, static knowledge of THDL's Extended Wylie is + * housed.

TODO(dchandler): tibwn.ini has all this, yes? So + * extend TibetanMachineWeb if necessary and use a bunch of HashMaps + * there! This is needless duplication. * @see TibetanMachineWeb */ public interface THDLWylieConstants { +// TODO(DLC)[EWTS->Tibetan]: what about U+2638, mentioned in Section +// 9.11 "Tibetan" of the Unicode 4.0.1 standard? Why doesn't EWTS +// mention it? (Because TMW has no glyph for it, I bet.) Do we +// handle it well? +/** The EWTS standard mentions this character specifically. See +* http://www.symbols.com/encyclopedia/15/155.html to learn about +* its meaning as relates to Buddhism. +*/ + public static final char SAUVASTIKA = '\u534d'; +/** The EWTS standard mentions this character specifically. See +* http://www.symbols.com/encyclopedia/15/151.html to learn about +* its meaning as relates to Buddhism. +*/ + public static final char SWASTIKA = '\u5350'; +/** EWTS has some glyphs not specified by Unicode in the +* private-use area (PUA). EWTS puts them in the range [PUA_MIN, +* PUA_MAX]. (Note that \uf042 is the highest in use as of July +* 2, 2005.) */ + public static final char PUA_MIN = '\uf021'; +/** EWTS has some glyphs not specified by Unicode in the +* private-use area (PUA). EWTS puts them in the range [PUA_MIN, +* PUA_MAX]. (Note that \uf042 is the highest in use as of July +* 2, 2005.) */ + public static final char PUA_MAX = '\uf0ff'; +/** +* the Wylie for U+0F3E +*/ + public static final String U0F3E = "}"; +/** +* the Wylie for U+0F3F +*/ + public static final String U0F3F = "{"; +/** +* the Wylie for U+0F86 +*/ + public static final String U0F86 = "\\u0F86"; +/** +* the Wylie for U+0F87 +*/ + public static final String U0F87 = "\\u0F87"; +/** +* the Wylie for U+0FC6 +*/ + public static final String U0FC6 = "\\u0FC6"; +/** +* the Wylie for U+0F18 +*/ + public static final String U0F18 = "\\u0F18"; +/** +* the Wylie for U+0F19 +*/ + public static final String U0F19 = "\\u0F19"; +/** +* the Wylie for U+0F84 +*/ + public static final String U0F84 = "?"; +/** +* the Wylie for U+0F7F +*/ + public static final String U0F7F = "H"; +/** +* the Wylie for U+0F35 +*/ + public static final String U0F35 = "~X"; +/** +* the Wylie for U+0F37 +*/ + public static final String U0F37 = "X"; /** * the Wylie for U+0F82 */ @@ -32,7 +103,7 @@ public interface THDLWylieConstants { /** * the Wylie for bindu/anusvara (U+0F7E) */ - public static final char BINDU = 'M'; + public static final String BINDU = "M"; /** * the Wylie for tsheg */ @@ -64,31 +135,51 @@ public interface THDLWylieConstants { */ public static final String WYLIE_TSA_PHRU = "^"; /** -* the Wylie for achung +* the Wylie for achung, \u0f60 */ public static final char ACHUNG_character = '\''; /** -* the Wylie for achung +* the Wylie for achung, \u0f60 */ public static final String ACHUNG = new String(new char[] { ACHUNG_character }); /** -* the Wylie for the 28th of the 30 consonants, sa: +* the Wylie for the 28th of the 30 consonants, sa, \u0f66: */ public static final String SA = "s"; /** -* the Wylie for the consonant ra: +* the Wylie for the consonant ra, \u0f62: */ public static final String RA = "r"; /** -* the Wylie for the 16th of the 30 consonants, ma: +* the Wylie for the 16th of the 30 consonants, ma, \u0f58: */ public static final String MA = "m"; /** -* the Wylie for the 4th of the 30 consonants, nga: +* the Wylie for \u0f56: +*/ + public static final String BA = "b"; +/** +* the Wylie for \u0f51: +*/ + public static final String DA = "d"; +/** +* the Wylie for \u0f42: +*/ + public static final String GA = "g"; +/** +* the Wylie for \u0f63: +*/ + public static final String LA = "l"; +/** +* the Wylie for the 4th of the 30 consonants, nga, \u0f44: */ public static final String NGA = "ng"; /** +* the Wylie for \u0f53: +*/ + public static final String NA = "n"; +/** * the Wylie for achen */ public static final String ACHEN = "a"; diff --git a/source/org/thdl/tib/text/TibTextUtils.java b/source/org/thdl/tib/text/TibTextUtils.java index fdf42fc..94181c4 100644 --- a/source/org/thdl/tib/text/TibTextUtils.java +++ b/source/org/thdl/tib/text/TibTextUtils.java @@ -418,7 +418,7 @@ public class TibTextUtils implements THDLWylieConstants { chars.clear(); - if (next.equals(String.valueOf(BINDU))) { + if (next.equals(BINDU)) { if (glyphs.isEmpty()) dc = null; else @@ -560,11 +560,11 @@ public class TibTextUtils implements THDLWylieConstants { * or null */ public static void getBindu(List list, DuffCode dc) { if (null == dc) { - list.add(TibetanMachineWeb.getGlyph(String.valueOf(BINDU))); + list.add(TibetanMachineWeb.getGlyph(BINDU)); } else { if (!TibetanMachineWeb.getBinduMap().containsKey(dc)) { list.add(dc); - list.add(TibetanMachineWeb.getGlyph(String.valueOf(BINDU))); + list.add(TibetanMachineWeb.getGlyph(BINDU)); } else { list.add((DuffCode)TibetanMachineWeb.getBinduMap().get(dc)); } diff --git a/source/org/thdl/tib/text/TibetanMachineWeb.java b/source/org/thdl/tib/text/TibetanMachineWeb.java index c55a852..fcc2d1b 100644 --- a/source/org/thdl/tib/text/TibetanMachineWeb.java +++ b/source/org/thdl/tib/text/TibetanMachineWeb.java @@ -1347,12 +1347,26 @@ public static boolean isKnownHashKey(String hashKey) { * @see DuffCode */ public static DuffCode getGlyph(String hashKey) { - DuffCode[] dc = (DuffCode[])tibHash.get(hashKey); + DuffCode dc = maybeGetGlyph(hashKey); if (null == dc) throw new Error("Hash key " + hashKey + " not found; it is likely that you misconfigured tibwn.ini such that, say, M is expected (i.e., it is listed as, e.g. punctuation), but no 'M~...' line appears."); + return dc; +} + +/** +* Gets a glyph for this hash key if possible; returns null +* otherwise. +* @see #getGlyph(String) +*/ +public static DuffCode maybeGetGlyph(String hashKey) { + DuffCode[] dc = (DuffCode[])tibHash.get(hashKey); + if (null == dc) + return null; return dc[TMW]; } + + /** * Gets the half height character for this hash key. * @param hashKey the key you want a half height glyph for; see {@link @@ -1783,6 +1797,8 @@ private static final String Unicode_tab = "\t"; = new DuffCode[] { new DuffCode(1, (char)58) }; private static final DuffCode[] tmwFor0F73 = new DuffCode[] { new DuffCode(4, (char)106), new DuffCode(1, (char)109) }; + private static final DuffCode[] tmwFor0F75 + = new DuffCode[] { new DuffCode(10, (char)126) }; private static final DuffCode[] tmwFor0F76 = new DuffCode[] { new DuffCode(8, (char)71), new DuffCode(8, (char)87) }; private static final DuffCode[] tmwFor0F77 @@ -1840,6 +1856,8 @@ private static final String Unicode_tab = "\t"; return tmwFor0F6A; } else if ('\u0F73' == ch) { return tmwFor0F73; + } else if ('\u0F75' == ch) { + return tmwFor0F75; } else if ('\u0F76' == ch) { return tmwFor0F76; } else if ('\u0F77' == ch) { diff --git a/source/org/thdl/tib/text/tibwn.ini b/source/org/thdl/tib/text/tibwn.ini index 2c06cab..9f1f036 100644 --- a/source/org/thdl/tib/text/tibwn.ini +++ b/source/org/thdl/tib/text/tibwn.ini @@ -927,6 +927,15 @@ a+r+y~145,4~~8,65~1,109~8,121~1,123~1,125~8,107~8,114~f68,fb2,fb1 // nyi.zla editor's mark. This is NOT \u0F82, although it looks very similar. \uF03A~91,5~~9,89~~~~~~~none +// yungs.drung (reversed): +\u5350~97,5~~9,97~~~~~~~5350 +// yungs.drung (standard): +\u534D~98,5~~9,98~~~~~~~534D + +// utsama ka: +\u0F88+k~57,2~~3,59~1,109~4,120~1,125~1,123~4,106~4,113~f88,f90 +// utsama kha: +\u0F88+kh~58,2~~3,60~1,109~4,120~1,125~1,123~4,106~4,113~f88,f91 M~238,1~~8,90~~~~~~~0F7E @@ -1069,13 +1078,13 @@ A~204,2~~4,109~~~~~~~0F71 A~205,2~~4,110~~~~~~~0F71 A~206,2~~4,111~~~~~~~0F71 A~207,2~~4,112~~~~~~~0F71 -U~211,2~~4,113~~~~~~~0F75 -U~212,2~~4,114~~~~~~~0F75 -U~213,2~~4,115~~~~~~~0F75 -U~214,2~~4,116~~~~~~~0F75 -U~215,2~~4,117~~~~~~~0F75 -U~216,2~~4,118~~~~~~~0F75 -U~217,2~~4,119~~~~~~~0F75 +U~211,2~~4,113~~~~~~~0F71,0F74 +U~212,2~~4,114~~~~~~~0F71,0F74 +U~213,2~~4,115~~~~~~~0F71,0F74 +U~214,2~~4,116~~~~~~~0F71,0F74 +U~215,2~~4,117~~~~~~~0F71,0F74 +U~216,2~~4,118~~~~~~~0F71,0F74 +U~217,2~~4,119~~~~~~~0F71,0F74 u~224,2~~4,120~~~~~~~0F74 u~225,2~~4,121~~~~~~~0F74 u~226,2~~4,122~~~~~~~0F74 @@ -1090,13 +1099,13 @@ A~204,3~~6,109~~~~~~~0F71 A~205,3~~6,110~~~~~~~0F71 A~206,3~~6,111~~~~~~~0F71 A~207,3~~6,112~~~~~~~0F71 -U~211,3~~6,113~~~~~~~0F75 -U~212,3~~6,114~~~~~~~0F75 -U~213,3~~6,115~~~~~~~0F75 -U~214,3~~6,116~~~~~~~0F75 -U~215,3~~6,117~~~~~~~0F75 -U~216,3~~6,118~~~~~~~0F75 -U~217,3~~6,119~~~~~~~0F75 +U~211,3~~6,113~~~~~~~0F71,0F74 +U~212,3~~6,114~~~~~~~0F71,0F74 +U~213,3~~6,115~~~~~~~0F71,0F74 +U~214,3~~6,116~~~~~~~0F71,0F74 +U~215,3~~6,117~~~~~~~0F71,0F74 +U~216,3~~6,118~~~~~~~0F71,0F74 +U~217,3~~6,119~~~~~~~0F71,0F74 u~224,3~~6,120~~~~~~~0F74 u~225,3~~6,121~~~~~~~0F74 u~226,3~~6,122~~~~~~~0F74 @@ -1111,13 +1120,13 @@ A~204,4~~8,109~~~~~~~0F71 A~205,4~~8,110~~~~~~~0F71 A~206,4~~8,111~~~~~~~0F71 A~207,4~~8,112~~~~~~~0F71 -U~211,4~~8,113~~~~~~~0F75 -U~212,4~~8,114~~~~~~~0F75 -U~213,4~~8,115~~~~~~~0F75 -U~214,4~~8,116~~~~~~~0F75 -U~215,4~~8,117~~~~~~~0F75 -U~216,4~~8,118~~~~~~~0F75 -U~217,4~~8,119~~~~~~~0F75 +U~211,4~~8,113~~~~~~~0F71,0F74 +U~212,4~~8,114~~~~~~~0F71,0F74 +U~213,4~~8,115~~~~~~~0F71,0F74 +U~214,4~~8,116~~~~~~~0F71,0F74 +U~215,4~~8,117~~~~~~~0F71,0F74 +U~216,4~~8,118~~~~~~~0F71,0F74 +U~217,4~~8,119~~~~~~~0F71,0F74 u~224,4~~8,120~~~~~~~0F74 u~225,4~~8,121~~~~~~~0F74 u~226,4~~8,122~~~~~~~0F74 @@ -1131,13 +1140,13 @@ A~163,1~~10,116~~~~~~~0F71 A~164,1~~10,117~~~~~~~0F71 A~211,1~~10,118~~~~~~~0F71 A~212,1~~10,119~~~~~~~0F71 -U~213,1~~10,120~~~~~~~0F75 -U~214,1~~10,121~~~~~~~0F75 -U~215,1~~10,122~~~~~~~0F75 -U~216,1~~10,123~~~~~~~0F75 -U~217,1~~10,124~~~~~~~0F75 -U~218,1~~10,125~~~~~~~0F75 -U~219,1~~10,126~~~~~~~0F75 +U~213,1~~10,120~~~~~~~0F71,0F74 +U~214,1~~10,121~~~~~~~0F71,0F74 +U~215,1~~10,122~~~~~~~0F71,0F74 +U~216,1~~10,123~~~~~~~0F71,0F74 +U~217,1~~10,124~~~~~~~0F71,0F74 +U~218,1~~10,125~~~~~~~0F71,0F74 +U~219,1~~10,126~~~~~~~0F71,0F74 // ra.mgo: r~173,4~~8,66~~~~~~~0F62 @@ -1191,13 +1200,3 @@ r~176,4~~8,71~~~~~~~0FB2 \tmw8070~67,5~~9,70~~~~~~~none \tmw8071~68,5~~9,71~~~~~~~none \tmw8072~69,5~~9,72~~~~~~~none - -// yungs.drung (reversed): -\tmw8097~97,5~~9,97~~~~~~~5350 -// yungs.drung (standard): -\tmw8098~98,5~~9,98~~~~~~~534D - -// utsama ka: -\tmw2059~57,2~~3,59~1,109~4,120~1,125~1,123~4,106~4,113~f88,f90 -// utsama kha: -\tmw2060~58,2~~3,60~1,109~4,120~1,125~1,123~4,106~4,113~f88,f91 diff --git a/source/org/thdl/tib/text/tshegbar/UnicodeUtils.java b/source/org/thdl/tib/text/tshegbar/UnicodeUtils.java index 098132c..cbf8c27 100644 --- a/source/org/thdl/tib/text/tshegbar/UnicodeUtils.java +++ b/source/org/thdl/tib/text/tshegbar/UnicodeUtils.java @@ -298,7 +298,7 @@ public class UnicodeUtils implements UnicodeConstants { characters will appear as themselves. */ public static String unicodeCodepointToString(char cp, boolean shortenIfPossible) { - return unicodeCodepointToString(cp, shortenIfPossible, "\\u"); + return unicodeCodepointToString(cp, shortenIfPossible, "\\u", false); } /** Like {@link #unicodeCodepointToString(char, boolean)} if you @@ -307,7 +307,8 @@ public class UnicodeUtils implements UnicodeConstants { 0F55. */ public static String unicodeCodepointToString(char cp, boolean shortenIfPossible, - String prefix) { + String prefix, + boolean upperCase) { if (shortenIfPossible) { if ((cp >= 'a' && cp <= 'z') || (cp >= 'A' && cp <= 'Z') @@ -348,14 +349,16 @@ public class UnicodeUtils implements UnicodeConstants { return "\\r"; } + String suffix; if (cp < '\u0010') - return prefix + "000" + Integer.toHexString((int)cp); + suffix = "000" + Integer.toHexString((int)cp); else if (cp < '\u0100') - return prefix + "00" + Integer.toHexString((int)cp); + suffix = "00" + Integer.toHexString((int)cp); else if (cp < '\u1000') - return prefix + "0" + Integer.toHexString((int)cp); + suffix = "0" + Integer.toHexString((int)cp); else - return prefix + Integer.toHexString((int)cp); + suffix = Integer.toHexString((int)cp); + return prefix + (upperCase ? suffix.toUpperCase() : suffix); } /** diff --git a/source/org/thdl/tib/text/ttt/ACIPTraits.java b/source/org/thdl/tib/text/ttt/ACIPTraits.java index e3dc988..e71cafa 100644 --- a/source/org/thdl/tib/text/ttt/ACIPTraits.java +++ b/source/org/thdl/tib/text/ttt/ACIPTraits.java @@ -546,10 +546,12 @@ public final class ACIPTraits implements TTraits { /** Gets the duffcodes for wowel, such that they look good with * the preceding glyph, and appends them to duff. */ - public void getDuffForWowel(ArrayList duff, DuffCode preceding, String wowel) { + public void getDuffForWowel(ArrayList duff, DuffCode preceding, String wowel) + throws ConversionException + { if (null == wowel) return; if (null == getEwtsForWowel(wowel)) // FIXME: expensive assertion! Use assert. - throw new IllegalArgumentException("Wowel " + wowel + " isn't in the small set of wowels we handle correctly."); + throw new ConversionException("Wowel " + wowel + " isn't in the small set of wowels we handle correctly."); // Order matters here. boolean context_added[] = new boolean[] { false }; @@ -619,8 +621,10 @@ public final class ACIPTraits implements TTraits { try { return TPairListFactory.breakACIPIntoChunks(tt, sh); } catch (StackOverflowError e) { + // TODO(dchandler): use ConversionException? Stop catching these? throw new IllegalArgumentException("Input too large[1]: " + tt); } catch (OutOfMemoryError e) { + // TODO(dchandler): use ConversionException? Stop catching these? throw new IllegalArgumentException("Input too large[2]: " + tt); } } diff --git a/source/org/thdl/tib/text/ttt/ConversionException.java b/source/org/thdl/tib/text/ttt/ConversionException.java new file mode 100644 index 0000000..01d2623 --- /dev/null +++ b/source/org/thdl/tib/text/ttt/ConversionException.java @@ -0,0 +1,30 @@ +/* +The contents of this file are subject to the THDL Open Community License +Version 1.0 (the "License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License on the THDL web site +(http://www.thdl.org/). + +Software distributed under the License is distributed on an "AS IS" basis, +WITHOUT WARRANTY OF ANY KIND, either express or implied. See the +License for the specific terms governing rights and limitations under the +License. + +The Initial Developer of this software is the Tibetan and Himalayan Digital +Library (THDL). Portions created by the THDL are Copyright 2005 THDL. +All Rights Reserved. + +Contributor(s): ______________________________________. +*/ + +package org.thdl.tib.text.ttt; + +/** + * @author David Chandler + * + *

A ConversionException is a general-purpose checked exception + * used to indicate a problem during conversion. + */ +public final class ConversionException extends Exception { + /** @see Exception.Exception(String) */ + ConversionException(String x) { super(x); } +} diff --git a/source/org/thdl/tib/text/ttt/EWTSTest.java b/source/org/thdl/tib/text/ttt/EWTSTest.java index ed55d1a..02a9d08 100644 --- a/source/org/thdl/tib/text/ttt/EWTSTest.java +++ b/source/org/thdl/tib/text/ttt/EWTSTest.java @@ -19,10 +19,12 @@ Contributor(s): ______________________________________. package org.thdl.tib.text.ttt; import java.io.PrintStream; +import java.util.ArrayList; import junit.framework.TestCase; import org.thdl.util.ThdlOptions; +import org.thdl.tib.text.TibetanDocument; import org.thdl.tib.text.tshegbar.UnicodeUtils; /** Tests this package's ability to understand EWTS and turn it into @@ -76,42 +78,145 @@ public class EWTSTest extends TestCase { } } - /** Causes a JUnit test case failure unless the EWTS document ewts - * converts to the unicode expectedUnicode. */ - static void ewts2uni_test(String ewts, String expectedUnicode) { - // TODO(DLC)[EWTS->Tibetan]: In addition to what this - // currently does, have this function convert to TMW and - // convert that TMW to Unicode and verify that the result is - // the same. Almost every call should allow for that. - + /** Returns the Unicode corresponding to the TMW to which ewts + * corresponds, or null if we couldn't push through, even with + * errors, from EWTS->TMW->Unicode. */ + private static String ewts2tmw2uni(String ewts) { + TTraits traits = EWTSTraits.instance(); StringBuffer errors = new StringBuffer(); - String unicode = TConverter.convertToUnicodeText(EWTSTraits.instance(), - ewts, errors, - null, true, - "None", // TODO(DLC)[EWTS->Tibetan]: ??? - false /* short warnings */); - if (null == unicode) { + boolean shortMessages = true; + String warningLevel = "All"; // slow but exercises more code paths + ArrayList scan + = traits.scanner().scan(ewts, errors, -1, + shortMessages, + warningLevel); + if (null == scan) + return null; + if (errors.length() > 0) + return null; + errors = new StringBuffer(); + TibetanDocument tdoc = new TibetanDocument(); + boolean rv; + try { + rv = TConverter.convertToTMW(traits, + scan, tdoc, errors, null, null, + false, warningLevel, + shortMessages, true, + new int[] { tdoc.getLength() }); + } catch (java.io.IOException e) { + // I doubt this can happen. + throw new Error(e.toString()); + } + if (!rv) + return null; + if (tdoc.getLength() < 1 && ewts.length() > 0) + return null; + errors = new StringBuffer(); + long numAttemptedReplacements[] = new long[] { 0 }; + tdoc.convertToUnicode(0, tdoc.getLength(), errors, null, + numAttemptedReplacements); + if (errors.length() > 0) + return null; + if (numAttemptedReplacements[0] < 1) + return null; + + try { + return tdoc.getText(0, tdoc.getLength()); + } catch (javax.swing.text.BadLocationException e) { + throw new Error("I know this won't happen: " + e); + } + } + + static void ewts2uni_test(String ewts, String expectedUnicode) { + ewts2uni_test(ewts, expectedUnicode, true); + } + + /** Tests EWTS->Unicode but not EWTS->TMW[->Unicode]. */ + static void just_ewts2uni_test(String ewts, String expectedUnicode) { + ewts2uni_test(ewts, expectedUnicode, false); + } + + /** Causes a JUnit test case failure unless the EWTS document ewts + * converts to the unicode expectedUnicode. If doEwts2tmw2uni is + * true, then this causes a test case failure if an + * EWTS->TMW->Unicode trip doesn't give the same + * expectedUnicode. */ + static void ewts2uni_test(String ewts, String expectedUnicode, + boolean doEwts2tmw2uni) { + StringBuffer errors = new StringBuffer(); + String unicode + = TConverter.convertToUnicodeText(EWTSTraits.instance(), + ewts, errors, + null, true, + "None", // TODO(DLC)[EWTS->Tibetan]: ??? + false /* short warnings */); + help_ewts2uni_test("EWTS->Unicode: ", + ewts, expectedUnicode, unicode, errors); + if (doEwts2tmw2uni) { + help_ewts2uni_test("EWTS->TMW->Unicode: ", + ewts, expectedUnicode, ewts2tmw2uni(ewts), + new StringBuffer()); + } + } + + /** Doing EWTS->Unicode conversions yields one answer out of many + * for some inputs, such as "b+ha". This function checks for + * equality between two pieces of Unicode modulo such acceptable + * changes. It's only complete enough to handle the test cases + * we have. Why do we make two choices? TMW->Unicode is + * different source code from EWTS->Unicode; that's why. */ + private static boolean ewts2uni_unicode_equality(String expectedUnicode, + String actualUnicode) { + // TODO(dchandler): replaceAll is a 1.4-ism. Will users balk? + if (actualUnicode + .replaceAll("\u0f0d\u0f0d", "\u0f0e") // TMW has no \u0f0e glyph + .replaceAll("\u0f69", "\u0f40\u0fb5") // equivalent and neither are discouraged + .replaceAll("\u0f43", "\u0f42\u0fb7") // ditto... + .replaceAll("\u0f4d", "\u0f4c\u0fb7") + .replaceAll("\u0f52", "\u0f51\u0fb7") + .replaceAll("\u0f57", "\u0f56\u0fb7") + .replaceAll("\u0f5c", "\u0f5b\u0fb7") + .replaceAll("\u0fb9", "\u0f90\u0fb5") + .replaceAll("\u0f93", "\u0f92\u0fb7") + .replaceAll("\u0f9d", "\u0f9c\u0fb7") + .replaceAll("\u0fa2", "\u0fa1\u0fb7") + .replaceAll("\u0fa7", "\u0fa6\u0fb7") // ... + .replaceAll("\u0fac", "\u0fab\u0fb7") // equivalent and neither are discouraged + + .equals(expectedUnicode)) { + return true; + } + return expectedUnicode.equals(actualUnicode); + } + + private static void help_ewts2uni_test(String prefix, + String ewts, + String expectedUnicode, + String actualUnicode, + StringBuffer errors) { + if (null == actualUnicode) { if (null != expectedUnicode && "none" != expectedUnicode) { - System.out.println("No unicode exists for " + ewts + System.out.println(prefix + "No unicode exists for " + ewts + " but you expected " + UnicodeUtils.unicodeStringToPrettyString(expectedUnicode)); assertTrue(false); } - System.out.println("Unicode for " + ewts + " can't be had; errors are " + errors); + System.out.println(prefix + "Unicode for " + ewts + " can't be had; errors are " + errors); } else { - if (null != expectedUnicode && !expectedUnicode.equals(unicode)) { - explainInequality(unicode, expectedUnicode, System.out); - if (UnicodeUtils.unicodeStringToPrettyString(unicode).equals(UnicodeUtils.unicodeStringToPrettyString(expectedUnicode))) { - System.out.println("UGLY strings: The unicode for\n \"" + ewts + if (null != expectedUnicode + && !ewts2uni_unicode_equality(expectedUnicode, actualUnicode)) { + explainInequality(actualUnicode, expectedUnicode, System.out); + if (UnicodeUtils.unicodeStringToPrettyString(actualUnicode).equals(UnicodeUtils.unicodeStringToPrettyString(expectedUnicode))) { + System.out.println(prefix + "UGLY strings: The unicode for\n \"" + ewts + "\"\nis\n \"" - + unicode + + actualUnicode + "\",\nbut you expected\n \"" + expectedUnicode + "\""); } else { - System.out.println("The unicode for\n \"" + ewts + System.out.println(prefix + "The unicode for\n \"" + ewts + "\"\nis\n \"" - + UnicodeUtils.unicodeStringToPrettyString(unicode) + + UnicodeUtils.unicodeStringToPrettyString(actualUnicode) + "\",\nbut you expected\n \"" + UnicodeUtils.unicodeStringToPrettyString(expectedUnicode) + "\""); @@ -122,7 +227,7 @@ public class EWTSTest extends TestCase { TPairList[] la = EWTSTraits.instance().breakTshegBarIntoChunks(sb.toString(), false); assertTrue(la[1] == null); - System.out.println("EWTS=" + ewts + " and l'=" + la[0].toString2()); + System.out.println(prefix + "EWTS=" + ewts + " and l'=" + la[0].toString2()); } assertTrue(false); } @@ -156,24 +261,25 @@ public class EWTSTest extends TestCase { public void test0F39() { ewts2uni_test("v", "\u0F56\u0F39"); ewts2uni_test("f", "\u0F55\u0F39"); - ewts2uni_test("f+beM", "\u0f55\u0f39\u0fa6\u0f7a\u0f7e"); + just_ewts2uni_test("f+beM", "\u0f55\u0f39\u0fa6\u0f7a\u0f7e"); ewts2uni_test("faM", "\u0f55\u0f39\u0f7e"); ewts2uni_test("vaM", "\u0f56\u0f39\u0f7e"); - ewts2uni_test("k+fa", "\u0f40\u0fa5\u0f39"); - ewts2uni_test("f+va", "\u0f55\u0f39\u0fa6\u0f39"); - ewts2uni_test("ph+veM", "\u0f55\u0fa6\u0f39\u0f7a\u0f7e"); + just_ewts2uni_test("k+fa", "\u0f40\u0fa5\u0f39"); + just_ewts2uni_test("f+va", "\u0f55\u0f39\u0fa6\u0f39"); + just_ewts2uni_test("ph+veM", "\u0f55\u0fa6\u0f39\u0f7a\u0f7e"); ewts2uni_test("a^", "\u0f68\u0f39"); - ewts2uni_test("hUM^", "\u0f67\u0f71\u0f74\u0f7e\u0f39"); + ewts2uni_test("hUM^", "\u0f67\u0f39\u0f71\u0f74\u0f7e"); ewts2uni_test("ph^", "\u0f55\u0f39"); - ewts2uni_test("phe^", "\u0f55\u0f7a\u0f39"); // TODO(DLC)[EWTS->Tibetan]: does order of U+0F39 matter? - ewts2uni_test("ph^e", "\u0f55\u0f39\u0f68\u0f7a"); // TODO(DLC)[EWTS->Tibetan]: This is no good! We don't even warn, do we!? + ewts2uni_test("phe^", "\u0f55\u0f39\u0f7a"); + ewts2uni_test("ph^e", "\u0f55\u0f39\u0f68\u0f7a"); // TODO(DLC)[EWTS->Tibetan]: This is no good! We don't even warn, do we!? EWTSTraits.isWowelThatRequiresAChen(..) might be to blame + ewts2uni_test("a\u0f39", "\u0f68\u0f39"); - ewts2uni_test("hUM\u0f39", "\u0f67\u0f71\u0f74\u0f7e\u0f39"); + ewts2uni_test("hUM\u0f39", "\u0f67\u0f39\u0f71\u0f74\u0f7e"); ewts2uni_test("ph\u0f39", "\u0f55\u0f39"); - ewts2uni_test("phe\u0f39", "\u0f55\u0f7a\u0f39"); // TODO(DLC)[EWTS->Tibetan]: does order of U+0F39 matter? - ewts2uni_test("ph\u0f39e", "\u0f55\u0f39\u0f68\u0f7a"); // TODO(DLC)[EWTS->Tibetan]: This is no good! We don't even warn, do we!? + ewts2uni_test("phe\u0f39", "\u0f55\u0f39\u0f7a"); + ewts2uni_test("ph\u0f39e", "\u0f55\u0f39\u0f68\u0f7a"); // TODO(DLC)[EWTS->Tibetan]: This is no good! We don't even warn, do we!? EWTSTraits.isWowelThatRequiresAChen(..) might be to blame if (RUN_FAILING_TESTS) ewts2uni_test("ph^+beM", "\u0f55\u0f39\u0fa6\u0f7a\u0f7e"); } @@ -181,6 +287,13 @@ public class EWTSTest extends TestCase { /** Tests that the EWTS->unicode converter isn't completely braindead. */ public void testEwtsBasics() { + just_ewts2uni_test("r+sa", "\u0f62\u0fb6"); + ewts2uni_test("R+s", "\u0f6a\u0fb6"); + + ewts2uni_test("k?e", "\u0f40\u0f84\u0f68\u0f7a"); + ewts2uni_test("ko+o", "\u0f40\u0f7c\u0f7c"); + ewts2uni_test("kau+u", "\u0f40\u0f74\u0f7d"); + ewts2uni_test("g.yogs", "\u0f42\u0f61\u0f7c\u0f42\u0f66"); ewts2uni_test("brgyad", "\u0f56\u0f62\u0f92\u0fb1\u0f51"); ewts2uni_test("brjod", "\u0f56\u0f62\u0f97\u0f7c\u0f51"); @@ -220,39 +333,46 @@ public class EWTSTest extends TestCase { ewts2uni_test("b.ra ", "\u0f56\u0f62\u0f0b"); ewts2uni_test("bara ", "\u0f56\u0f62\u0f0b"); - ewts2uni_test("b+Ra ", "\u0f56\u0fbc\u0f0b"); + just_ewts2uni_test("b+Ra ", "\u0f56\u0fbc\u0f0b"); } /** Miscellaneous tests of EWTS->Unicode conversion. */ public void test__EWTS__miscellany() { + just_ewts2uni_test("ga\\u0f02ha", "\u0f42\u0f02\u0f67"); // TODO(DLC)[EWTS->Tibetan]: ewts->tmw is broken + just_ewts2uni_test("g.\\u0f03\u0f0b", "\u0f42\u0f03\u0f0b"); // TODO(DLC)[EWTS->Tibetan]: ewts->tmw is broken - ewts2uni_test("", ""); + just_ewts2uni_test("k+\u0fb2e", "\u0f40\u0fb2\u0f7a"); + assert_EWTS_error("\u0f42ya"); + just_ewts2uni_test("\u0f42+ya", "\u0f42\u0fb1"); + just_ewts2uni_test("\u0f42.ya", "\u0f42\u0f61"); + + just_ewts2uni_test("", ""); ewts2uni_test("0\\u0f19", "\u0f20\u0f19"); ewts2uni_test("0\\u0f18", "\u0f20\u0f18"); ewts2uni_test("0\\u0f3e", "\u0f20\u0f3e"); // TODO(DLC)[EWTS->Tibetan]: test ewts->tmw ewts2uni_test("0\\u0f3f", "\u0f20\u0f3f"); // TODO(DLC)[EWTS->Tibetan]: test ewts->tmw - ewts2uni_test("R", "\u0f6A"); - ewts2uni_test("Ra", "\u0f6A"); + just_ewts2uni_test("R", "\u0f6A"); + just_ewts2uni_test("Ra", "\u0f6A"); - ewts2uni_test("R+ka", "\u0F6A\u0f90"); - ewts2uni_test("k+Wa", "\u0f40\u0FBA"); - ewts2uni_test("k+Ya", "\u0f40\u0FBB"); - ewts2uni_test("k+Ra", "\u0f40\u0FBC"); + just_ewts2uni_test("R+ka", "\u0F6A\u0f90"); + just_ewts2uni_test("k+Wa", "\u0f40\u0FBA"); + just_ewts2uni_test("k+Ya", "\u0f40\u0FBB"); + just_ewts2uni_test("k+Ra", "\u0f40\u0FBC"); ewts2uni_test("k+wa", "\u0f40\u0Fad"); ewts2uni_test("k+la", "\u0f40\u0Fb3"); ewts2uni_test("k+ya", "\u0f40\u0Fb1"); ewts2uni_test("k+ra", "\u0f40\u0Fb2"); - ewts2uni_test("r-I", "\u0f62\u0f81"); - ewts2uni_test("l-I", "\u0f63\u0f81"); + ewts2uni_test("r-I", "\u0f62\u0f71\u0f80"); + ewts2uni_test("l-I", "\u0f63\u0f71\u0f80"); ewts2uni_test("r-i", "\u0f62\u0f80"); ewts2uni_test("l-i", "\u0f63\u0f80"); ewts2uni_test("gr-i", "\u0f42\u0fb2\u0f80"); - ewts2uni_test("gr-I", "\u0f42\u0fb2\u0f81"); + ewts2uni_test("gr-I", "\u0f42\u0fb2\u0f71\u0f80"); ewts2uni_test("gl-i", "\u0f42\u0fb3\u0f80"); - ewts2uni_test("gl-I", "\u0f42\u0fb3\u0f81"); + ewts2uni_test("gl-I", "\u0f42\u0fb3\u0f71\u0f80"); } @@ -277,9 +397,9 @@ public class EWTSTest extends TestCase { ewts2uni_test("u", "\u0f68\u0f74"); ewts2uni_test("U", "\u0f68\u0f71\u0f74"); ewts2uni_test("a+r-i", "\u0f68\u0fb2\u0f80"); - ewts2uni_test("a+r-I", "\u0f68\u0fb2\u0f81"); - ewts2uni_test("a+l-i", "\u0f68\u0fb3\u0f80"); - ewts2uni_test("a+l-I", "\u0f68\u0fb3\u0f81"); + ewts2uni_test("a+r-I", "\u0f68\u0fb2\u0f71\u0f80"); + just_ewts2uni_test("a+l-i", "\u0f68\u0fb3\u0f80"); + just_ewts2uni_test("a+l-I", "\u0f68\u0fb3\u0f71\u0f80"); ewts2uni_test("e", "\u0f68\u0f7a"); ewts2uni_test("ai", "\u0f68\u0f7b"); // ewts2uni_test("ao", "\u0f68\u0f68\u0f7c"); // TODO(DLC)[EWTS->Tibetan]: @@ -289,11 +409,12 @@ public class EWTSTest extends TestCase { // ewts2uni_test("aM", "\u0f68\u0f7e"); // TODO(DLC)[EWTS->Tibetan]: than needs to say // ewts2uni_test("aH", "\u0f68\u0f7f"); // TODO(DLC)[EWTS->Tibetan]: than needs to say ewts2uni_test("-i", "\u0f68\u0f80"); - ewts2uni_test("-I", "\u0f68\u0f81"); + ewts2uni_test("-I", "\u0f68\u0f71\u0f80"); // ewts2uni_test("a~M`", "\u0f68\u0f82"); // TODO(DLC)[EWTS->Tibetan]: than needs to say // ewts2uni_test("a~M", "\u0f68\u0f83"); // TODO(DLC)[EWTS->Tibetan]: than needs to say // ewts2uni_test("a?", "\u0f68\u0f84"); // TODO(DLC)[EWTS->Tibetan]: than needs to say - ewts2uni_test("\\u0f68", "\u0f68"); + just_ewts2uni_test("\\u0f68", "\u0f68"); + ewts2uni_test("\\u0f86", "\u0f68\u0f86"); // TODO(DLC)[EWTS->Tibetan]: than needs to say ewts2uni_test("a\\u0f86", "\u0f68\u0f86"); ewts2uni_test("a\\U0f86", "\u0f68\u0f86"); ewts2uni_test("a\\U0F86", "\u0f68\u0f86"); @@ -305,7 +426,7 @@ public class EWTSTest extends TestCase { ewts2uni_test("a\\u0f87", "\u0f68\u0f87"); // ewts2uni_test("aMH", "\u0f68\u0f7e\u0f7f"); // TODO(DLC)[EWTS->Tibetan]: than needs to say -// ewts2uni_test("aHM", "\u0f68\u0f7f\u0f7e"); // TODO(DLC)[EWTS->Tibetan]: than needs to say +// ewts2uni_test("aHM", "\u0f68\u0f7e\u0f7f"); // TODO(DLC)[EWTS->Tibetan]: than needs to say ewts2uni_test("a", "\u0f68"); } @@ -325,7 +446,7 @@ public class EWTSTest extends TestCase { ewts2uni_test("e+e+e+e+e", "\u0f68\u0f7a\u0f7a\u0f7a\u0f7a\u0f7a"); // TODO(DLC)[EWTS->Tibetan]:? ewts2uni_test("o+e", "\u0f68\u0f7c\u0f7a"); ewts2uni_test("u+A+i+o+e", "\u0f68\u0f74\u0f71\u0f72\u0f7c\u0f7a"); - ewts2uni_test("u+A+i+o+eHM", "\u0f68\u0f74\u0f71\u0f72\u0f7c\u0f7a\u0f7f\u0f7e"); + ewts2uni_test("u+A+i+o+eHM", "\u0f68\u0f74\u0f71\u0f72\u0f7c\u0f7a\u0f7e\u0f7f"); ewts2uni_test("u+A", "\u0f68\u0f74\u0f71"); ewts2uni_test("o+-I", "DLC"); @@ -342,9 +463,9 @@ public class EWTSTest extends TestCase { ewts2uni_test("ku", "\u0f40\u0f74"); ewts2uni_test("kU", "\u0f40\u0f71\u0f74"); ewts2uni_test("k+r-i", "\u0f40\u0fb2\u0f80"); - ewts2uni_test("k+r-I", "\u0f40\u0fb2\u0f81"); + ewts2uni_test("k+r-I", "\u0f40\u0fb2\u0f71\u0f80"); ewts2uni_test("k+l-i", "\u0f40\u0fb3\u0f80"); - ewts2uni_test("k+l-I", "\u0f40\u0fb3\u0f81"); + ewts2uni_test("k+l-I", "\u0f40\u0fb3\u0f71\u0f80"); ewts2uni_test("ke", "\u0f40\u0f7a"); ewts2uni_test("e", "\u0f68\u0f7a"); ewts2uni_test("a", "\u0f68"); @@ -354,7 +475,7 @@ public class EWTSTest extends TestCase { ewts2uni_test("kaM", "\u0f40\u0f7e"); // TODO(DLC)[EWTS->Tibetan]: than needs to say ewts2uni_test("kaH", "\u0f40\u0f7f"); // TODO(DLC)[EWTS->Tibetan]: than needs to say ewts2uni_test("k-i", "\u0f40\u0f80"); - ewts2uni_test("k-I", "\u0f40\u0f81"); + ewts2uni_test("k-I", "\u0f40\u0f71\u0f80"); ewts2uni_test("ka~M`", "\u0f40\u0f82"); // TODO(DLC)[EWTS->Tibetan]: than needs to say ewts2uni_test("ka~M", "\u0f40\u0f83"); // TODO(DLC)[EWTS->Tibetan]: than needs to say ewts2uni_test("ka?", "\u0f40\u0f84"); // TODO(DLC)[EWTS->Tibetan]: than needs to say @@ -369,7 +490,7 @@ public class EWTSTest extends TestCase { ewts2uni_test("ka\\u0f87", "\u0f40\u0f87"); ewts2uni_test("kaMH", "\u0f40\u0f7e\u0f7f"); // TODO(DLC)[EWTS->Tibetan]: than needs to say - ewts2uni_test("kaHM", "\u0f40\u0f7f\u0f7e"); // TODO(DLC)[EWTS->Tibetan]: than needs to say + ewts2uni_test("kaHM", "\u0f40\u0f7e\u0f7f"); // TODO(DLC)[EWTS->Tibetan]: than needs to say // Than's e-mails of Aug 10 and Aug 11, 2004 say that A+i is @@ -380,10 +501,10 @@ public class EWTSTest extends TestCase { ewts2uni_test("ke+e+e", "\u0f40\u0f7a\u0f7a\u0f7a"); // TODO(DLC)[EWTS->Tibetan]:? ewts2uni_test("ke+e+e+e", "\u0f40\u0f7a\u0f7a\u0f7a\u0f7a"); // TODO(DLC)[EWTS->Tibetan]:? ewts2uni_test("ke+e+e+e+e", "\u0f40\u0f7a\u0f7a\u0f7a\u0f7a\u0f7a"); // TODO(DLC)[EWTS->Tibetan]:? - ewts2uni_test("ko+e", "\u0f40\u0f7c\u0f7a"); - ewts2uni_test("ku+A+i+o+e", "\u0f40\u0f74\u0f71\u0f72\u0f7c\u0f7a"); - ewts2uni_test("ku+A+i+o+eHM", "\u0f40\u0f74\u0f71\u0f72\u0f7c\u0f7a\u0f7f\u0f7e"); - ewts2uni_test("ku+A", "\u0f40\u0f74\u0f71"); + ewts2uni_test("ko+e", "\u0f40\u0f7a\u0f7c"); + ewts2uni_test("ku+A+i+o+e", "\u0f40\u0f71\u0f74\u0f72\u0f7a\u0f7c"); + ewts2uni_test("ku+A+i+o+eHM", "\u0f40\u0f71\u0f74\u0f72\u0f7a\u0f7c\u0f7e\u0f7f"); + ewts2uni_test("ku+A", "\u0f40\u0f71\u0f74"); ewts2uni_test("k", "\u0f40"); ewts2uni_test("ka", "\u0f40"); @@ -414,7 +535,7 @@ public class EWTSTest extends TestCase { ewts2uni_test("'aM", "\u0f60\u0f7e"); // TODO(DLC)[EWTS->Tibetan]: than needs to say ewts2uni_test("'aH", "\u0f60\u0f7f"); // TODO(DLC)[EWTS->Tibetan]: than needs to say ewts2uni_test("'-i", "\u0f60\u0f80"); - ewts2uni_test("'-I", "\u0f60\u0f81"); + ewts2uni_test("'-I", "\u0f60\u0f71\u0f80"); ewts2uni_test("'a~M`", "\u0f60\u0f82"); // TODO(DLC)[EWTS->Tibetan]: than needs to say ewts2uni_test("'a~M", "\u0f60\u0f83"); // TODO(DLC)[EWTS->Tibetan]: than needs to say ewts2uni_test("'a?", "\u0f60\u0f84"); // TODO(DLC)[EWTS->Tibetan]: than needs to say @@ -429,7 +550,7 @@ public class EWTSTest extends TestCase { ewts2uni_test("'a\\u0f87", "\u0f60\u0f87"); ewts2uni_test("'aMH", "\u0f60\u0f7e\u0f7f"); // TODO(DLC)[EWTS->Tibetan]: than needs to say - ewts2uni_test("'aHM", "\u0f60\u0f7f\u0f7e"); // TODO(DLC)[EWTS->Tibetan]: than needs to say + ewts2uni_test("'aHM", "\u0f60\u0f7e\u0f7f"); // TODO(DLC)[EWTS->Tibetan]: than needs to say // Than's e-mails of Aug 10 and Aug 11, 2004 say that A+i is @@ -440,19 +561,19 @@ public class EWTSTest extends TestCase { ewts2uni_test("'e+e+e", "\u0f60\u0f7a\u0f7a\u0f7a"); // TODO(DLC)[EWTS->Tibetan]:? ewts2uni_test("'e+e+e+e", "\u0f60\u0f7a\u0f7a\u0f7a\u0f7a"); // TODO(DLC)[EWTS->Tibetan]:? ewts2uni_test("'e+e+e+e+e", "\u0f60\u0f7a\u0f7a\u0f7a\u0f7a\u0f7a"); // TODO(DLC)[EWTS->Tibetan]:? - ewts2uni_test("'o+e", "\u0f60\u0f7c\u0f7a"); - ewts2uni_test("'u+A+i+o+e", "\u0f60\u0f74\u0f71\u0f72\u0f7c\u0f7a"); - ewts2uni_test("'u+A+i+o+eHM", "\u0f60\u0f74\u0f71\u0f72\u0f7c\u0f7a\u0f7f\u0f7e"); + ewts2uni_test("'o+e", "\u0f60\u0f7a\u0f7c"); + ewts2uni_test("'u+A+i+o+e", "\u0f60\u0f71\u0f74\u0f72\u0f7a\u0f7c"); + ewts2uni_test("'u+A+i+o+eHM", "\u0f60\u0f71\u0f74\u0f72\u0f7a\u0f7c\u0f7e\u0f7f"); - ewts2uni_test("'u+A", "\u0f60\u0f74\u0f71"); + ewts2uni_test("'u+A", "\u0f60\u0f71\u0f74"); ewts2uni_test("'", "\u0f60"); ewts2uni_test("'a", "\u0f60"); - ewts2uni_test("'+r-i", "\u0f60\u0fb2\u0f80"); - ewts2uni_test("'+r-I", "\u0f60\u0fb2\u0f81"); - ewts2uni_test("'+l-i", "\u0f60\u0fb3\u0f80"); - ewts2uni_test("'+l-I", "\u0f60\u0fb3\u0f81"); + just_ewts2uni_test("'+r-i", "\u0f60\u0fb2\u0f80"); + just_ewts2uni_test("'+r-I", "\u0f60\u0fb2\u0f71\u0f80"); + just_ewts2uni_test("'+l-i", "\u0f60\u0fb3\u0f80"); + just_ewts2uni_test("'+l-I", "\u0f60\u0fb3\u0f71\u0f80"); } /** Tests that our implementation of EWTS's wowels are correct, @@ -471,7 +592,7 @@ public class EWTSTest extends TestCase { ewts2uni_test("k+ShaM", "\u0f40\u0fb5\u0f7e"); // TODO(DLC)[EWTS->Tibetan]: than needs to say ewts2uni_test("k+ShaH", "\u0f40\u0fb5\u0f7f"); // TODO(DLC)[EWTS->Tibetan]: than needs to say ewts2uni_test("k+Sh-i", "\u0f40\u0fb5\u0f80"); - ewts2uni_test("k+Sh-I", "\u0f40\u0fb5\u0f81"); + ewts2uni_test("k+Sh-I", "\u0f40\u0fb5\u0f71\u0f80"); ewts2uni_test("k+Sha~M`", "\u0f40\u0fb5\u0f82"); // TODO(DLC)[EWTS->Tibetan]: than needs to say ewts2uni_test("k+Sha~M", "\u0f40\u0fb5\u0f83"); // TODO(DLC)[EWTS->Tibetan]: than needs to say ewts2uni_test("k+Sha?", "\u0f40\u0fb5\u0f84"); // TODO(DLC)[EWTS->Tibetan]: than needs to say @@ -486,7 +607,7 @@ public class EWTSTest extends TestCase { ewts2uni_test("k+Sha\\u0f87", "\u0f40\u0fb5\u0f87"); ewts2uni_test("k+ShaMH", "\u0f40\u0fb5\u0f7e\u0f7f"); // TODO(DLC)[EWTS->Tibetan]: than needs to say - ewts2uni_test("k+ShaHM", "\u0f40\u0fb5\u0f7f\u0f7e"); // TODO(DLC)[EWTS->Tibetan]: than needs to say + ewts2uni_test("k+ShaHM", "\u0f40\u0fb5\u0f7e\u0f7f"); // TODO(DLC)[EWTS->Tibetan]: than needs to say // Than's e-mails of Aug 10 and Aug 11, 2004 say that A+i is @@ -497,18 +618,18 @@ public class EWTSTest extends TestCase { ewts2uni_test("k+She+e+e", "\u0f40\u0fb5\u0f7a\u0f7a\u0f7a"); // TODO(DLC)[EWTS->Tibetan]:? ewts2uni_test("k+She+e+e+e", "\u0f40\u0fb5\u0f7a\u0f7a\u0f7a\u0f7a"); // TODO(DLC)[EWTS->Tibetan]:? ewts2uni_test("k+She+e+e+e+e", "\u0f40\u0fb5\u0f7a\u0f7a\u0f7a\u0f7a\u0f7a"); // TODO(DLC)[EWTS->Tibetan]:? - ewts2uni_test("k+Sho+e", "\u0f40\u0fb5\u0f7c\u0f7a"); - ewts2uni_test("k+Shu+A+i+o+e", "\u0f40\u0fb5\u0f74\u0f71\u0f72\u0f7c\u0f7a"); - ewts2uni_test("k+Shu+A+i+o+eHM", "\u0f40\u0fb5\u0f74\u0f71\u0f72\u0f7c\u0f7a\u0f7f\u0f7e"); - ewts2uni_test("k+Shu+A", "\u0f40\u0fb5\u0f74\u0f71"); + ewts2uni_test("k+Sho+e", "\u0f40\u0fb5\u0f7a\u0f7c"); + ewts2uni_test("k+Shu+A+i+o+e", "\u0f40\u0fb5\u0f71\u0f74\u0f72\u0f7a\u0f7c"); + ewts2uni_test("k+Shu+A+i+o+eHM", "\u0f40\u0fb5\u0f71\u0f74\u0f72\u0f7a\u0f7c\u0f7e\u0f7f"); + ewts2uni_test("k+Shu+A", "\u0f40\u0fb5\u0f71\u0f74"); ewts2uni_test("k+Sh", "\u0f40\u0fb5"); ewts2uni_test("k+Sha", "\u0f40\u0fb5"); - ewts2uni_test("k+Sh+r-i", "\u0f40\u0fb5\u0fb2\u0f80"); - ewts2uni_test("k+Sh+r-I", "\u0f40\u0fb5\u0fb2\u0f81"); + just_ewts2uni_test("k+Sh+r-i", "\u0f40\u0fb5\u0fb2\u0f80"); + just_ewts2uni_test("k+Sh+r-I", "\u0f40\u0fb5\u0fb2\u0f71\u0f80"); ewts2uni_test("k+Sh+l-i", "\u0f40\u0fb5\u0fb3\u0f80"); - ewts2uni_test("k+Sh+l-I", "\u0f40\u0fb5\u0fb3\u0f81"); + ewts2uni_test("k+Sh+l-I", "\u0f40\u0fb5\u0fb3\u0f71\u0f80"); } /** Tests that our implementation of EWTS's wowels are correct, @@ -526,12 +647,12 @@ public class EWTSTest extends TestCase { ewts2uni_test("phywo", "\u0f55\u0fb1\u0fad\u0f7c"); ewts2uni_test("phywau", "\u0f55\u0fb1\u0fad\u0f7d"); ewts2uni_test("phyw-i", "\u0f55\u0fb1\u0fad\u0f80"); - ewts2uni_test("phyw-I", "\u0f55\u0fb1\u0fad\u0f81"); + ewts2uni_test("phyw-I", "\u0f55\u0fb1\u0fad\u0f71\u0f80"); ewts2uni_test("phyw\\u0f86", "\u0f55\u0fb1\u0fad\u0f86"); assertEquals(EWTSTraits.instance().getUnicodeForWowel("\u0f86+\u0f84"), "\u0f86\u0f84"); - ewts2uni_test("phyw\\u0f84\\u0f86", "\u0f55\u0fb1\u0fad\u0f84\u0f86"); - ewts2uni_test("phyw\\u0f84\u0f86", "\u0f55\u0fb1\u0fad\u0f84\u0f86"); + ewts2uni_test("phyw\\u0f84\\u0f86", "\u0f55\u0fb1\u0fad\u0f86\u0f84"); + ewts2uni_test("phyw\\u0f84\u0f86", "\u0f55\u0fb1\u0fad\u0f86\u0f84"); ewts2uni_test("phywa\\u0f86", "\u0f55\u0fb1\u0fad\u0f86"); ewts2uni_test("phywa\\u0f86\u0f84", "\u0f55\u0fb1\u0fad\u0f86\u0f84"); ewts2uni_test("phywa\\U0f86", "\u0f55\u0fb1\u0fad\u0f86"); @@ -552,10 +673,10 @@ public class EWTSTest extends TestCase { ewts2uni_test("phywe+e+e", "\u0f55\u0fb1\u0fad\u0f7a\u0f7a\u0f7a"); // TODO(DLC)[EWTS->Tibetan]:? ewts2uni_test("phywe+e+e+e", "\u0f55\u0fb1\u0fad\u0f7a\u0f7a\u0f7a\u0f7a"); // TODO(DLC)[EWTS->Tibetan]:? ewts2uni_test("phywe+e+e+e+e", "\u0f55\u0fb1\u0fad\u0f7a\u0f7a\u0f7a\u0f7a\u0f7a"); // TODO(DLC)[EWTS->Tibetan]:? - ewts2uni_test("phywo+e", "\u0f55\u0fb1\u0fad\u0f7c\u0f7a"); - ewts2uni_test("phywu+A+i+o+e", "\u0f55\u0fb1\u0fad\u0f74\u0f71\u0f72\u0f7c\u0f7a"); - ewts2uni_test("phywu+A+i+o+eHM", "\u0f55\u0fb1\u0fad\u0f74\u0f71\u0f72\u0f7c\u0f7a\u0f7f\u0f7e"); - ewts2uni_test("phywu+A", "\u0f55\u0fb1\u0fad\u0f74\u0f71"); + ewts2uni_test("phywo+e", "\u0f55\u0fb1\u0fad\u0f7a\u0f7c"); + ewts2uni_test("phywu+A+i+o+e", "\u0f55\u0fb1\u0fad\u0f71\u0f74\u0f72\u0f7a\u0f7c"); + ewts2uni_test("phywu+A+i+o+eHM", "\u0f55\u0fb1\u0fad\u0f71\u0f74\u0f72\u0f7a\u0f7c\u0f7e\u0f7f"); + ewts2uni_test("phywu+A", "\u0f55\u0fb1\u0fad\u0f71\u0f74"); ewts2uni_test("phyw", "\u0f55\u0fb1\u0fad"); ewts2uni_test("phywa", "\u0f55\u0fb1\u0fad"); @@ -566,7 +687,7 @@ public class EWTSTest extends TestCase { ewts2uni_test("phywa~M", "\u0f55\u0fb1\u0fad\u0f83"); // TODO(DLC)[EWTS->Tibetan]: than needs to say ewts2uni_test("phywa?", "\u0f55\u0fb1\u0fad\u0f84"); // TODO(DLC)[EWTS->Tibetan]: than needs to say ewts2uni_test("phywaMH", "\u0f55\u0fb1\u0fad\u0f7e\u0f7f"); // TODO(DLC)[EWTS->Tibetan]: than needs to say - ewts2uni_test("phywaHM", "\u0f55\u0fb1\u0fad\u0f7f\u0f7e"); // TODO(DLC)[EWTS->Tibetan]: than needs to say + ewts2uni_test("phywaHM", "\u0f55\u0fb1\u0fad\u0f7e\u0f7f"); // TODO(DLC)[EWTS->Tibetan]: than needs to say assert_EWTS_error("phywr-i"); assert_EWTS_error("phyw+r-i"); @@ -579,55 +700,55 @@ public class EWTSTest extends TestCase { * (U+0F40,U+0F97,U+0F97,U+0F90,U+0F90,U+0F97) is correct. I * chose this stack as an example of an absurd stack. */ public void test__EWTS__wowels_on_kjjkkj() { - ewts2uni_test("k+j+j+k+k+jA", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f71"); - ewts2uni_test("k+j+j+k+k+ji", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f72"); - ewts2uni_test("k+j+j+k+k+jI", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f71\u0f72"); - ewts2uni_test("k+j+j+k+k+ju", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f74"); - ewts2uni_test("k+j+j+k+k+jU", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f71\u0f74"); - ewts2uni_test("k+j+j+k+k+je", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f7a"); - ewts2uni_test("k+j+j+k+k+jai", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f7b"); - ewts2uni_test("k+j+j+k+k+jo", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f7c"); - ewts2uni_test("k+j+j+k+k+jau", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f7d"); - ewts2uni_test("k+j+j+k+k+jaM", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f7e"); // TODO(DLC)[EWTS->Tibetan]: than needs to say - ewts2uni_test("k+j+j+k+k+jaH", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f7f"); // TODO(DLC)[EWTS->Tibetan]: than needs to say - ewts2uni_test("k+j+j+k+k+j-i", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f80"); - ewts2uni_test("k+j+j+k+k+j-I", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f81"); - ewts2uni_test("k+j+j+k+k+ja~M`", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f82"); // TODO(DLC)[EWTS->Tibetan]: than needs to say - ewts2uni_test("k+j+j+k+k+ja~M", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f83"); // TODO(DLC)[EWTS->Tibetan]: than needs to say - ewts2uni_test("k+j+j+k+k+ja?", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f84"); // TODO(DLC)[EWTS->Tibetan]: than needs to say - ewts2uni_test("k+j+j+k+k+ja\\u0f86", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f86"); - ewts2uni_test("k+j+j+k+k+ja\\U0f86", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f86"); - ewts2uni_test("k+j+j+k+k+ja\\U0F86", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f86"); - ewts2uni_test("k+j+j+k+k+ja\\u0F86", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f86"); - ewts2uni_test("k+j+j+k+k+ja\\u00000f86", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f86"); - ewts2uni_test("k+j+j+k+k+ja\\u00000f86", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f86"); - ewts2uni_test("k+j+j+k+k+ja\\u00000F86", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f86"); - ewts2uni_test("k+j+j+k+k+ja\\u00000F86", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f86"); - ewts2uni_test("k+j+j+k+k+ja\\u0f87", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f87"); + just_ewts2uni_test("k+j+j+k+k+jA", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f71"); + just_ewts2uni_test("k+j+j+k+k+ji", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f72"); + just_ewts2uni_test("k+j+j+k+k+jI", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f71\u0f72"); + just_ewts2uni_test("k+j+j+k+k+ju", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f74"); + just_ewts2uni_test("k+j+j+k+k+jU", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f71\u0f74"); + just_ewts2uni_test("k+j+j+k+k+je", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f7a"); + just_ewts2uni_test("k+j+j+k+k+jai", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f7b"); + just_ewts2uni_test("k+j+j+k+k+jo", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f7c"); + just_ewts2uni_test("k+j+j+k+k+jau", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f7d"); + just_ewts2uni_test("k+j+j+k+k+jaM", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f7e"); // TODO(DLC)[EWTS->Tibetan]: than needs to say + just_ewts2uni_test("k+j+j+k+k+jaH", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f7f"); // TODO(DLC)[EWTS->Tibetan]: than needs to say + just_ewts2uni_test("k+j+j+k+k+j-i", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f80"); + just_ewts2uni_test("k+j+j+k+k+j-I", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f71\u0f80"); + just_ewts2uni_test("k+j+j+k+k+ja~M`", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f82"); // TODO(DLC)[EWTS->Tibetan]: than needs to say + just_ewts2uni_test("k+j+j+k+k+ja~M", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f83"); // TODO(DLC)[EWTS->Tibetan]: than needs to say + just_ewts2uni_test("k+j+j+k+k+ja?", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f84"); // TODO(DLC)[EWTS->Tibetan]: than needs to say + just_ewts2uni_test("k+j+j+k+k+ja\\u0f86", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f86"); + just_ewts2uni_test("k+j+j+k+k+ja\\U0f86", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f86"); + just_ewts2uni_test("k+j+j+k+k+ja\\U0F86", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f86"); + just_ewts2uni_test("k+j+j+k+k+ja\\u0F86", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f86"); + just_ewts2uni_test("k+j+j+k+k+ja\\u00000f86", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f86"); + just_ewts2uni_test("k+j+j+k+k+ja\\u00000f86", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f86"); + just_ewts2uni_test("k+j+j+k+k+ja\\u00000F86", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f86"); + just_ewts2uni_test("k+j+j+k+k+ja\\u00000F86", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f86"); + just_ewts2uni_test("k+j+j+k+k+ja\\u0f87", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f87"); - ewts2uni_test("k+j+j+k+k+jaMH", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f7e\u0f7f"); // TODO(DLC)[EWTS->Tibetan]: than needs to say - ewts2uni_test("k+j+j+k+k+jaHM", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f7f\u0f7e"); // TODO(DLC)[EWTS->Tibetan]: than needs to say + just_ewts2uni_test("k+j+j+k+k+jaMH", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f7e\u0f7f"); // TODO(DLC)[EWTS->Tibetan]: than needs to say + just_ewts2uni_test("k+j+j+k+k+jaHM", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f7e\u0f7f"); // TODO(DLC)[EWTS->Tibetan]: than needs to say // Than's e-mails of Aug 10 and Aug 11, 2004 say that A+i is // the same as I and o+o is the same as au. - ewts2uni_test("k+j+j+k+k+jA+i", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f71\u0f72"); - ewts2uni_test("k+j+j+k+k+jo+o", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f7c\u0f7c"); - ewts2uni_test("k+j+j+k+k+je+e", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f7a\u0f7a"); - ewts2uni_test("k+j+j+k+k+je+e+e", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f7a\u0f7a\u0f7a"); // TODO(DLC)[EWTS->Tibetan]:? - ewts2uni_test("k+j+j+k+k+je+e+e+e", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f7a\u0f7a\u0f7a\u0f7a"); // TODO(DLC)[EWTS->Tibetan]:? - ewts2uni_test("k+j+j+k+k+je+e+e+e+e", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f7a\u0f7a\u0f7a\u0f7a\u0f7a"); // TODO(DLC)[EWTS->Tibetan]:? - ewts2uni_test("k+j+j+k+k+jo+e", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f7c\u0f7a"); - ewts2uni_test("k+j+j+k+k+ju+A+i+o+e", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f74\u0f71\u0f72\u0f7c\u0f7a"); - ewts2uni_test("k+j+j+k+k+ju+A+i+o+eHM", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f74\u0f71\u0f72\u0f7c\u0f7a\u0f7f\u0f7e"); - ewts2uni_test("k+j+j+k+k+ju+A", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f74\u0f71"); + just_ewts2uni_test("k+j+j+k+k+jA+i", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f71\u0f72"); + just_ewts2uni_test("k+j+j+k+k+jo+o", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f7c\u0f7c"); + just_ewts2uni_test("k+j+j+k+k+je+e", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f7a\u0f7a"); + just_ewts2uni_test("k+j+j+k+k+je+e+e", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f7a\u0f7a\u0f7a"); // TODO(DLC)[EWTS->Tibetan]:? + just_ewts2uni_test("k+j+j+k+k+je+e+e+e", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f7a\u0f7a\u0f7a\u0f7a"); // TODO(DLC)[EWTS->Tibetan]:? + just_ewts2uni_test("k+j+j+k+k+je+e+e+e+e", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f7a\u0f7a\u0f7a\u0f7a\u0f7a"); // TODO(DLC)[EWTS->Tibetan]:? + just_ewts2uni_test("k+j+j+k+k+jo+e", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f7a\u0f7c"); + just_ewts2uni_test("k+j+j+k+k+ju+A+i+o+e", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f71\u0f74\u0f72\u0f7a\u0f7c"); + just_ewts2uni_test("k+j+j+k+k+ju+A+i+o+eHM", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f71\u0f74\u0f72\u0f7a\u0f7c\u0f7e\u0f7f"); + just_ewts2uni_test("k+j+j+k+k+ju+A", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f71\u0f74"); - ewts2uni_test("k+j+j+k+k+j", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97"); - ewts2uni_test("k+j+j+k+k+ja", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97"); - ewts2uni_test("k+j+j+k+k+j+r-i", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0fb2\u0f80"); - ewts2uni_test("k+j+j+k+k+j+r-I", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0fb2\u0f81"); - ewts2uni_test("k+j+j+k+k+j+l-i", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0fb3\u0f80"); - ewts2uni_test("k+j+j+k+k+j+l-I", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0fb3\u0f81"); + just_ewts2uni_test("k+j+j+k+k+j", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97"); + just_ewts2uni_test("k+j+j+k+k+ja", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97"); + just_ewts2uni_test("k+j+j+k+k+j+r-i", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0fb2\u0f80"); + just_ewts2uni_test("k+j+j+k+k+j+r-I", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0fb2\u0f71\u0f80"); + just_ewts2uni_test("k+j+j+k+k+j+l-i", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0fb3\u0f80"); + just_ewts2uni_test("k+j+j+k+k+j+l-I", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0fb3\u0f71\u0f80"); } /** Tests that the EWTS that the spec says corresponds to each @@ -644,14 +765,16 @@ public class EWTSTest extends TestCase { ewts2uni_test("\\u0000", "\u0000"); ewts2uni_test("\\u0eff", "\u0eff"); } - ewts2uni_test("\\u0f00", "\u0f00"); - ewts2uni_test("\\u0f40", "\u0f40"); + just_ewts2uni_test("\\u0f00", "\u0f00"); // TODO(DLC)[EWTS->Tibetan]: ewts->tmw is broken + just_ewts2uni_test("\\u0F02", "\u0F02"); // TODO(DLC)[EWTS->Tibetan]: ewts->tmw is broken + just_ewts2uni_test("\\u0F03", "\u0F03"); // TODO(DLC)[EWTS->Tibetan]: ewts->tmw is broken + just_ewts2uni_test("\\u0f40", "\u0f40"); if (RUN_FAILING_TESTS) { assert_EWTS_error("\\u0f70"); // reserved codepoint assert_EWTS_error("\\u0fff"); // reserved codepoint - ewts2uni_test("\\uf000", "\uf000"); - ewts2uni_test("\\uf01f", "\uf01f"); - ewts2uni_test("\\uefff", "\uefff"); + just_ewts2uni_test("\\uf000", "\uf000"); + just_ewts2uni_test("\\uf01f", "\uf01f"); + just_ewts2uni_test("\\uefff", "\uefff"); } @@ -661,11 +784,11 @@ public class EWTSTest extends TestCase { ewts2uni_test("f", "\u0F55\u0F39"); ewts2uni_test("\u0f88+ka", "\u0f88\u0f90"); ewts2uni_test("\u0f88+kha", "\u0f88\u0f91"); + ewts2uni_test("\\u0f88+ka", "\u0f88\u0f90"); + ewts2uni_test("\\u0f88+kha", "\u0f88\u0f91"); ewts2uni_test("oM", false ? "\u0F00" : "\u0f68\u0f7c\u0f7e"); // TODO(DLC)[EWTS->Tibetan]: which is correct? see e-mail (maybe it was cfynn who thought \u0F00 ought not be generated? ewts2uni_test("\\u0F01", "\u0F01"); - ewts2uni_test("\\u0F02", "\u0F02"); - ewts2uni_test("\\u0F03", "\u0F03"); ewts2uni_test("@", "\u0F04"); ewts2uni_test("#", "\u0F05"); // TODO(DLC)[EWTS->Tibetan]: warning/error? [#] alone is nonsense. ewts2uni_test("$", "\u0F06"); @@ -777,9 +900,9 @@ public class EWTSTest extends TestCase { ewts2uni_test("u", achen + "\u0F74"); ewts2uni_test("U", achen + "\u0F71\u0F74"); ewts2uni_test("a+r-i", achen + "\u0fb2\u0f80"); // not 0F76, which is discouraged by the Unicode standard - ewts2uni_test("a+r-I", achen + "\u0fb2\u0f81"); // not 0F77, which is discouraged by the Unicode standard - ewts2uni_test("a+l-i", achen + "\u0fb3\u0f80"); // not 0F78, which is discouraged by the Unicode standard - ewts2uni_test("a+l-I", achen + "\u0fb3\u0f81"); // not 0F79, which is discouraged by the Unicode standard + ewts2uni_test("a+r-I", achen + "\u0fb2\u0f71\u0f80"); // not 0F77, which is discouraged by the Unicode standard + just_ewts2uni_test("a+l-i", achen + "\u0fb3\u0f80"); // not 0F78, which is discouraged by the Unicode standard + just_ewts2uni_test("a+l-I", achen + "\u0fb3\u0f71\u0f80"); // not 0F79, which is discouraged by the Unicode standard ewts2uni_test("e", achen + "\u0F7A"); ewts2uni_test("ai", achen + "\u0F7B"); ewts2uni_test("o", achen + "\u0F7C"); @@ -787,7 +910,7 @@ public class EWTSTest extends TestCase { ewts2uni_test("M", achen + "\u0F7E"); ewts2uni_test("H", achen + "\u0F7F"); ewts2uni_test("-i", achen + "\u0F80"); - ewts2uni_test("-I", achen + "\u0F81"); + ewts2uni_test("-I", achen + "\u0F71\u0F80"); ewts2uni_test("~M`", achen + "\u0F82"); ewts2uni_test("~M", achen + "\u0F83"); ewts2uni_test("?", achen + "\u0F84"); // \u0f84 is a combiner @@ -799,8 +922,8 @@ public class EWTSTest extends TestCase { ewts2uni_test("\\u0F8A", "\u0F8A"); ewts2uni_test("\\u0F8B", "\u0F8B"); - final String ewts_for_superscript = "tsh+"; - final String unicode_for_superscript = "\u0f5a"; + final String ewts_for_superscript = "r+"; + final String unicode_for_superscript = "\u0f62"; ewts2uni_test(ewts_for_superscript + "k", unicode_for_superscript + "\u0F90"); ewts2uni_test(ewts_for_superscript + "kh", @@ -812,10 +935,10 @@ public class EWTSTest extends TestCase { + (false ? "\u0F93" : "\u0f92\u0fb7")); ewts2uni_test(ewts_for_superscript + "ng", unicode_for_superscript + "\u0F94"); - ewts2uni_test(ewts_for_superscript + "c", - unicode_for_superscript + "\u0F95"); - ewts2uni_test(ewts_for_superscript + "ch", - unicode_for_superscript + "\u0F96"); + just_ewts2uni_test(ewts_for_superscript + "c", + unicode_for_superscript + "\u0F95"); + just_ewts2uni_test(ewts_for_superscript + "ch", + unicode_for_superscript + "\u0F96"); ewts2uni_test(ewts_for_superscript + "j", unicode_for_superscript + "\u0F97"); ewts2uni_test(ewts_for_superscript + "ny", @@ -826,9 +949,9 @@ public class EWTSTest extends TestCase { unicode_for_superscript + "\u0F9B"); ewts2uni_test(ewts_for_superscript + "D", unicode_for_superscript + "\u0F9C"); - ewts2uni_test(ewts_for_superscript + "D+h", - unicode_for_superscript - + (false ? "\u0F9D" : "\u0f9c\u0fb7")); + just_ewts2uni_test(ewts_for_superscript + "D+h", + unicode_for_superscript + + (false ? "\u0F9D" : "\u0f9c\u0fb7")); ewts2uni_test(ewts_for_superscript + "N", unicode_for_superscript + "\u0F9E"); ewts2uni_test(ewts_for_superscript + "t", @@ -844,8 +967,8 @@ public class EWTSTest extends TestCase { unicode_for_superscript + "\u0FA3"); ewts2uni_test(ewts_for_superscript + "p", unicode_for_superscript + "\u0FA4"); - ewts2uni_test(ewts_for_superscript + "ph", - unicode_for_superscript + "\u0FA5"); + just_ewts2uni_test(ewts_for_superscript + "ph", + unicode_for_superscript + "\u0FA5"); ewts2uni_test(ewts_for_superscript + "b", unicode_for_superscript + "\u0FA6"); ewts2uni_test(ewts_for_superscript + "b+h", @@ -859,119 +982,122 @@ public class EWTSTest extends TestCase { unicode_for_superscript + "\u0FAA"); ewts2uni_test(ewts_for_superscript + "dz", unicode_for_superscript + "\u0FAB"); - ewts2uni_test(ewts_for_superscript + "dz+h", - unicode_for_superscript - + (false ? "\u0FAC" : "\u0fab\u0fb7")); + just_ewts2uni_test(ewts_for_superscript + "dz+h", + unicode_for_superscript + + (false ? "\u0FAC" : "\u0fab\u0fb7")); ewts2uni_test(ewts_for_superscript + "w", unicode_for_superscript + "\u0FAD"); - ewts2uni_test(ewts_for_superscript + "zh", - unicode_for_superscript + "\u0FAE"); - ewts2uni_test(ewts_for_superscript + "z", - unicode_for_superscript + "\u0FAF"); - ewts2uni_test(ewts_for_superscript + "'", - unicode_for_superscript + "\u0FB0"); - ewts2uni_test(ewts_for_superscript + "y", - unicode_for_superscript + "\u0FB1"); - ewts2uni_test(ewts_for_superscript + "r", - unicode_for_superscript + "\u0FB2"); + just_ewts2uni_test(ewts_for_superscript + "zh", + unicode_for_superscript + "\u0FAE"); + just_ewts2uni_test(ewts_for_superscript + "z", + unicode_for_superscript + "\u0FAF"); + just_ewts2uni_test(ewts_for_superscript + "'", + unicode_for_superscript + "\u0FB0"); + just_ewts2uni_test(ewts_for_superscript + "y", + unicode_for_superscript + "\u0FB1"); + just_ewts2uni_test(ewts_for_superscript + "r", + unicode_for_superscript + "\u0FB2"); ewts2uni_test(ewts_for_superscript + "l", unicode_for_superscript + "\u0FB3"); - ewts2uni_test(ewts_for_superscript + "sh", - unicode_for_superscript + "\u0FB4"); - ewts2uni_test(ewts_for_superscript + "Sh", - unicode_for_superscript + "\u0FB5"); - ewts2uni_test(ewts_for_superscript + "s", - unicode_for_superscript + "\u0FB6"); + just_ewts2uni_test(ewts_for_superscript + "sh", + unicode_for_superscript + "\u0FB4"); + just_ewts2uni_test(ewts_for_superscript + "Sh", + unicode_for_superscript + "\u0FB5"); + just_ewts2uni_test(ewts_for_superscript + "s", + unicode_for_superscript + "\u0FB6"); ewts2uni_test(ewts_for_superscript + "h", unicode_for_superscript + "\u0FB7"); - ewts2uni_test(ewts_for_superscript + "a", - unicode_for_superscript + "\u0FB8"); + just_ewts2uni_test(ewts_for_superscript + "a", + unicode_for_superscript + "\u0FB8"); ewts2uni_test(ewts_for_superscript + "k+Sh", unicode_for_superscript + (false ? "\u0FB9" : "\u0f90\u0fb5")); - ewts2uni_test(ewts_for_superscript + "W", - unicode_for_superscript + "\u0FBA"); - ewts2uni_test(ewts_for_superscript + "Y", - unicode_for_superscript + "\u0FBB"); - ewts2uni_test(ewts_for_superscript + "R", - unicode_for_superscript + "\u0FBC"); + just_ewts2uni_test(ewts_for_superscript + "W", + unicode_for_superscript + "\u0FBA"); + just_ewts2uni_test(ewts_for_superscript + "Y", + unicode_for_superscript + "\u0FBB"); + just_ewts2uni_test(ewts_for_superscript + "R", + unicode_for_superscript + "\u0FBC"); - ewts2uni_test("\\u0FBE", "\u0FBE"); - ewts2uni_test("\\u0FBF", "\u0FBF"); - ewts2uni_test("\\u0FC0", "\u0FC0"); - ewts2uni_test("\\u0FC1", "\u0FC1"); - ewts2uni_test("\\u0FC2", "\u0FC2"); - ewts2uni_test("\\u0FC3", "\u0FC3"); - ewts2uni_test("\\u0FC4", "\u0FC4"); - ewts2uni_test("\\u0FC5", "\u0FC5"); - ewts2uni_test("\\u0FC6", achen + "\u0FC6"); // \u0fc6 is a combiner - ewts2uni_test("\\u0FC7", "\u0FC7"); - ewts2uni_test("\\u0FC8", "\u0FC8"); - ewts2uni_test("\\u0FC9", "\u0FC9"); - ewts2uni_test("\\u0FCA", "\u0FCA"); - ewts2uni_test("\\u0FCB", "\u0FCB"); - ewts2uni_test("\\u0FCC", "\u0FCC"); - ewts2uni_test("\\u0FCF", "\u0FCF"); - ewts2uni_test("\\u0FD0", "\u0FD0"); - ewts2uni_test("\\u0FD1", "\u0FD1"); + just_ewts2uni_test("\\u0FBE", "\u0FBE"); + just_ewts2uni_test("\\u0FBF", "\u0FBF"); + just_ewts2uni_test("\\u0FC0", "\u0FC0"); + just_ewts2uni_test("\\u0FC1", "\u0FC1"); + just_ewts2uni_test("\\u0FC2", "\u0FC2"); + just_ewts2uni_test("\\u0FC3", "\u0FC3"); + just_ewts2uni_test("\\u0FC4", "\u0FC4"); + just_ewts2uni_test("\\u0FC5", "\u0FC5"); + just_ewts2uni_test("\\u0FC6", achen + "\u0FC6"); // \u0fc6 is a combiner + just_ewts2uni_test("\\u0FC7", "\u0FC7"); + just_ewts2uni_test("\\u0FC8", "\u0FC8"); + just_ewts2uni_test("\\u0FC9", "\u0FC9"); + just_ewts2uni_test("\\u0FCA", "\u0FCA"); + just_ewts2uni_test("\\u0FCB", "\u0FCB"); + just_ewts2uni_test("\\u0FCC", "\u0FCC"); + just_ewts2uni_test("\\u0FCF", "\u0FCF"); + just_ewts2uni_test("\\u0FD0", "\u0FD0"); + just_ewts2uni_test("\\u0FD1", "\u0FD1"); ewts2uni_test("_", "\u00a0"); // tibwn.ini says that the Unicode spec wants a non-breaking space. ewts2uni_test("\\u534D", "\u534D"); ewts2uni_test("\\u5350", "\u5350"); + ewts2uni_test("\u534D", "\u534D"); + ewts2uni_test("\u5350", "\u5350"); ewts2uni_test("\\u0F88+k", "\u0F88\u0F90"); ewts2uni_test("\\u0F88+kh", "\u0F88\u0F91"); /* TODO(DLC)[EWTS->Tibetan]: Do we want to ever generate \uf021? (NOT \u0f21, but the private-use area (PUA) of Unicode). EWTS->TMW and this - makes sense, but EWTS->Unicode? */ - ewts2uni_test("\\uF021", "\uF021"); - ewts2uni_test("\\uF022", "\uF022"); - ewts2uni_test("\\uF023", "\uF023"); - ewts2uni_test("\\uF024", "\uF024"); - ewts2uni_test("\\uF025", "\uF025"); - ewts2uni_test("\\uF026", "\uF026"); - ewts2uni_test("\\uF027", "\uF027"); - ewts2uni_test("\\uF028", "\uF028"); - ewts2uni_test("\\uF029", "\uF029"); - ewts2uni_test("\\uF02A", "\uF02A"); - ewts2uni_test("\\uF02B", "\uF02B"); - ewts2uni_test("\\uF02C", "\uF02C"); - ewts2uni_test("\\uF02D", "\uF02D"); - ewts2uni_test("\\uF02E", "\uF02E"); - ewts2uni_test("\\uF02F", "\uF02F"); - ewts2uni_test("\\uF030", "\uF030"); - ewts2uni_test("\\uF031", "\uF031"); - ewts2uni_test("\\uF032", "\uF032"); - ewts2uni_test("\\uF033", "\uF033"); - ewts2uni_test("\\uF034", "\uF034"); - ewts2uni_test("\\uF035", "\uF035"); - ewts2uni_test("\\uF036", "\uF036"); - ewts2uni_test("\\uF037", "\uF037"); - ewts2uni_test("\\uF038", "\uF038"); - ewts2uni_test("\\uF039", "\uF039"); - ewts2uni_test("\\uF03A", "\uF03A"); - ewts2uni_test("\\uF03B", "\uF03B"); - ewts2uni_test("\\uF03C", "\uF03C"); - ewts2uni_test("\\uF03D", "\uF03D"); - ewts2uni_test("\\uF03E", "\uF03E"); - ewts2uni_test("\\uF03F", "\uF03F"); - ewts2uni_test("\\uF040", "\uF040"); - ewts2uni_test("\\uF041", "\uF041"); - ewts2uni_test("\\uF042", "\uF042"); + makes sense, but EWTS->Unicode? Shouldn't we match the + behavior of TMW->Unicode, regardless? */ + just_ewts2uni_test("\\uF021", "\uF021"); + just_ewts2uni_test("\\uF022", "\uF022"); + just_ewts2uni_test("\\uF023", "\uF023"); + just_ewts2uni_test("\\uF024", "\uF024"); + just_ewts2uni_test("\\uF025", "\uF025"); + just_ewts2uni_test("\\uF026", "\uF026"); + just_ewts2uni_test("\\uF027", "\uF027"); + just_ewts2uni_test("\\uF028", "\uF028"); + just_ewts2uni_test("\\uF029", "\uF029"); + just_ewts2uni_test("\\uF02A", "\uF02A"); + just_ewts2uni_test("\\uF02B", "\uF02B"); + just_ewts2uni_test("\\uF02C", "\uF02C"); + just_ewts2uni_test("\\uF02D", "\uF02D"); + just_ewts2uni_test("\\uF02E", "\uF02E"); + just_ewts2uni_test("\\uF02F", "\uF02F"); + just_ewts2uni_test("\\uF030", "\uF030"); + just_ewts2uni_test("\\uF031", "\uF031"); + just_ewts2uni_test("\\uF032", "\uF032"); + just_ewts2uni_test("\\uF033", "\uF033"); + just_ewts2uni_test("\\uF034", "\uF034"); + just_ewts2uni_test("\\uF035", "\uF035"); + just_ewts2uni_test("\\uF036", "\uF036"); + just_ewts2uni_test("\\uF037", "\uF037"); + just_ewts2uni_test("\\uF038", "\uF038"); + just_ewts2uni_test("\\uF039", "\uF039"); + just_ewts2uni_test("\\uF03A", "\uF03A"); + just_ewts2uni_test("\\uF03B", "\uF03B"); + just_ewts2uni_test("\\uF03C", "\uF03C"); + just_ewts2uni_test("\\uF03D", "\uF03D"); + just_ewts2uni_test("\\uF03E", "\uF03E"); + just_ewts2uni_test("\\uF03F", "\uF03F"); + just_ewts2uni_test("\\uF040", "\uF040"); + just_ewts2uni_test("\\uF041", "\uF041"); + just_ewts2uni_test("\\uF042", "\uF042"); } public void test__EWTS__long_wowels() { - ewts2uni_test("k-I~M`~X", "\u0f40\u0f81\u0f82\u0f35"); // TODO(DLC)[EWTS->Tibetan]: actually the 0f68 stuff could be true... ask + ewts2uni_test("k-I~M`~X", "\u0f40\u0f71\u0f80\u0f82\u0f35"); // TODO(DLC)[EWTS->Tibetan]: actually the 0f68 stuff could be true... ask } public void test__EWTS__32bit_unicode_escapes() { assert_EWTS_error("\\u00010000"); // TODO(dchandler): make it work - ewts2uni_test("\\uF0010000", - "[#ERROR ERROR TODO(DLC)[EWTS->Tibetan]: this character is illegal in EWTS: \\]\u0f68\u0f74[#ERROR ERROR TODO(DLC)[EWTS->Tibetan]: this character is illegal in EWTS: F]\u0f20\u0f20\u0f21\u0f20\u0f20\u0f20\u0f20"); // TODO(dchandler): make it work. Until you can, TODO(DLC)[EWTS->Tibetan]: make the following work: + just_ewts2uni_test("\\uF0010000", + "[#ERROR ERROR TODO(DLC)[EWTS->Tibetan]: this character is illegal in EWTS: \\]\u0f68\u0f74[#ERROR ERROR TODO(DLC)[EWTS->Tibetan]: this character is illegal in EWTS: F]\u0f20\u0f20\u0f21\u0f20\u0f20\u0f20\u0f20"); // TODO(dchandler): make it work. Until you can, TODO(DLC)[EWTS->Tibetan]: make the following work: if (RUN_FAILING_TESTS) assert_EWTS_error("\\uF0010000"); // TODO(DLC)[EWTS->Tibetan]: error subsystem is hosed if (RUN_FAILING_TESTS) { - ewts2uni_test("\\ucafe0000", - "[#ERROR Sorry, we don't yet support Unicode escape sequences above 0x0000FFFF! File a bug.]"); + just_ewts2uni_test("\\ucafe0000", + "[#ERROR Sorry, we don't yet support Unicode escape sequences above 0x0000FFFF! File a bug.]"); // TODO(dchandler): make it "\ucafe0000"); ewts2uni_test("\\ucafe0eff", "\ucafe0eff"); ewts2uni_test("\\ucafe0eff", "\ucafe0eff"); @@ -1003,8 +1129,8 @@ public class EWTSTest extends TestCase { ewts2uni_test("\\u00000000", "\u0000"); ewts2uni_test("\\u00000eff", "\u0eff"); } - ewts2uni_test("\\u00000f00", "\u0f00"); - ewts2uni_test("\\u00000f40", "\u0f40"); + just_ewts2uni_test("\\u00000f00", "\u0f00"); // TODO(DLC)[EWTS->Tibetan]: EWTS->TMW is broken for this + just_ewts2uni_test("\\u00000f40", "\u0f40"); if (RUN_FAILING_TESTS) { ewts2uni_test("\\u00000f70", "\u0f70"); ewts2uni_test("\\u00000fff", "\u0fff"); @@ -1089,22 +1215,33 @@ public class EWTSTest extends TestCase { if (RUN_FAILING_TESTS) { ewts2uni_test("'a+r-i", "\u0f60\u0fb2\u0f80"); // TODO(DLC)[EWTS->Tibetan]: NOW: prefix rules should make this invalid! - ewts2uni_test("'a+r-I", "\u0f60\u0fb2\u0f81"); + ewts2uni_test("'a+r-I", "\u0f60\u0fb2\u0f71\u0f80"); ewts2uni_test("'a+l-i", "\u0f60\u0fb3\u0f80");// TODO(DLC)[EWTS->Tibetan]: NOW error handling is CRAP - ewts2uni_test("'a+l-I", "\u0f60\u0fb3\u0f81"); + ewts2uni_test("'a+l-I", "\u0f60\u0fb3\u0f71\u0f80"); } } public void testMoreMiscellany() { + ewts2uni_test("k+Sh+R-i", "\u0f40\u0fb5\u0fbc\u0f80"); + + ewts2uni_test("k\\u0f35", "\u0f40\u0f35"); + ewts2uni_test("k\\u0f72", "\u0f40\u0f72"); + ewts2uni_test("k\\u0f73", "\u0f40\u0f71\u0f72"); + ewts2uni_test("k\\u0f75", "\u0f40\u0f71\u0f74"); + ewts2uni_test("k\\u0f3e", "\u0f40\u0f3e"); + ewts2uni_test("k\\u0f3f", "\u0f40\u0f3f"); + + ewts2uni_test("kHai", "\u0f40\u0f7f\u0f68\u0f7b"); // TODO(DLC)[EWTS->Tibetan]: Is this correct? + ewts2uni_test("r-i", "\u0f62\u0f80"); - ewts2uni_test("r-I", "\u0f62\u0f81"); + ewts2uni_test("r-I", "\u0f62\u0f71\u0f80"); ewts2uni_test("l-i", "\u0f63\u0f80"); - ewts2uni_test("l-I", "\u0f63\u0f81"); - ewts2uni_test("ga\u0f0bga ga\\u0F0bga", - "\u0f42\u0f0b\u0f42\u0f0b\u0f42\u0f0b\u0f42"); - ewts2uni_test("ga\u0f0cga*ga\\u0f0Cga", - "\u0f42\u0f0c\u0f42\u0f0c\u0f42\u0f0c\u0f42"); + ewts2uni_test("l-I", "\u0f63\u0f71\u0f80"); + just_ewts2uni_test("ga\u0f0bga ga\\u0F0bga", + "\u0f42\u0f0b\u0f42\u0f0b\u0f42\u0f0b\u0f42"); + just_ewts2uni_test("ga\u0f0cga*ga\\u0f0Cga", + "\u0f42\u0f0c\u0f42\u0f0c\u0f42\u0f0c\u0f42"); ewts2uni_test("'jam", "\u0f60\u0f47\u0f58"); ewts2uni_test("jamX 'jam~X", diff --git a/source/org/thdl/tib/text/ttt/EWTSTraits.java b/source/org/thdl/tib/text/ttt/EWTSTraits.java index d489076..35149db 100644 --- a/source/org/thdl/tib/text/ttt/EWTSTraits.java +++ b/source/org/thdl/tib/text/ttt/EWTSTraits.java @@ -21,6 +21,7 @@ Contributor(s): ______________________________________. package org.thdl.tib.text.ttt; import java.util.ArrayList; +import java.util.HashMap; import org.thdl.tib.text.tshegbar.UnicodeUtils; import org.thdl.tib.text.DuffCode; @@ -102,124 +103,172 @@ public final class EWTSTraits implements TTraits { public boolean isWowel(String s) { return (getUnicodeForWowel(s) != null); - /* TODO(DLC)[EWTS->Tibetan]: test ko+m+e etc. - // TODO(DLC)[EWTS->Tibetan]: all non-consonant combiners? 0f71 0f87 etc.? - if (s.length() == 1 && isUnicodeWowel(s.charAt(0))) return true; - return ("a".equals(s) - || "e".equals(s) - || "i".equals(s) - || "o".equals(s) - || "u".equals(s) - || "U".equals(s) - || "I".equals(s) - || "A".equals(s) - || "-i".equals(s) - || "-I".equals(s) - || "au".equals(s) - || "ai".equals(s) - || isWowelThatRequiresAChen(s)); - // TODO(DLC)[EWTS->Tibetan]:??? - */ } - public String aVowel() { return "a"; } + public String aVowel() { return THDLWylieConstants.WYLIE_aVOWEL; } public boolean isPostsuffix(String s) { return ("s".equals(s) || "d".equals(s)); } public boolean isPrefix(String l) { - return ("'".equals(l) - || "m".equals(l) - || "b".equals(l) - || "d".equals(l) - || "g".equals(l)); + return (THDLWylieConstants.ACHUNG.equals(l) + || THDLWylieConstants.MA.equals(l) + || THDLWylieConstants.BA.equals(l) + || THDLWylieConstants.DA.equals(l) + || THDLWylieConstants.GA.equals(l)); } public boolean isSuffix(String l) { - return ("s".equals(l) - || "g".equals(l) - || "d".equals(l) - || "m".equals(l) - || "'".equals(l) - || "b".equals(l) - || "ng".equals(l) - || "n".equals(l) - || "l".equals(l) - || "r".equals(l)); + return (isPrefix(l) + || THDLWylieConstants.SA.equals(l) + || THDLWylieConstants.NGA.equals(l) + || THDLWylieConstants.NA.equals(l) + || THDLWylieConstants.LA.equals(l) + || THDLWylieConstants.RA.equals(l)); } - /** Returns l, since this is EWTS's traits class. */ - public String getEwtsForConsonant(String l) { return l; } + /** Returns the best EWTS for l, which is often l but not always + * thanks to Unicode escapes. NOTE: For "\u0f42", you don't want + * to return "g" lest "\\u0f42ya " become the wrong thing under + * EWTS->Unicode. */ + public String getEwtsForConsonant(String l) { + return helpGetEwts(l); + } - /** Returns l, since this is EWTS's traits class. */ - public String getEwtsForOther(String l) { return l; } + /** Returns the best EWTS for l, which is often l but not always + * thanks to Unicode escapes. */ + public String getEwtsForOther(String l) { + return helpGetEwts(l); + } + + private String helpGetEwts(String l) { + if (l.length() == 1 + && ((l.charAt(0) >= THDLWylieConstants.PUA_MIN + && l.charAt(0) <= THDLWylieConstants.PUA_MAX) + || 0 <= "\u0F01\u0F09\u0F0A\u0F10\u0F12\u0F13\u0F15\u0F16\u0F17\u0F18\u0F19\u0F1A\u0F1B\u0F1C\u0F1D\u0F1E\u0F1F\u0F2A\u0F2B\u0F2C\u0F2D\u0F2E\u0F2F\u0F30\u0F31\u0F32\u0F33\u0F36\u0F38\u0F86\u0F87\u0F88\u0F89\u0F8A\u0F8B\u0FBE\u0FBF\u0FC0\u0FC1\u0FC2\u0FC3\u0FC4\u0FC5\u0FC6\u0FC7\u0FC8\u0FC9\u0FCA\u0FCB\u0FCC\u0FCF\u5350\u534D".indexOf(l.charAt(0)))) { + return UnicodeUtils.unicodeCodepointToString(l.charAt(0), false, "\\u", true); + } + if (false) { // TODO(dchandler): it's too late in the game to do this. EWTS->TMW is broken for \u0f00, \u0f02, and \u0f03 right now, fix that. + if ("\u0f02".equals(l)) return "u~M`H"; // too long for a single hash key, see? + if ("\u0f03".equals(l)) return "u~M`:"; // ditto + } + return l; + } /** Returns l, since this is EWTS's traits class. */ public String getEwtsForWowel(String l) { return l; } public TTshegBarScanner scanner() { return EWTSTshegBarScanner.instance(); } - public void getDuffForWowel(ArrayList duff, DuffCode preceding, String wowel) - throws IllegalArgumentException + /** If needle is found in haystack, then haystack without the + * first instance of needle is returned. Otherwise haystack + * itself is returned. */ + private static String removeFirstMatch(String haystack, String needle) { + int ix; + if ((ix = haystack.indexOf(needle)) >= 0) { + StringBuffer sb = new StringBuffer(haystack); + sb.replace(ix, ix + needle.length(), ""); + return sb.toString(); + } + return haystack; + } + + private static HashMap bestEwtsMap = null; + private static String getBestEwtsForSingleWowel(String wowel) { + // NOTE: Not MT-safe + if (null == bestEwtsMap) { + bestEwtsMap = new HashMap(20); + // Unicode-escape sequences are handled early. To be + // correct, we must "unescape" here any Unicode escape to + // whatever tibwn.ini has. (TODO(dchandler): tibwn.ini + // has this info, use that instead of duplicating it in + // this code.) + bestEwtsMap.put("\u0f18", THDLWylieConstants.U0F18); + bestEwtsMap.put("\u0f19", THDLWylieConstants.U0F19); + bestEwtsMap.put("\u0f35", THDLWylieConstants.U0F35); + bestEwtsMap.put("\u0f37", THDLWylieConstants.U0F37); + bestEwtsMap.put("\u0f39", THDLWylieConstants.WYLIE_TSA_PHRU); + bestEwtsMap.put("\u0f3e", THDLWylieConstants.U0F3E); + bestEwtsMap.put("\u0f3f", THDLWylieConstants.U0F3F); + bestEwtsMap.put("\u0f84", THDLWylieConstants.U0F84); + bestEwtsMap.put("\u0f86", THDLWylieConstants.U0F86); + bestEwtsMap.put("\u0f87", THDLWylieConstants.U0F87); + bestEwtsMap.put("\u0fc6", THDLWylieConstants.U0FC6); + + bestEwtsMap.put("\u0f71", THDLWylieConstants.A_VOWEL); + bestEwtsMap.put("\u0f72", THDLWylieConstants.i_VOWEL); + bestEwtsMap.put("\u0f74", THDLWylieConstants.u_VOWEL); + bestEwtsMap.put("\u0f7a", THDLWylieConstants.e_VOWEL); + bestEwtsMap.put("\u0f7b", THDLWylieConstants.ai_VOWEL); + bestEwtsMap.put("\u0f7c", THDLWylieConstants.o_VOWEL); + bestEwtsMap.put("\u0f7d", THDLWylieConstants.au_VOWEL); + bestEwtsMap.put("\u0f7e", THDLWylieConstants.BINDU); + bestEwtsMap.put("\u0f80", THDLWylieConstants.reverse_i_VOWEL); + bestEwtsMap.put("\u0f81", THDLWylieConstants.reverse_I_VOWEL); + + bestEwtsMap.put("\u0f73", THDLWylieConstants.I_VOWEL); // not in tibwn.ini + bestEwtsMap.put("\u0f75", THDLWylieConstants.U_VOWEL); // not in tibwn.ini + } + String mapping = (String)bestEwtsMap.get(wowel); + if (null != mapping) + return mapping; + else + return wowel; + } + + public void getDuffForWowel(ArrayList duff, DuffCode preceding, + String wowel) + throws ConversionException { - // TODO(DLC)[EWTS->Tibetan]: I have no confidence in this! test, test, test. + boolean preceding_added[] = new boolean[] { false }; + String[] wowels = wowel.split("\\+"); + for (int i = 0; i < wowels.length; i++) { + getDuffForSingleWowel(duff, preceding, + getBestEwtsForSingleWowel(wowels[i]), + preceding_added); + } + } - // TODO(DLC)[EWTS->Tibetan]: ko+o doesn't work. kai+-i doesn't work. - - // TODO(DLC)[EWTS->Tibetan]: kai doesn't work. - - // Order matters here. - boolean context_added[] = new boolean[] { false }; - if (wowel.equals(THDLWylieConstants.WYLIE_aVOWEL)) { - TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.WYLIE_aVOWEL, context_added); + /** Wowels can stack. This works on a single wowel. */ + private void getDuffForSingleWowel(ArrayList duff, DuffCode preceding, + String wowel, boolean preceding_added[]) + throws ConversionException + { + if (wowel.equals(THDLWylieConstants.WYLIE_aVOWEL)) { // TODO(dchandler): ka+o deserves at least a warning. kaM, though, does not. Do we handle it? + TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.WYLIE_aVOWEL, preceding_added); + wowel = ""; } else { - // TODO(DLC)[EWTS->Tibetan]: test vowel stacking - if (wowel.indexOf(THDLWylieConstants.U_VOWEL) >= 0) { - TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.U_VOWEL, context_added); - } - if (wowel.indexOf(THDLWylieConstants.reverse_I_VOWEL) >= 0) { - TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.reverse_I_VOWEL, context_added); - } else if (wowel.indexOf(THDLWylieConstants.I_VOWEL) >= 0) { - TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.I_VOWEL, context_added); - } - if (wowel.indexOf(THDLWylieConstants.A_VOWEL) >= 0) { - TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.A_VOWEL, context_added); - } - if (wowel.indexOf(THDLWylieConstants.ai_VOWEL) >= 0) { - TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.ai_VOWEL, context_added); - } else if (wowel.indexOf(THDLWylieConstants.reverse_i_VOWEL) >= 0) { - TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.reverse_i_VOWEL, context_added); - } else if (wowel.indexOf(THDLWylieConstants.i_VOWEL) >= 0) { - TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.i_VOWEL, context_added); - } - if (wowel.indexOf(THDLWylieConstants.e_VOWEL) >= 0) { - TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.e_VOWEL, context_added); - } - if (wowel.indexOf(THDLWylieConstants.o_VOWEL) >= 0) { - TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.o_VOWEL, context_added); - } - if (wowel.indexOf(THDLWylieConstants.au_VOWEL) >= 0) { - TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.au_VOWEL, context_added); - } else if (wowel.indexOf(THDLWylieConstants.u_VOWEL) >= 0) { - TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.u_VOWEL, context_added); - } - if (wowel.indexOf("~X") >= 0) { // TODO(DLC)[EWTS->Tibetan]: introduce THDLWylieConstants.blah - duff.add(TibetanMachineWeb.getGlyph("~X")); - } else if (wowel.indexOf("X") >= 0) { // TODO(DLC)[EWTS->Tibetan]: introduce THDLWylieConstants.blah - duff.add(TibetanMachineWeb.getGlyph("X")); + // We call these combining because the TMW font treats + // such a vowel specially depending on the preceding glyph + // with which it combines. + String combining_wowels[] = new String[] { + // order does not matter + THDLWylieConstants.U_VOWEL, + THDLWylieConstants.reverse_I_VOWEL, + THDLWylieConstants.I_VOWEL, + THDLWylieConstants.A_VOWEL, + THDLWylieConstants.ai_VOWEL, + THDLWylieConstants.reverse_i_VOWEL, + THDLWylieConstants.i_VOWEL, + THDLWylieConstants.e_VOWEL, + THDLWylieConstants.o_VOWEL, + THDLWylieConstants.au_VOWEL, + THDLWylieConstants.u_VOWEL + }; + for (int i = 0; i < combining_wowels.length; i++) { + if (wowel.equals(combining_wowels[i])) { + TibTextUtils.getVowel(duff, preceding, combining_wowels[i], + preceding_added); + wowel = removeFirstMatch(wowel, combining_wowels[i]); + } } } // FIXME: Use TMW9.61, the "o'i" special combination, when appropriate. - if (wowel.indexOf(THDLWylieConstants.BINDU) >= 0 - // TODO(DLC)[EWTS->Tibetan]: This is really ugly... we - // rely on the fact that we know every Wylie wowel that - // contains 'M'. Let's, instead, parse the wowel. - && wowel.indexOf(THDLWylieConstants.U0F82) < 0 - && wowel.indexOf(THDLWylieConstants.U0F83) < 0) { + if (wowel.equals(THDLWylieConstants.BINDU)) { DuffCode last = null; - if (!context_added[0]) { + if (!preceding_added[0]) { last = preceding; } else if (duff.size() > 0) { last = (DuffCode)duff.get(duff.size() - 1); @@ -227,52 +276,77 @@ public final class EWTSTraits implements TTraits { // TODO(DLC)[EWTS->Tibetan]: is this okay???? when is a bindu okay to be alone??? } TibTextUtils.getBindu(duff, last); - context_added[0] = true; + preceding_added[0] = true; + wowel = removeFirstMatch(wowel, THDLWylieConstants.BINDU); } - if (!context_added[0]) { + + if (!preceding_added[0]) { duff.add(preceding); + preceding_added[0] = true; } - if (wowel.indexOf('H') >= 0) - duff.add(TibetanMachineWeb.getGlyph("H")); - int ix; - if ((ix = wowel.indexOf(THDLWylieConstants.WYLIE_TSA_PHRU)) >= 0) { + + String standalone_wowels[] = new String[] { + // order does not matter + // This likely won't look good! TMW has glyphs for [va] // and [fa], so use that transliteration if you care, not // [ph^] or [b^]. - duff.add(TibetanMachineWeb.getGlyph(THDLWylieConstants.WYLIE_TSA_PHRU)); - StringBuffer sb = new StringBuffer(wowel); - sb.replace(ix, ix + THDLWylieConstants.WYLIE_TSA_PHRU.length(), ""); - wowel = sb.toString(); + THDLWylieConstants.WYLIE_TSA_PHRU, + THDLWylieConstants.U0F35, + THDLWylieConstants.U0F37, + THDLWylieConstants.U0F7F, + THDLWylieConstants.U0F82, + THDLWylieConstants.U0F83, + THDLWylieConstants.U0F86, + THDLWylieConstants.U0F87, + THDLWylieConstants.U0F19, + THDLWylieConstants.U0F18, + THDLWylieConstants.U0FC6, + THDLWylieConstants.U0F3E, + THDLWylieConstants.U0F3F, + THDLWylieConstants.U0F84, + }; + for (int i = 0; i < standalone_wowels.length; i++) { + if (wowel.equals(standalone_wowels[i])) { + ThdlDebug.verify(preceding_added[0]); + duff.add(TibetanMachineWeb.getGlyph(standalone_wowels[i])); + wowel = removeFirstMatch(wowel, standalone_wowels[i]); + } } - if ((ix = wowel.indexOf(THDLWylieConstants.U0F82)) >= 0) { - duff.add(TibetanMachineWeb.getGlyph(THDLWylieConstants.U0F82)); - StringBuffer sb = new StringBuffer(wowel); - sb.replace(ix, ix + THDLWylieConstants.U0F82.length(), ""); - wowel = sb.toString(); - } - if ((ix = wowel.indexOf(THDLWylieConstants.U0F83)) >= 0) { - duff.add(TibetanMachineWeb.getGlyph(THDLWylieConstants.U0F83)); - StringBuffer sb = new StringBuffer(wowel); - sb.replace(ix, ix + THDLWylieConstants.U0F83.length(), ""); - wowel = sb.toString(); - } - - // TODO(DLC)[EWTS->Tibetan]: verify that no part of wowel is discarded! acip does that. 'jam~X I think we screw up, e.g. - // TODO(DLC)[EWTS->Tibetan]:: are bindus are screwed up in the unicode output? i see (with tmuni font) lone bindus without glyphs to stack on + // We verify that no part of wowel is discarded. + if (wowel.length() > 0) { + throw new ConversionException( + "Full wowel was not handled, there remains: " + wowel); + } + + // TODO(DLC)[EWTS->Tibetan]:: are bindus are screwed up in the + // unicode output? i see (with tmuni font) lone bindus + // without glyphs to stack on } public String getUnicodeForWowel(String wowel) { - if ("a".equals(wowel)) + if (THDLWylieConstants.WYLIE_aVOWEL.equals(wowel)) return ""; return helpGetUnicodeForWowel(wowel); } private String helpGetUnicodeForWowel(String wowel) { - if ("a".equals(wowel)) + if (THDLWylieConstants.WYLIE_aVOWEL.equals(wowel)) return null; // ko+a+e is invalid, e.g. - if (wowel.length() == 1 && isUnicodeWowel(wowel.charAt(0))) + if (wowel.length() == 1 && isUnicodeWowel(wowel.charAt(0))) { + if ("\u0f75".equals(wowel)) + return "\u0f71\u0f74"; // \u0f75 is discouraged + if ("\u0f81".equals(wowel)) + return "\u0f71\u0f80"; // \u0f81 is discouraged + if ("\u0f73".equals(wowel)) + return "\u0f71\u0f72"; // \u0f73 is discouraged + if ("\u0f79".equals(wowel)) + return "\u0fb3\u0f81"; // \u0f79 is discouraged + if ("\u0f78".equals(wowel)) + return "\u0fb3\u0f80"; // \u0f78 is discouraged return wowel; + } // handle o+u, etc. int i; if ((i = wowel.indexOf("+")) >= 0) { @@ -290,27 +364,27 @@ public final class EWTSTraits implements TTraits { } else { // Handle vowels. (TODO(dchandler): tibwn.ini has this // info, use that instead of duplicating it in this code.) - if ("i".equals(wowel)) return "\u0f72"; - if ("u".equals(wowel)) return "\u0f74"; - if ("A".equals(wowel)) return "\u0f71"; - if ("U".equals(wowel)) return "\u0f71\u0f74"; // \u0f75 is discouraged - if ("e".equals(wowel)) return "\u0f7a"; - if ("o".equals(wowel)) return "\u0f7c"; - if ("-i".equals(wowel)) return "\u0f80"; - if ("ai".equals(wowel)) return "\u0f7b"; - if ("au".equals(wowel)) return "\u0f7d"; - if ("-I".equals(wowel)) return "\u0f81"; - if ("I".equals(wowel)) return "\u0f71\u0f72"; // \u0f73 is discouraged + if (THDLWylieConstants.i_VOWEL.equals(wowel)) return "\u0f72"; + if (THDLWylieConstants.u_VOWEL.equals(wowel)) return "\u0f74"; + if (THDLWylieConstants.A_VOWEL.equals(wowel)) return "\u0f71"; + if (THDLWylieConstants.U_VOWEL.equals(wowel)) return "\u0f71\u0f74"; // \u0f75 is discouraged + if (THDLWylieConstants.e_VOWEL.equals(wowel)) return "\u0f7a"; + if (THDLWylieConstants.o_VOWEL.equals(wowel)) return "\u0f7c"; + if (THDLWylieConstants.reverse_i_VOWEL.equals(wowel)) return "\u0f80"; + if (THDLWylieConstants.ai_VOWEL.equals(wowel)) return "\u0f7b"; + if (THDLWylieConstants.au_VOWEL.equals(wowel)) return "\u0f7d"; + if (THDLWylieConstants.reverse_I_VOWEL.equals(wowel)) return "\u0f71\u0f80"; // \u0f81 is discouraged + if (THDLWylieConstants.I_VOWEL.equals(wowel)) return "\u0f71\u0f72"; // \u0f73 is discouraged - // TODO(DLC)[EWTS->Tibetan]: test, test, test. - if ("M".equals(wowel)) return "\u0f7e"; - if ("H".equals(wowel)) return "\u0f7f"; - if ("?".equals(wowel)) return "\u0f84"; - if ("~M".equals(wowel)) return "\u0f83"; - if ("~M`".equals(wowel)) return "\u0f82"; - if ("X".equals(wowel)) return "\u0f37"; - if ("~X".equals(wowel)) return "\u0f35"; - if ("^".equals(wowel)) return "\u0f39"; + // TODO(DLC)[EWTS->Tibetan]: what about \u0f3e and \u0f3f!!!! + if (THDLWylieConstants.BINDU.equals(wowel)) return "\u0f7e"; + if (THDLWylieConstants.U0F7F.equals(wowel)) return "\u0f7f"; + if (THDLWylieConstants.U0F84.equals(wowel)) return "\u0f84"; + if (THDLWylieConstants.U0F83.equals(wowel)) return "\u0f83"; + if (THDLWylieConstants.U0F82.equals(wowel)) return "\u0f82"; + if (THDLWylieConstants.U0F37.equals(wowel)) return "\u0f37"; + if (THDLWylieConstants.U0F35.equals(wowel)) return "\u0f35"; + if (THDLWylieConstants.WYLIE_TSA_PHRU.equals(wowel)) return "\u0f39"; return null; } @@ -324,9 +398,9 @@ public final class EWTSTraits implements TTraits { for (int i = 0; i < l.length(); i++) { char ch = l.charAt(i); if ((ch < '\u0f00' || ch > '\u0fff') - && SAUVASTIKA != ch - && SWASTIKA != ch - && (ch < PUA_MIN || ch > PUA_MAX) // TODO(DLC)[EWTS->Tibetan]: give a warning, though? PUA isn't specified by the unicode standard after all. + && THDLWylieConstants.SAUVASTIKA != ch + && THDLWylieConstants.SWASTIKA != ch + && (ch < THDLWylieConstants.PUA_MIN || ch > THDLWylieConstants.PUA_MAX) // TODO(DLC)[EWTS->Tibetan]: give a warning, though? PUA isn't specified by the unicode standard after all. && '\n' != ch && '\r' != ch) { // TODO(DLC)[EWTS->Tibetan]: Is this the place @@ -346,6 +420,8 @@ public final class EWTSTraits implements TTraits { if ("Y".equals(l)) return "\u0fbb"; if ("W".equals(l)) return "\u0fba"; + // TODO(dchandler): use tibwn.ini -- it has this same info. + // g+h etc. should not be inputs to this function, but for // completeness they're here. if ("k".equals(l)) return "\u0F90"; @@ -455,18 +531,24 @@ public final class EWTSTraits implements TTraits { public boolean vowelsMayStack() { return true; } public boolean isWowelThatRequiresAChen(String s) { - // TODO(DLC)[EWTS->Tibetan]: fix me! - return ((s.length() == 1 && (isUnicodeWowelThatRequiresAChen(s.charAt(0)) - || "?MHX^".indexOf(s.charAt(0)) >= 0)) - || "~X".equals(s) - || "~M".equals(s) - || "~M`".equals(s) - ); + // TODO(DLC)[EWTS->Tibetan]: not sure why we pick this subset. + // Why don't we use a negative set of regular vowels like "i", + // "o", etc.? + return ((s.length() == 1 + && (isUnicodeWowelThatRequiresAChen(s.charAt(0)))) + || THDLWylieConstants.BINDU.equals(s) + || THDLWylieConstants.U0F35.equals(s) + || THDLWylieConstants.U0F37.equals(s) + || THDLWylieConstants.U0F7F.equals(s) + || THDLWylieConstants.U0F82.equals(s) + || THDLWylieConstants.U0F83.equals(s) + || THDLWylieConstants.U0F84.equals(s) + || THDLWylieConstants.WYLIE_TSA_PHRU.equals(s)); } public boolean isUnicodeWowelThatRequiresAChen(char ch) { // TODO(DLC)[EWTS->Tibetan]: ask if 18 19 3e 3f combine only with digits - return "\u0f39\u0f35\u0f37\u0f18\u0f19\u0f3e\u0f3f\u0f86\u0f87\u0fc6".indexOf(ch) >= 0; + return ("\u0f39\u0f35\u0f37\u0f18\u0f19\u0f3e\u0f3f\u0f86\u0f87\u0fc6".indexOf(ch) >= 0); } public boolean couldBeValidStack(TPairList pl) { @@ -485,33 +567,9 @@ public final class EWTSTraits implements TTraits { public boolean stackingMustBeExplicit() { return true; } - public String U0F7F() { return "H"; } + public String U0F7F() { return THDLWylieConstants.U0F7F; } - public String U0F35() { return "~X"; } + public String U0F35() { return THDLWylieConstants.U0F35; } - public String U0F37() { return "X"; } - - /** The EWTS standard mentions this character specifically. See - http://www.symbols.com/encyclopedia/15/155.html to learn about - its meaning as relates to Buddhism. - */ - static final char SAUVASTIKA = '\u534d'; - - /** The EWTS standard mentions this character specifically. See - http://www.symbols.com/encyclopedia/15/151.html to learn about - its meaning as relates to Buddhism. - */ - static final char SWASTIKA = '\u5350'; - - /** EWTS has some glyphs not specified by Unicode in the - * private-use area (PUA). EWTS puts them in the range [PUA_MIN, - * PUA_MAX]. (Note that \uf042 is the highest in use as of July - * 2, 2005.) */ - static final char PUA_MIN = '\uf021'; - - /** EWTS has some glyphs not specified by Unicode in the - * private-use area (PUA). EWTS puts them in the range [PUA_MIN, - * PUA_MAX]. (Note that \uf042 is the highest in use as of July - * 2, 2005.) */ - static final char PUA_MAX = '\uf0ff'; + public String U0F37() { return THDLWylieConstants.U0F37; } } diff --git a/source/org/thdl/tib/text/ttt/EWTSTshegBarScanner.java b/source/org/thdl/tib/text/ttt/EWTSTshegBarScanner.java index f6201c7..123b812 100644 --- a/source/org/thdl/tib/text/ttt/EWTSTshegBarScanner.java +++ b/source/org/thdl/tib/text/ttt/EWTSTshegBarScanner.java @@ -21,6 +21,8 @@ package org.thdl.tib.text.ttt; import java.math.BigInteger; import java.util.ArrayList; +import org.thdl.tib.text.THDLWylieConstants; + /** * This singleton class is able to break up Strings of EWTS text (for * example, an entire sutra file) into tsheg bars, comments, etc. @@ -76,8 +78,11 @@ class EWTSTshegBarScanner extends TTshegBarScanner { StringBuffer sb = new StringBuffer(s); ExpandEscapeSequences(sb); int sl = sb.length(); - // TODO(DLC)[EWTS->Tibetan]:: '@#', in ewts->tmw, is not working - // TODO(DLC)[EWTS->Tibetan]:: 'jamX 'jam~X one is not working in ->tmw mode + // TODO(DLC)[EWTS->Tibetan]:: '@#', in ewts->tmw, is not working (probably because) + // TODO(DLC)[EWTS->Tibetan]:: '#', in ewts->tmw, is not working + // + // TODO(DLC)[EWTS->Tibetan]:: 'jamX one is not working in ewts->tmw mode in the sense that X appears under the last glyph of the three instead of the middle glyph + // // TODO(DLC)[EWTS->Tibetan]:: dzaHsogs is not working for (int i = 0; i < sl; i++) { // i is modified in the loop, also if (isValidInsideTshegBar(sb.charAt(i))) { @@ -102,14 +107,14 @@ class EWTSTshegBarScanner extends TTshegBarScanner { al.add(new TString("EWTS", "//", TString.TIBETAN_PUNCTUATION)); ++i; - } else if ((sb.charAt(i) >= EWTSTraits.PUA_MIN - && sb.charAt(i) <= EWTSTraits.PUA_MAX) + } else if ((sb.charAt(i) >= THDLWylieConstants.PUA_MIN + && sb.charAt(i) <= THDLWylieConstants.PUA_MAX) || (sb.charAt(i) >= '\u0f00' && sb.charAt(i) <= '\u0f17') || (sb.charAt(i) >= '\u0f1a' && sb.charAt(i) <= '\u0f1f') || (sb.charAt(i) >= '\u0fbe' && sb.charAt(i) <= '\u0fcc') || (sb.charAt(i) >= '\u0fcf' && sb.charAt(i) <= '\u0fd1') - || (EWTSTraits.SAUVASTIKA == sb.charAt(i)) - || (EWTSTraits.SWASTIKA == sb.charAt(i)) + || (THDLWylieConstants.SAUVASTIKA == sb.charAt(i)) + || (THDLWylieConstants.SWASTIKA == sb.charAt(i)) || (" /;|!:=_@#$%<>()*&\r\n\t\u0f36\u0f38\u0f89\u0f8a\u0f8b".indexOf(sb.charAt(i)) >= 0)) { al.add(new TString("EWTS", sb.substring(i, i+1), @@ -186,7 +191,31 @@ class EWTSTshegBarScanner extends TTshegBarScanner { // leave x == -1 } if (x >= 0) { - sb.replace(i, i + "\\uXXXX".length(), new String(new char[] { (char)x })); + String replacement = String.valueOf((char)x); + + if (false) { + // This would ruin EWTS->Unicode to + // help EWTS->TMW, so we don't do it. + // TODO(dchandler): Fix EWTS->TMW for + // \u0f02 and \u0f03. + + // A nasty little HACK for you: + // + // TODO(dchandler): we may create "ga..u~M`H..ha" which may cause errors + String hack = null; + if ('\u0f02' == x) { + hack = "u~M`H"; // hard-coded EWTS + } else if ('\u0f03' == x) { + hack = "u~M`:"; // hard-coded EWTS + } else if ('\u0f00' == x) { + hack = "oM"; // hard-coded EWTS + } + if (null != hack) { + replacement = "." + hack + "."; // hard-coded EWTS disambiguators + i += replacement.length() - 1; + } + } + sb.replace(i, i + "\\uXXXX".length(), replacement); continue; } } diff --git a/source/org/thdl/tib/text/ttt/TConverter.java b/source/org/thdl/tib/text/ttt/TConverter.java index a822c30..8bf797c 100644 --- a/source/org/thdl/tib/text/ttt/TConverter.java +++ b/source/org/thdl/tib/text/ttt/TConverter.java @@ -29,6 +29,7 @@ import java.util.ArrayList; import org.thdl.tib.text.DuffCode; import org.thdl.tib.text.TibetanDocument; import org.thdl.tib.text.TibetanMachineWeb; +import org.thdl.tib.text.THDLWylieConstants; import org.thdl.util.ThdlDebug; import org.thdl.util.ThdlOptions; @@ -699,7 +700,13 @@ public class TConverter { } else { String wy = ttraits.getEwtsForOther(s.getText()); if (null == wy) throw new Error("No wylie for ACIP " + s.getText()); - duff = new Object[] { TibetanMachineWeb.getGlyph(wy) }; + duff = new Object[] { TibetanMachineWeb.maybeGetGlyph(wy) }; + if (null == duff[0]) { + duff[0] = + ErrorsAndWarnings.getMessage( + 137, shortMessages, + s.getText(), ttraits); + } } } } @@ -730,8 +737,8 @@ public class TConverter { ThdlDebug.verify(1 == s.getText().length()); if (null != writer) { char ch = s.getText().charAt(0); - if (ch >= EWTSTraits.PUA_MIN - && ch <= EWTSTraits.PUA_MAX) { + if (ch >= THDLWylieConstants.PUA_MIN + && ch <= THDLWylieConstants.PUA_MAX) { hasErrors = true; String errorMessage = "[#ERROR " diff --git a/source/org/thdl/tib/text/ttt/TPairList.java b/source/org/thdl/tib/text/ttt/TPairList.java index 014804e..4431571 100644 --- a/source/org/thdl/tib/text/ttt/TPairList.java +++ b/source/org/thdl/tib/text/ttt/TPairList.java @@ -21,6 +21,7 @@ package org.thdl.tib.text.ttt; import java.util.ArrayList; import java.util.HashMap; +import org.thdl.tib.text.tshegbar.UnicodeUtils; import org.thdl.tib.text.TGCPair; import org.thdl.tib.text.TibetanMachineWeb; import org.thdl.util.ThdlDebug; @@ -710,47 +711,49 @@ class TPairList { wylieForConsonant.append(lastPair.getWylie(true, false)); String hashKey = wylieForConsonant.toString(); - // Because EWTS has special handling for full-formed - // subjoined consonants, we have special handling here. - if ("r+y".equals(hashKey)) - hashKey = "r+Y"; - else if ("y+y".equals(hashKey)) - hashKey = "y+Y"; - else if ("N+D+y".equals(hashKey)) - hashKey = "N+D+Y"; - else if ("N+D+r+y".equals(hashKey)) - hashKey = "N+D+R+y"; - else if ("k+Sh+r".equals(hashKey)) - hashKey = "k+Sh+R"; + if (traits.isACIP()) { + // Because EWTS has special handling for full-formed + // subjoined consonants, we have special handling here. + if ("r+y".equals(hashKey)) + hashKey = "r+Y"; + else if ("y+y".equals(hashKey)) + hashKey = "y+Y"; + else if ("N+D+y".equals(hashKey)) + hashKey = "N+D+Y"; + else if ("N+D+r+y".equals(hashKey)) + hashKey = "N+D+R+y"; + else if ("k+Sh+r".equals(hashKey)) + hashKey = "k+Sh+R"; - // TPair.getWylie(..) returns "W" sometimes when "w" is what - // really should be returned. ("V" always causes "w" to be - // returned, which is fine.) We'll change "W" to "w" here if - // we need to. We do it only for a few known stacks (the ones - // in TMW). - if ("W".equals(hashKey)) - hashKey = "w"; - else if ("W+y".equals(hashKey)) - hashKey = "w+y"; - else if ("W+r".equals(hashKey)) - hashKey = "w+r"; - else if ("W+n".equals(hashKey)) - hashKey = "w+n"; - else if ("W+W".equals(hashKey)) - hashKey = "w+W"; + // TPair.getWylie(..) returns "W" sometimes when "w" is what + // really should be returned. ("V" always causes "w" to be + // returned, which is fine.) We'll change "W" to "w" here if + // we need to. We do it only for a few known stacks (the ones + // in TMW). + if ("W".equals(hashKey)) + hashKey = "w"; + else if ("W+y".equals(hashKey)) + hashKey = "w+y"; + else if ("W+r".equals(hashKey)) + hashKey = "w+r"; + else if ("W+n".equals(hashKey)) + hashKey = "w+n"; + else if ("W+W".equals(hashKey)) + hashKey = "w+W"; - if ("r+Y".equals(hashKey) - || "r+W".equals(hashKey) - || "r+sh".equals(hashKey) - || "r+sh+y".equals(hashKey) - || "r+Sh".equals(hashKey) - || "r+Sh+N".equals(hashKey) - || "r+Sh+N+y".equals(hashKey) - || "r+Sh+m".equals(hashKey) - || "r+Sh+y".equals(hashKey) - || "r+s".equals(hashKey) - ) { - hashKey = "R" + hashKey.substring(1); // r+Y => R+Y, etc. + if ("r+Y".equals(hashKey) + || "r+W".equals(hashKey) + || "r+sh".equals(hashKey) + || "r+sh+y".equals(hashKey) + || "r+Sh".equals(hashKey) + || "r+Sh+N".equals(hashKey) + || "r+Sh+N+y".equals(hashKey) + || "r+Sh+m".equals(hashKey) + || "r+Sh+y".equals(hashKey) + || "r+s".equals(hashKey) + ) { + hashKey = "R" + hashKey.substring(1); // r+Y => R+Y, etc. + } } if (!TibetanMachineWeb.isKnownHashKey(hashKey)) { @@ -774,7 +777,7 @@ class TPairList { traits.getDuffForWowel(duffsAndErrors, TibetanMachineWeb.getGlyph(hashKey), lastPair.getRight()); - } catch (IllegalArgumentException e) { + } catch (ConversionException e) { // TODO(dchandler): Error 137 isn't the perfect // message. Try EWTS [RAM], e.g. to see why. It acts // like we're trying to find a single glyph for (R diff --git a/source/org/thdl/tib/text/ttt/TPairListFactory.java b/source/org/thdl/tib/text/ttt/TPairListFactory.java index 7ebcfae..e1a791d 100644 --- a/source/org/thdl/tib/text/ttt/TPairListFactory.java +++ b/source/org/thdl/tib/text/ttt/TPairListFactory.java @@ -20,7 +20,12 @@ Contributor(s): ______________________________________. package org.thdl.tib.text.ttt; +import java.util.Arrays; +import java.util.List; +import java.util.Comparator; + import org.thdl.tib.text.TibetanMachineWeb; +import org.thdl.tib.text.THDLWylieConstants; /** A factory for creating {@link TPairList TPairLists} from * Strings of ACIP. @@ -374,6 +379,85 @@ class TPairListFactory { return 0; } + /** Returns a TPair just like tp (sometimes the very same, + * unchanged instance) except that the wowel, if present, is in + * the order that Section 9.11 of the Unicode Standard, version + * 4.0.1, would have us use. */ + private static TPair ewtsSortWowels(TPair tp) { + if (tp.getRight() != null + && tp.getRight().length() > 0 + && !"+".equals(tp.getRight())) { + class WowelComparator implements Comparator { + /** @see + * org.thdl.tib.text.tshegbar.UnicodeUtils#fixSomeOrderingErrorsInTibetanUnicode(StringBuffer) */ + private List order = Arrays.asList(new String[] { + // equivalence class: + "\u0f39", THDLWylieConstants.WYLIE_TSA_PHRU, + + // equivalence class: + THDLWylieConstants.WYLIE_aVOWEL, + + // equivalence class: + "\u0f71", THDLWylieConstants.A_VOWEL, + "\u0f73", THDLWylieConstants.I_VOWEL, // TODO(dchandler): in a perfect world, we'd decompose and sort the components. + "\u0f75", THDLWylieConstants.U_VOWEL, // TODO(dchandler): in a perfect world, we'd decompose and sort the components. + "\u0f81", THDLWylieConstants.reverse_I_VOWEL, // TODO(dchandler): in a perfect world, we'd decompose and sort the components. + + "\u0f74", THDLWylieConstants.u_VOWEL, + + // equivalence class: + "\u0f72", THDLWylieConstants.i_VOWEL, + "\u0f7a", THDLWylieConstants.e_VOWEL, + "\u0f7b", THDLWylieConstants.ai_VOWEL, + "\u0f7c", THDLWylieConstants.o_VOWEL, + "\u0f7d", THDLWylieConstants.au_VOWEL, + "\u0f80", THDLWylieConstants.reverse_i_VOWEL, + + // equivalence class: + "\u0f7e", THDLWylieConstants.BINDU, + "\u0f82", THDLWylieConstants.U0F82, + "\u0f83", THDLWylieConstants.U0F83, + "\u0f86", THDLWylieConstants.U0F86, + "\u0f87", THDLWylieConstants.U0F87, + + // NOTE: we always say "e" comes before "o" but + // either order would work. + + /* TODO(dchandler): should these go with other + * under-line wowels like \u0f74? They're for the + * whole tsheg-bar, so they're oddballs... + * + * bestEwtsMap.put("\u0f35", THDLWylieConstants.U0F35); + * + * bestEwtsMap.put("\u0f37", THDLWylieConstants.U0F37); + * + * bestEwtsMap.put("\u0f84", THDLWylieConstants.U0F84); + * + * bestEwtsMap.put("\u0fc6", THDLWylieConstants.U0FC6); + */ + }); + public int compare(Object o1, Object o2) { + int i1 = order.indexOf(o1); + int i2 = order.indexOf(o2); + if (i1 < 0) i1 = order.size(); + if (i2 < 0) i2 = order.size(); + return i1 - i2; + } + } + String wowels[] = tp.getRight().split("\\+"); + java.util.Arrays.sort(wowels, new WowelComparator()); + StringBuffer sb = new StringBuffer(); + for (int i = 0; i < wowels.length; i++) { + sb.append(wowels[i]); + if (i + 1 < wowels.length) + sb.append('+'); + } + return new TPair(tp.getTraits(), tp.getLeft(), sb.toString()); + } else { + return tp; + } + } + // TODO(DLC)[EWTS->Tibetan]: doc private static TPairList breakHelperEWTS(String ewts, TTraits ttraits) { @@ -383,7 +467,9 @@ class TPairListFactory { StringBuffer ewtsBuf = new StringBuffer(ewts); int howMuchBuf[] = new int[1]; - TPair head = getFirstConsonantAndVowel(ewtsBuf, howMuchBuf, ttraits); + TPair head = ewtsSortWowels(getFirstConsonantAndVowel(ewtsBuf, + howMuchBuf, + ttraits)); int howMuch = howMuchBuf[0]; TPairList tail; @@ -448,7 +534,7 @@ class TPairListFactory { * {N+YE} or an error or whatever you like. howMuch[0] will be * set to the number of characters of tx that this call has * consumed. */ - private static TPair getFirstConsonantAndVowel(StringBuffer tx, // TODO(DLC)[EWTS->Tibetan]: function name needs ACIP in it? + private static TPair getFirstConsonantAndVowel(StringBuffer tx, int howMuch[], TTraits ttraits) { // To handle EWTS "phywa\\u0f84\u0f86" [yes that's two slashes diff --git a/source/org/thdl/tib/text/ttt/TString.java b/source/org/thdl/tib/text/ttt/TString.java index c84fd83..c7f51d8 100644 --- a/source/org/thdl/tib/text/ttt/TString.java +++ b/source/org/thdl/tib/text/ttt/TString.java @@ -21,6 +21,7 @@ package org.thdl.tib.text.ttt; import java.util.HashSet; import org.thdl.tib.text.tshegbar.UnicodeUtils; +import org.thdl.tib.text.THDLWylieConstants; import org.thdl.util.ThdlDebug; import org.thdl.util.ThdlOptions; @@ -66,8 +67,8 @@ public class TString { && type != END_SLASH && (type != UNICODE_CHARACTER || !(UnicodeUtils.isInTibetanRange(ch = getText().charAt(0)) - || (ch >= EWTSTraits.PUA_MIN - && ch <= EWTSTraits.PUA_MAX)))); + || (ch >= THDLWylieConstants.PUA_MIN + && ch <= THDLWylieConstants.PUA_MAX)))); } /** For ACIP [#COMMENTS] and EWTS (DLC FIXME: what are EWTS comments?) */ diff --git a/source/org/thdl/tib/text/ttt/TTraits.java b/source/org/thdl/tib/text/ttt/TTraits.java index ac2aee8..4386825 100644 --- a/source/org/thdl/tib/text/ttt/TTraits.java +++ b/source/org/thdl/tib/text/ttt/TTraits.java @@ -136,7 +136,8 @@ public interface TTraits { /** Gets the duffcodes for wowel, such that they look good with * the preceding glyph, and appends them to duff. */ - void getDuffForWowel(ArrayList duff, DuffCode preceding, String wowel); + void getDuffForWowel(ArrayList duff, DuffCode preceding, String wowel) + throws ConversionException; /** Human-readable name of this transliteration for short error strings. */ diff --git a/source/org/thdl/util/VerboseUnicodeDump.java b/source/org/thdl/util/VerboseUnicodeDump.java index b864a0a..0a8c318 100644 --- a/source/org/thdl/util/VerboseUnicodeDump.java +++ b/source/org/thdl/util/VerboseUnicodeDump.java @@ -43,7 +43,7 @@ public class VerboseUnicodeDump { java.nio.charset.Charset.forName(args[1])); int x; while (-1 != (x = fr.read())) { - System.out.println(org.thdl.tib.text.tshegbar.UnicodeUtils.unicodeCodepointToString((char)x, false, "")); + System.out.println(org.thdl.tib.text.tshegbar.UnicodeUtils.unicodeCodepointToString((char)x, false, "", false)); } System.exit(0); }