diff --git a/source/org/thdl/tib/input/TMW_RTF_TO_THDL_WYLIETest.java b/source/org/thdl/tib/input/TMW_RTF_TO_THDL_WYLIETest.java index 7a0582e..704ff80 100644 --- a/source/org/thdl/tib/input/TMW_RTF_TO_THDL_WYLIETest.java +++ b/source/org/thdl/tib/input/TMW_RTF_TO_THDL_WYLIETest.java @@ -144,7 +144,7 @@ public class TMW_RTF_TO_THDL_WYLIETest extends TestCase { /** Tests the --to-wylie converter mode of {@link * org.thdl.tib.input.TibetanConverter}. */ public void testConverterMode() { - helper("--to-wylie", "Conversion", 44); + helper("--to-wylie", "Conversion", 0); } /** Tests the --to-tibetan-machine converter mode of {@link diff --git a/source/org/thdl/tib/input/TMW_RTF_TO_THDL_WYLIETest1ResultConversion.expected b/source/org/thdl/tib/input/TMW_RTF_TO_THDL_WYLIETest1ResultConversion.expected index be3a254..8ca0166 100644 --- a/source/org/thdl/tib/input/TMW_RTF_TO_THDL_WYLIETest1ResultConversion.expected +++ b/source/org/thdl/tib/input/TMW_RTF_TO_THDL_WYLIETest1ResultConversion.expected @@ -37,8 +37,8 @@ rgyal ba kun dngos mtsho skyes rdo rje bstan pa'i rtsa lag thams cad mkhyen pa z bka' drin gzugs can dbyig 'dzin lto 'dir shong 'gyur min na kun mkhyen srang las gang gis gzhal//\par \par li khri'i lcug phran mkhyen pa'i snang ba can//\par -'jam mgon blo<<[[JSKAD_TMW_TO_WYLIE_ERROR_NO_SUCH_WYLIE: Cannot convert DuffCode to THDL Extended Wylie. Please see the documentation for the TMW font and transcribe this yourself.]]>>. yi lang tsho baza<<[[JSKAD_TMW_TO_WYLIE_ERROR_NO_SUCH_WYLIE: Cannot convert DuffCode to THDL Extended Wylie. Please see the documentation for the TMW font and transcribe this yourself.]]>>.nga po'i tshon//\par -kha dog so sor bkra ba'i graga<<[[JSKAD_TMW_TO_WYLIE_ERROR_NO_SUCH_WYLIE: Cannot convert DuffCode to THDL Extended Wylie. Please see the documentation for the TMW font and transcribe this yourself.]]>>.sa pa<<[[JSKAD_TMW_TO_WYLIE_ERROR_NO_SUCH_WYLIE: Cannot convert DuffCode to THDL Extended Wylie. Please see the documentation for the TMW font and transcribe this yourself.]]>>.'i rgyan//\par +'jam mgon bloX. yi lang tsho bazaX.nga po'i tshon//\par +kha dog so sor bkra ba'i gragaX.sa paX.'i rgyan//\par phyogs bral rna lung 'god mkhas rtag tu rgyal//\f2\fs44\i0\b0\ul0\cf0\par \par \f1\fs28\i0\b0\ul0 dpal ldan chos kyi rang bzhin ngos yangs par//\par diff --git a/source/org/thdl/tib/text/tibwn.ini b/source/org/thdl/tib/text/tibwn.ini index bbeac41..297711d 100644 --- a/source/org/thdl/tib/text/tibwn.ini +++ b/source/org/thdl/tib/text/tibwn.ini @@ -5,6 +5,10 @@ // - initial // marks a comment // - blank lines should be ignored // - marks a command +// +// If you change the Wylie here, it can break the ACIP->TMW and +// ACIP->Unicode conversion. So keep ACIPRules in sync with this, and be +// sure to run 'ant clean check' after your change. //_~32,1~0,32 @@ -34,6 +38,10 @@ $~38,5~~9,41~~~~~~~0F06 // dbu.khang.g-yas: (If this changes, edit ACIPConverter) )~209,1~~9,94~~~~~~~0F3D H~239,1~~8,92~~~~~~~0F7F +// mtshan.rtags: +X~101,5~~9,101~~~~~~~0F37 +// mtshan.rtags zhes.sa: +__TILDE__X~102,5~~9,102~~~~~~~0F35 // 8,91 is the small bindu. We say that this, and not 8,90 (large // anusvara) is the glyph that M yields. This is because [8,90] is @@ -971,10 +979,6 @@ r~176,4~~8,71~~~~~~~0FB2 // mchan rtags leading: \tmw8100~100,5~~9,100~~~~~~~none -// mtshan.rtags: -\tmw8101~101,5~~9,101~~~~~~~0F37 -// mtshan.rtags zhes.sa: -\tmw8102~102,5~~9,102~~~~~~~0F35 // che.mgo: \tmw8103~103,5~~9,103~~~~~~~0F38 // kuruka: diff --git a/source/org/thdl/tib/text/ttt/ACIPConverter.java b/source/org/thdl/tib/text/ttt/ACIPConverter.java index bbdaae7..1966cd0 100644 --- a/source/org/thdl/tib/text/ttt/ACIPConverter.java +++ b/source/org/thdl/tib/text/ttt/ACIPConverter.java @@ -186,10 +186,10 @@ public class ACIPConverter { ByteArrayOutputStream sw = new ByteArrayOutputStream(); ArrayList al = ACIPTshegBarScanner.scan(acip, errors, -1); try { - if (null != al - && convertToUnicode(al, sw, errors, - warnings, writeWarningsToResult, - warningLevel)) { + if (null != al) { + convertToUnicode(al, sw, errors, + warnings, writeWarningsToResult, + warningLevel); return sw.toString("UTF-8"); } else { return null; @@ -282,6 +282,33 @@ public class ACIPConverter { String text = "[#ERROR CONVERTING ACIP DOCUMENT: Lexical error: " + s.getText() + "]"; if (null != writer) writer.write(text); if (null != tdoc) tdoc.appendRoman(text, Color.RED); + } else if (stype == ACIPString.TSHEG_BAR_ADORNMENT) { + if (lastGuyWasNonPunct) { + String err = "[#ERROR CONVERTING ACIP DOCUMENT: This converter cannot yet convert " + s.getText() + " because the converter's author is unclear what the result should be.]"; + if (null != writer) { + String uni = ACIPRules.getUnicodeFor(s.getText(), false); + if (null == uni) { + hasErrors = true; + uni = err; + } + if (null != writer) writer.write(uni); + } + if (null != tdoc) { + String wylie + = ACIPRules.getWylieForACIPOther(s.getText()); + if (null == wylie) { + hasErrors = true; + tdoc.appendRoman(err, Color.RED); + } else { + tdoc.appendDuffCodes(new DuffCode[] { TibetanMachineWeb.getGlyph(wylie) }, + Color.BLACK); + } + } + } else { + hasErrors = true; + } + lastGuyWasNonPunct = true; // this stuff is not really punctuation + lastGuy = null; } else if (stype == ACIPString.WARNING) { lastGuyWasNonPunct = false; lastGuy = null; @@ -408,10 +435,10 @@ public class ACIPConverter { && (lpl = lastGuy.get(lastGuy.size() - 1)).size() == 1 && lpl.get(0).getLeft().equals("G") && // it's (G . anything) - // followed by some number - // of spaces (at least one, - // this one) and then a - // comma: + // followed by some number + // of spaces (at least one, + // this one) and then a + // comma: peekaheadFindsSpacesAndComma(scan, i+1))) { if (null != writer) { unicode = " "; diff --git a/source/org/thdl/tib/text/ttt/ACIPRules.java b/source/org/thdl/tib/text/ttt/ACIPRules.java index acff4b6..78d8577 100644 --- a/source/org/thdl/tib/text/ttt/ACIPRules.java +++ b/source/org/thdl/tib/text/ttt/ACIPRules.java @@ -236,7 +236,7 @@ class ACIPRules { acipOther2wylie.put(";", ";"); acipOther2wylie.put("*", "@"); acipOther2wylie.put("#", "@#"); - acipOther2wylie.put("%", "%"); + acipOther2wylie.put("%", "~X"); acipOther2wylie.put("&", "&"); acipOther2wylie.put("0", "0"); diff --git a/source/org/thdl/tib/text/ttt/ACIPString.java b/source/org/thdl/tib/text/ttt/ACIPString.java index f5caf00..83a4f1e 100644 --- a/source/org/thdl/tib/text/ttt/ACIPString.java +++ b/source/org/thdl/tib/text/ttt/ACIPString.java @@ -77,9 +77,11 @@ public class ACIPString { public static final int END_PAREN = 16; /** For things that may not be legal syntax, such as {KA . KHA} */ public static final int WARNING = 17; + /** For ACIP %, o, and x */ + public static final int TSHEG_BAR_ADORNMENT = 18; /** For things that are not legal syntax, such as a file that * contains just "[# HALF A COMMEN" */ - public static final int ERROR = 18; + public static final int ERROR = 19; /** Returns true if and only if this string is Latin (usually * English). Returns false if this string is transliteration of @@ -135,6 +137,7 @@ public class ACIPString { if (type == START_PAREN) typeString = "START_PAREN"; if (type == END_PAREN) typeString = "END_PAREN"; if (type == WARNING) typeString = "WARNING"; + if (type == TSHEG_BAR_ADORNMENT) typeString = "TSHEG_BAR_ADORNMENT"; if (type == ERROR) typeString = "ERROR"; return typeString + ":{" + getText() + "}"; } diff --git a/source/org/thdl/tib/text/ttt/ACIPTshegBarScanner.java b/source/org/thdl/tib/text/ttt/ACIPTshegBarScanner.java index a8a5acd..1bdc019 100644 --- a/source/org/thdl/tib/text/ttt/ACIPTshegBarScanner.java +++ b/source/org/thdl/tib/text/ttt/ACIPTshegBarScanner.java @@ -767,9 +767,16 @@ public class ACIPTshegBarScanner { case ';': case '`': case '#': + case '%': + case 'x': + case 'o': + boolean legalTshegBarAdornment = false; // The tsheg bar ends here; new token. if (startOfString < i) { + if (currentType == ACIPString.TIBETAN_NON_PUNCTUATION + && isTshegBarAdornment(ch)) + legalTshegBarAdornment = true; al.add(new ACIPString(s.substring(startOfString, i), currentType)); } @@ -780,7 +787,8 @@ public class ACIPTshegBarScanner { if (('\r' == ch || ('\n' == ch && i > 0 && s.charAt(i - 1) != '\r')) && !al.isEmpty() - && ((ACIPString)al.get(al.size() - 1)).getType() == ACIPString.TIBETAN_NON_PUNCTUATION) { + && (((ACIPString)al.get(al.size() - 1)).getType() == ACIPString.TIBETAN_NON_PUNCTUATION + || ((ACIPString)al.get(al.size() - 1)).getType() == ACIPString.TSHEG_BAR_ADORNMENT)) { al.add(new ACIPString(" ", ACIPString.TIBETAN_PUNCTUATION)); } @@ -788,7 +796,8 @@ public class ACIPTshegBarScanner { if (('\r' == ch || ('\n' == ch && i > 0 && s.charAt(i - 1) != '\r')) && !al.isEmpty() - && ((ACIPString)al.get(al.size() - 1)).getType() == ACIPString.TIBETAN_PUNCTUATION + && (((ACIPString)al.get(al.size() - 1)).getType() == ACIPString.TIBETAN_PUNCTUATION + || ((ACIPString)al.get(al.size() - 1)).getType() == ACIPString.TSHEG_BAR_ADORNMENT) && ((ACIPString)al.get(al.size() - 1)).getText().equals(",") && s.charAt(i-1) == ',' && (i + (('\r' == ch) ? 2 : 1) < sl @@ -804,9 +813,17 @@ public class ACIPTshegBarScanner { || (realNewline = ((rn = ('\n' == ch && i >= 3 && s.charAt(i-3) == '\r' && s.charAt(i-2) == '\n' && s.charAt(i-1) == '\r')) || ('\n' == ch && i >= 1 && s.charAt(i-1) == '\n')))) { - for (int h = 0; h < (realNewline ? 2 : 1); h++) - al.add(new ACIPString(rn ? s.substring(i - 1, i+1) : s.substring(i, i+1), - ACIPString.TIBETAN_PUNCTUATION)); + for (int h = 0; h < (realNewline ? 2 : 1); h++) { + if (isTshegBarAdornment(ch) && !legalTshegBarAdornment) { + al.add(new ACIPString("The ACIP " + ch + " must be glued to the end of a tsheg bar, but this one was not", + ACIPString.ERROR)); + } else { + al.add(new ACIPString(rn ? s.substring(i - 1, i+1) : s.substring(i, i+1), + (legalTshegBarAdornment + ? ACIPString.TSHEG_BAR_ADORNMENT + : ACIPString.TIBETAN_PUNCTUATION))); + } + } } startOfString = i+1; currentType = ACIPString.ERROR; @@ -910,15 +927,17 @@ public class ACIPTshegBarScanner { return ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n'; } + /** See implementation. */ + private static boolean isTshegBarAdornment(char ch) { + return (ch == '%' || ch == 'o' || ch == 'x'); + } + /** See implementation. */ private static boolean isAlpha(char ch) { return ch == '\'' // 23rd consonant // combining punctuation, vowels: - || ch == '%' - || ch == 'o' || ch == 'm' - || ch == 'x' || ch == ':' || ch == '^' // DLC FIXME: we must treat this guy like a vowel, a special vowel that numerals can take on. Until then, warn. || ch == '\\' diff --git a/source/org/thdl/tib/text/ttt/PackageTest.java b/source/org/thdl/tib/text/ttt/PackageTest.java index af5c19e..c9989dd 100644 --- a/source/org/thdl/tib/text/ttt/PackageTest.java +++ b/source/org/thdl/tib/text/ttt/PackageTest.java @@ -7171,7 +7171,13 @@ tstHelper("ZUR"); "[TIBETAN_NON_PUNCTUATION:{K}, ERROR:{Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.}, TIBETAN_PUNCTUATION:{,}]"); - shelp("MTHAR%", "", "[TIBETAN_NON_PUNCTUATION:{MTHAR%}]"); + shelp("MTHAR%", "", "[TIBETAN_NON_PUNCTUATION:{MTHAR}, TSHEG_BAR_ADORNMENT:{%}]"); + shelp("MTHARo", "", "[TIBETAN_NON_PUNCTUATION:{MTHAR}, TSHEG_BAR_ADORNMENT:{o}]"); + shelp("MTHARx", "", "[TIBETAN_NON_PUNCTUATION:{MTHAR}, TSHEG_BAR_ADORNMENT:{x}]"); + + shelp("MTHAR\n%", "", "[TIBETAN_NON_PUNCTUATION:{MTHAR}, TIBETAN_PUNCTUATION:{ }, ERROR:{The ACIP % must be glued to the end of a tsheg bar, but this one was not}]"); + shelp("MTHAR x", "", "[TIBETAN_NON_PUNCTUATION:{MTHAR}, TIBETAN_PUNCTUATION:{ }, ERROR:{The ACIP x must be glued to the end of a tsheg bar, but this one was not}]"); + shelp("PHYIR;", "", "[TIBETAN_NON_PUNCTUATION:{PHYIR}, TIBETAN_PUNCTUATION:{;}]"); shelp("......,DAM ", "", @@ -7254,6 +7260,10 @@ tstHelper("ZUR"); } public void testACIPConversion() { + uhelp("KA%\nKHA", "\u0f40\u0f35\u0f0b\u0f41"); + uhelp("KA%", "\u0f40\u0f35"); + uhelp("KAo", "\u0f40[#ERROR CONVERTING ACIP DOCUMENT: This converter cannot yet convert o because the converter's author is unclear what the result should be.]"); + uhelp("KAx", "\u0f40[#ERROR CONVERTING ACIP DOCUMENT: This converter cannot yet convert x because the converter's author is unclear what the result should be.]"); uhelp("G+DHA", "\u0f42\u0fa2"); uhelp("P'EE", "\u0f54\u0f71\u0f7b"); @@ -7284,13 +7294,11 @@ tstHelper("ZUR"); uhelp("K'A:", "\u0f40\u0f71\u0f7f"); - // DLC FIXME: in ACIP RTF files, (PARENTHESES) seem to make - // text go from 24-point to 18-point. Thus, ACIP->Unicode.txt - // is fundamentally flawed, whereas ACIP->Unicode.rtf is OK. - uhelp("/NY'EE/", "\u0f3C\u0f49\u0F71\u0F7B\u0f3D"); - uhelp("*#HUm: G+DHOO GRO`;.,", "\u0f04\u0f05\u0f04\u0f05\u0f05\u0f67\u0f74\u0f7e\u0f7f\u0f0b\u0f42\u0fa2\u0f7d\u0f0b\u0f42\u0fb2\u0f7c\u0f08\u0f11\u0f0c\u0f0d"); - uhelp("*#HUm: K+DHA GRO`;.,", "none"); + uhelp("*#HUm: G+DHOO GRO`;.,", + "\u0f04\u0f05\u0f04\u0f05\u0f05\u0f67\u0f74\u0f7e\u0f7f\u0f0b\u0f42\u0fa2\u0f7d\u0f0b\u0f42\u0fb2\u0f7c\u0f08\u0f11\u0f0c\u0f0d"); + uhelp("*#HUm: K+DHA GRO`;.,", + "\u0f04\u0f05\u0f04\u0f05\u0f05\u0f67\u0f74\u0f7e\u0f7f\u0f0b[#ERROR CONVERTING ACIP DOCUMENT: THE TSHEG BAR (\"SYLLABLE\") K+DHA IS ESSENTIALLY NOTHING.]\u0f0b\u0f42\u0fb2\u0f7c\u0f08\u0f11\u0f0c\u0f0d"); } /** Tests some more tsheg bars, these from Dr. Lacey's critical @@ -8861,6 +8869,9 @@ tstHelper("shKA"); } /* DLC FIXME: add test cases: from R0021F.ACE: ambiguous Tibetan/Sanskrit: +DLC NOW: warn, in "All" mode, about each occurrence of BD, DB, DG, +DGR, DGY, DM, GD, GN, MN (but not B+D etc.) + BDA' B+DA DBANG