The ACIP {NYA%} is supported.

{NYAo} and {NYAx} are confusing to me because I don't know which glyphs o and x correspond to; for that reason, they cause ERRORs. The proposed THDL Extended Wylie ~X and X are now used for U+0F35 and U+0F37, respectively.
2003-09-07 16:19:50 +00:00 · 2003-09-07 16:19:50 +00:00 · 07e360d9a8
commit 07e360d9a8
parent f57cdda867
8 changed files with 96 additions and 32 deletions
--- a/source/org/thdl/tib/input/TMW_RTF_TO_THDL_WYLIETest.java
+++ b/source/org/thdl/tib/input/TMW_RTF_TO_THDL_WYLIETest.java
@@ -144,7 +144,7 @@ public class TMW_RTF_TO_THDL_WYLIETest extends TestCase {
    /** Tests the --to-wylie converter mode of {@link
     *  org.thdl.tib.input.TibetanConverter}. */
    public void testConverterMode() {
-        helper("--to-wylie", "Conversion", 44);
+        helper("--to-wylie", "Conversion", 0);
    }

    /** Tests the --to-tibetan-machine converter mode of {@link
--- a/source/org/thdl/tib/input/TMW_RTF_TO_THDL_WYLIETest1ResultConversion.expected
+++ b/source/org/thdl/tib/input/TMW_RTF_TO_THDL_WYLIETest1ResultConversion.expected
@@ -37,8 +37,8 @@ rgyal ba kun dngos mtsho skyes rdo rje bstan pa'i rtsa lag thams cad mkhyen pa z
 bka' drin gzugs can dbyig 'dzin lto 'dir shong 'gyur min na kun mkhyen srang las gang gis gzhal//\par
 \par
 li khri'i lcug phran mkhyen pa'i snang ba can//\par
-'jam mgon blo<<[[JSKAD_TMW_TO_WYLIE_ERROR_NO_SUCH_WYLIE: Cannot convert DuffCode <duffcode font=TibetanMachineWeb8 charNum=101 character=e/> to THDL Extended Wylie.  Please see the documentation for the TMW font and transcribe this yourself.]]>>. yi lang tsho baza<<[[JSKAD_TMW_TO_WYLIE_ERROR_NO_SUCH_WYLIE: Cannot convert DuffCode <duffcode font=TibetanMachineWeb8 charNum=101 character=e/> to THDL Extended Wylie.  Please see the documentation for the TMW font and transcribe this yourself.]]>>.nga po'i tshon//\par
-kha dog so sor bkra ba'i graga<<[[JSKAD_TMW_TO_WYLIE_ERROR_NO_SUCH_WYLIE: Cannot convert DuffCode <duffcode font=TibetanMachineWeb8 charNum=101 character=e/> to THDL Extended Wylie.  Please see the documentation for the TMW font and transcribe this yourself.]]>>.sa pa<<[[JSKAD_TMW_TO_WYLIE_ERROR_NO_SUCH_WYLIE: Cannot convert DuffCode <duffcode font=TibetanMachineWeb8 charNum=101 character=e/> to THDL Extended Wylie.  Please see the documentation for the TMW font and transcribe this yourself.]]>>.'i rgyan//\par
+'jam mgon bloX. yi lang tsho bazaX.nga po'i tshon//\par
+kha dog so sor bkra ba'i gragaX.sa paX.'i rgyan//\par
 phyogs bral rna lung 'god mkhas rtag tu rgyal//\f2\fs44\i0\b0\ul0\cf0\par
 \par
 \f1\fs28\i0\b0\ul0 dpal ldan chos kyi rang bzhin ngos yangs par//\par
--- a/source/org/thdl/tib/text/tibwn.ini
+++ b/source/org/thdl/tib/text/tibwn.ini
@@ -5,6 +5,10 @@
 //   - initial // marks a comment
 //   - blank lines should be ignored
 //   - <?x?> marks a command
+//
+// If you change the Wylie here, it can break the ACIP->TMW and
+// ACIP->Unicode conversion.  So keep ACIPRules in sync with this, and be
+// sure to run 'ant clean check' after your change.

 <?Input:Punctuation?>
 //_~32,1~0,32
@@ -34,6 +38,10 @@ $~38,5~~9,41~~~~~~~0F06
 // dbu.khang.g-yas: (If this changes, edit ACIPConverter)
 )~209,1~~9,94~~~~~~~0F3D
 H~239,1~~8,92~~~~~~~0F7F
+// mtshan.rtags:
+X~101,5~~9,101~~~~~~~0F37
+// mtshan.rtags zhes.sa:
+__TILDE__X~102,5~~9,102~~~~~~~0F35

 // 8,91 is the small bindu.  We say that this, and not 8,90 (large
 // anusvara) is the glyph that M yields.  This is because [8,90] is
@@ -971,10 +979,6 @@ r~176,4~~8,71~~~~~~~0FB2
 // mchan rtags leading:
 \tmw8100~100,5~~9,100~~~~~~~none

-// mtshan.rtags:
-\tmw8101~101,5~~9,101~~~~~~~0F37
-// mtshan.rtags zhes.sa:
-\tmw8102~102,5~~9,102~~~~~~~0F35
 // che.mgo:
 \tmw8103~103,5~~9,103~~~~~~~0F38
 // kuruka:
--- a/source/org/thdl/tib/text/ttt/ACIPConverter.java
+++ b/source/org/thdl/tib/text/ttt/ACIPConverter.java
@@ -186,10 +186,10 @@ public class ACIPConverter {
        ByteArrayOutputStream sw = new ByteArrayOutputStream();
        ArrayList al = ACIPTshegBarScanner.scan(acip, errors, -1);
        try {
-            if (null != al
-                && convertToUnicode(al, sw, errors,
-                                    warnings, writeWarningsToResult,
-                                    warningLevel)) {
+            if (null != al) {
+                convertToUnicode(al, sw, errors,
+                                 warnings, writeWarningsToResult,
+                                 warningLevel);
                return sw.toString("UTF-8");
            } else {
                return null;
@@ -282,6 +282,33 @@ public class ACIPConverter {
                String text = "[#ERROR CONVERTING ACIP DOCUMENT: Lexical error: " + s.getText() + "]";
                if (null != writer) writer.write(text);
                if (null != tdoc) tdoc.appendRoman(text, Color.RED);
+            } else if (stype == ACIPString.TSHEG_BAR_ADORNMENT) {
+                if (lastGuyWasNonPunct) {
+                    String err = "[#ERROR CONVERTING ACIP DOCUMENT: This converter cannot yet convert " + s.getText() + " because the converter's author is unclear what the result should be.]";
+                    if (null != writer) {
+                        String uni = ACIPRules.getUnicodeFor(s.getText(), false);
+                        if (null == uni) {
+                            hasErrors = true;
+                            uni = err;
+                        }
+                        if (null != writer) writer.write(uni);
+                    }
+                    if (null != tdoc) {
+                        String wylie
+                            = ACIPRules.getWylieForACIPOther(s.getText());
+                        if (null == wylie) {
+                            hasErrors = true;
+                            tdoc.appendRoman(err, Color.RED);
+                        } else {
+                            tdoc.appendDuffCodes(new DuffCode[] { TibetanMachineWeb.getGlyph(wylie) },
+                                                 Color.BLACK);
+                        }
+                    }
+                } else {
+                    hasErrors = true;
+                }
+                lastGuyWasNonPunct = true; // this stuff is not really punctuation
+                lastGuy = null;
            } else if (stype == ACIPString.WARNING) {
                lastGuyWasNonPunct = false;
                lastGuy = null;
@@ -408,10 +435,10 @@ public class ACIPConverter {
                                        && (lpl = lastGuy.get(lastGuy.size() - 1)).size() == 1
                                        && lpl.get(0).getLeft().equals("G")
                                        && // it's (G . anything)
-                                           // followed by some number
-                                           // of spaces (at least one,
-                                           // this one) and then a
-                                           // comma:
+                                        // followed by some number
+                                        // of spaces (at least one,
+                                        // this one) and then a
+                                        // comma:
                                        peekaheadFindsSpacesAndComma(scan, i+1))) {
                                    if (null != writer) {
                                        unicode = "    ";
--- a/source/org/thdl/tib/text/ttt/ACIPRules.java
+++ b/source/org/thdl/tib/text/ttt/ACIPRules.java
@@ -236,7 +236,7 @@ class ACIPRules {
            acipOther2wylie.put(";", ";");
            acipOther2wylie.put("*", "@");
            acipOther2wylie.put("#", "@#");
-            acipOther2wylie.put("%", "%");
+            acipOther2wylie.put("%", "~X");
            acipOther2wylie.put("&", "&");

            acipOther2wylie.put("0", "0");
--- a/source/org/thdl/tib/text/ttt/ACIPString.java
+++ b/source/org/thdl/tib/text/ttt/ACIPString.java
@@ -77,9 +77,11 @@ public class ACIPString {
    public static final int END_PAREN = 16;
    /** For things that may not be legal syntax, such as {KA . KHA} */
    public static final int WARNING = 17;
+    /** For ACIP %, o, and x */
+    public static final int TSHEG_BAR_ADORNMENT = 18;
    /** For things that are not legal syntax, such as a file that
     * contains just "[# HALF A COMMEN" */
-    public static final int ERROR = 18;
+    public static final int ERROR = 19;

    /** Returns true if and only if this string is Latin (usually
     *  English).  Returns false if this string is transliteration of
@@ -135,6 +137,7 @@ public class ACIPString {
        if (type == START_PAREN) typeString = "START_PAREN";
        if (type == END_PAREN) typeString = "END_PAREN";
        if (type == WARNING) typeString = "WARNING";
+        if (type == TSHEG_BAR_ADORNMENT) typeString = "TSHEG_BAR_ADORNMENT";
        if (type == ERROR) typeString = "ERROR";
        return typeString + ":{" + getText() + "}";
    }
--- a/source/org/thdl/tib/text/ttt/ACIPTshegBarScanner.java
+++ b/source/org/thdl/tib/text/ttt/ACIPTshegBarScanner.java
@@ -767,9 +767,16 @@ public class ACIPTshegBarScanner {
            case ';':
            case '`':
            case '#':
+            case '%':
+            case 'x':
+            case 'o':

+                boolean legalTshegBarAdornment = false;
                // The tsheg bar ends here; new token.
                if (startOfString < i) {
+                    if (currentType == ACIPString.TIBETAN_NON_PUNCTUATION
+                        && isTshegBarAdornment(ch))
+                        legalTshegBarAdornment = true;
                    al.add(new ACIPString(s.substring(startOfString, i),
                                          currentType));
                }
@@ -780,7 +787,8 @@ public class ACIPTshegBarScanner {
                if (('\r' == ch
                     || ('\n' == ch && i > 0 && s.charAt(i - 1) != '\r'))
                    && !al.isEmpty()
-                    && ((ACIPString)al.get(al.size() - 1)).getType() == ACIPString.TIBETAN_NON_PUNCTUATION) {
+                    && (((ACIPString)al.get(al.size() - 1)).getType() == ACIPString.TIBETAN_NON_PUNCTUATION
+                        || ((ACIPString)al.get(al.size() - 1)).getType() == ACIPString.TSHEG_BAR_ADORNMENT)) {
                    al.add(new ACIPString(" ", ACIPString.TIBETAN_PUNCTUATION));
                }

@@ -788,7 +796,8 @@ public class ACIPTshegBarScanner {
                if (('\r' == ch
                     || ('\n' == ch && i > 0 && s.charAt(i - 1) != '\r'))
                    && !al.isEmpty()
-                    && ((ACIPString)al.get(al.size() - 1)).getType() == ACIPString.TIBETAN_PUNCTUATION
+                    && (((ACIPString)al.get(al.size() - 1)).getType() == ACIPString.TIBETAN_PUNCTUATION
+                        || ((ACIPString)al.get(al.size() - 1)).getType() == ACIPString.TSHEG_BAR_ADORNMENT)
                    && ((ACIPString)al.get(al.size() - 1)).getText().equals(",")
                    && s.charAt(i-1) == ','
                    && (i + (('\r' == ch) ? 2 : 1) < sl
@@ -804,9 +813,17 @@ public class ACIPTshegBarScanner {
                    || (realNewline
                        = ((rn = ('\n' == ch && i >= 3 && s.charAt(i-3) == '\r' && s.charAt(i-2) == '\n' && s.charAt(i-1) == '\r'))
                           || ('\n' == ch && i >= 1 && s.charAt(i-1) == '\n')))) {
-                    for (int h = 0; h < (realNewline ? 2 : 1); h++)
-                        al.add(new ACIPString(rn ? s.substring(i - 1, i+1) : s.substring(i, i+1),
-                                              ACIPString.TIBETAN_PUNCTUATION));
+                    for (int h = 0; h < (realNewline ? 2 : 1); h++) {
+                        if (isTshegBarAdornment(ch) && !legalTshegBarAdornment) {
+                            al.add(new ACIPString("The ACIP " + ch + " must be glued to the end of a tsheg bar, but this one was not",
+                                                  ACIPString.ERROR));
+                        } else {
+                            al.add(new ACIPString(rn ? s.substring(i - 1, i+1) : s.substring(i, i+1),
+                                                  (legalTshegBarAdornment
+                                                   ? ACIPString.TSHEG_BAR_ADORNMENT
+                                                   : ACIPString.TIBETAN_PUNCTUATION)));
+                        }
+                    }
                }
                startOfString = i+1;
                currentType = ACIPString.ERROR;
@@ -910,15 +927,17 @@ public class ACIPTshegBarScanner {
        return ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n';
    }

+    /** See implementation. */
+    private static boolean isTshegBarAdornment(char ch) {
+        return (ch == '%' || ch == 'o' || ch == 'x');
+    }
+
    /** See implementation. */
    private static boolean isAlpha(char ch) {
        return ch == '\'' // 23rd consonant

            // combining punctuation, vowels:
-            || ch == '%'
-            || ch == 'o'
            || ch == 'm'
-            || ch == 'x'
            || ch == ':'
            || ch == '^'
            // DLC FIXME: we must treat this guy like a vowel, a special vowel that numerals can take on.  Until then, warn.            || ch == '\\'
--- a/source/org/thdl/tib/text/ttt/PackageTest.java
+++ b/source/org/thdl/tib/text/ttt/PackageTest.java
@@ -7171,7 +7171,13 @@ tstHelper("ZUR");
              "[TIBETAN_NON_PUNCTUATION:{K}, ERROR:{Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly.  Sorry!  Please do complain to the maintainers.}, TIBETAN_PUNCTUATION:{,}]");


-        shelp("MTHAR%", "", "[TIBETAN_NON_PUNCTUATION:{MTHAR%}]");
+        shelp("MTHAR%", "", "[TIBETAN_NON_PUNCTUATION:{MTHAR}, TSHEG_BAR_ADORNMENT:{%}]");
+        shelp("MTHARo", "", "[TIBETAN_NON_PUNCTUATION:{MTHAR}, TSHEG_BAR_ADORNMENT:{o}]");
+        shelp("MTHARx", "", "[TIBETAN_NON_PUNCTUATION:{MTHAR}, TSHEG_BAR_ADORNMENT:{x}]");
+
+        shelp("MTHAR\n%", "", "[TIBETAN_NON_PUNCTUATION:{MTHAR}, TIBETAN_PUNCTUATION:{ }, ERROR:{The ACIP % must be glued to the end of a tsheg bar, but this one was not}]");
+        shelp("MTHAR x", "", "[TIBETAN_NON_PUNCTUATION:{MTHAR}, TIBETAN_PUNCTUATION:{ }, ERROR:{The ACIP x must be glued to the end of a tsheg bar, but this one was not}]");
+
        shelp("PHYIR;", "", "[TIBETAN_NON_PUNCTUATION:{PHYIR}, TIBETAN_PUNCTUATION:{;}]");
        shelp("......,DAM ",
              "",
@@ -7254,6 +7260,10 @@ tstHelper("ZUR");
    }

    public void testACIPConversion() {
+        uhelp("KA%\nKHA", "\u0f40\u0f35\u0f0b\u0f41");
+        uhelp("KA%", "\u0f40\u0f35");
+        uhelp("KAo", "\u0f40[#ERROR CONVERTING ACIP DOCUMENT: This converter cannot yet convert o because the converter's author is unclear what the result should be.]");
+        uhelp("KAx", "\u0f40[#ERROR CONVERTING ACIP DOCUMENT: This converter cannot yet convert x because the converter's author is unclear what the result should be.]");
        uhelp("G+DHA", "\u0f42\u0fa2");
        uhelp("P'EE", "\u0f54\u0f71\u0f7b");

@@ -7284,13 +7294,11 @@ tstHelper("ZUR");

        uhelp("K'A:", "\u0f40\u0f71\u0f7f");

-        // DLC FIXME: in ACIP RTF files, (PARENTHESES) seem to make
-        // text go from 24-point to 18-point.  Thus, ACIP->Unicode.txt
-        // is fundamentally flawed, whereas ACIP->Unicode.rtf is OK.
-
        uhelp("/NY'EE/", "\u0f3C\u0f49\u0F71\u0F7B\u0f3D");
-        uhelp("*#HUm: G+DHOO GRO`;.,", "\u0f04\u0f05\u0f04\u0f05\u0f05\u0f67\u0f74\u0f7e\u0f7f\u0f0b\u0f42\u0fa2\u0f7d\u0f0b\u0f42\u0fb2\u0f7c\u0f08\u0f11\u0f0c\u0f0d");
-        uhelp("*#HUm: K+DHA GRO`;.,", "none");
+        uhelp("*#HUm: G+DHOO GRO`;.,",
+              "\u0f04\u0f05\u0f04\u0f05\u0f05\u0f67\u0f74\u0f7e\u0f7f\u0f0b\u0f42\u0fa2\u0f7d\u0f0b\u0f42\u0fb2\u0f7c\u0f08\u0f11\u0f0c\u0f0d");
+        uhelp("*#HUm: K+DHA GRO`;.,",
+              "\u0f04\u0f05\u0f04\u0f05\u0f05\u0f67\u0f74\u0f7e\u0f7f\u0f0b[#ERROR CONVERTING ACIP DOCUMENT: THE TSHEG BAR (\"SYLLABLE\") K+DHA IS ESSENTIALLY NOTHING.]\u0f0b\u0f42\u0fb2\u0f7c\u0f08\u0f11\u0f0c\u0f0d");
    }

    /** Tests some more tsheg bars, these from Dr. Lacey's critical
@@ -8861,6 +8869,9 @@ tstHelper("shKA");
 }
 /* DLC FIXME: add test cases: from R0021F.ACE: ambiguous Tibetan/Sanskrit:

+DLC NOW: warn, in "All" mode, about each occurrence of BD, DB, DG,
+DGR, DGY, DM, GD, GN, MN (but not B+D etc.)
+
 BDA'
 B+DA
 DBANG