TMW->ACIP is much improved. V and W were confused, # and * were

confused; many glyphs that should have yielded errors were not. I've added a test case that transforms every TMW glyph save the one with no TM mapping to ACIP. I hand-checked that it was correct. ACIP->TMW is fixed for # and *. I never noticed it, but each needed an extra swoosh (U+0F05). Round-tripping would be good, as would testing real-world use of TMW->ACIP.
2004-04-14 05:44:51 +00:00 · 2004-04-14 05:44:51 +00:00 · 1bfd3772e6
commit 1bfd3772e6
parent 244a9d1370
10 changed files with 1110 additions and 85 deletions
--- a/source/org/thdl/tib/text/TGCPair.java
+++ b/source/org/thdl/tib/text/TGCPair.java
@ -112,17 +112,28 @@ public class TGCPair implements THDLWylieConstants {
    public String getACIP() {
        return getACIP(null);
    }
-    /** Like {@link #getWylie(String)} but for ACIP transliteration, not EWTS. */
+    /** Like {@link #getWylie(String)} but for ACIP transliteration,
+        not EWTS. */
    public String getACIP(String previousTranslitIfAppendaged) {
        // DLC FIXME: has the EWTS change affected Manipulate.acipToWylie?
        StringBuffer b = new StringBuffer();
        if (consonantWylie != null) {
            String consonantACIP
-                = org.thdl.tib.text.ttt.ACIPRules.getACIPForEWTS(consonantWylie);
+                = null;
+            if ("w".equals(consonantWylie)
+                && (SANSKRIT_WITHOUT_VOWEL == classification
+                    || SANSKRIT_WITH_VOWEL == classification))
+                consonantACIP = "V";
+            else
+                consonantACIP
+                    = org.thdl.tib.text.ttt.ACIPRules.getACIPForEWTS(consonantWylie);
            if (null == consonantACIP) {
-                return TibetanMachineWeb.getTMWToACIPErrorString("glyph with THDL Extended Wylie " + consonantWylie);
+                if (null != consonantWylie && consonantWylie.startsWith("R+"))
+                    return TibetanMachineWeb.getTMWToACIPErrorString("glyph with THDL Extended Wylie " + consonantWylie, " because the ACIP R+... could imply the short superscribed form, but this most likely intends the full form (i.e., Unicode character U+0F6A)");
+                return TibetanMachineWeb.getTMWToACIPErrorString("glyph with THDL Extended Wylie " + consonantWylie, "");
            } else {
-                // Think of pa'am...  we want 'am, not 'm; 'ang, not 'ng.  But we want 'ur, not 'uar, 'is, not 'ias.
+                // Think of pa'am...  we want 'am, not 'm; 'ang, not
+                // 'ng.  But we want 'ur, not 'uar, 'is, not 'ias.
                if (null != previousTranslitIfAppendaged
                    && "'".equals(previousTranslitIfAppendaged)) {
                    b.append("A");
@ -140,7 +151,7 @@ public class TGCPair implements THDLWylieConstants {
            String vowelACIP
                = org.thdl.tib.text.ttt.ACIPRules.getACIPForEWTS(vowelWylie);
            if (null == vowelACIP) {
-                return TibetanMachineWeb.getTMWToACIPErrorString("glyph with THDL Extended Wylie " + vowelWylie);
+                return TibetanMachineWeb.getTMWToACIPErrorString("glyph with THDL Extended Wylie " + vowelWylie, "");
            } else {
                b.append(vowelACIP);
            }
--- a/source/org/thdl/tib/text/TibTextUtils.java
+++ b/source/org/thdl/tib/text/TibTextUtils.java
@ -900,10 +900,13 @@ public class TibTextUtils implements THDLWylieConstants {
    /** Returns "a"/"A", unless wylie (which really is EWTS, not ACIP)
        is already "a". */
    private static String aVowelToUseAfter(boolean EWTSNotACIP, String wylie) {
-        if (wylie.equals(ACHEN))
-            return ""; // it's a, not aa, for achen alone.
-        else
-            return (EWTSNotACIP) ? WYLIE_aVOWEL : "A";
+        if (wylie.equals(ACHEN) && EWTSNotACIP) {
+            /* it's EWTS{a}, not EWTS{aa}, for achen alone. But it's
+               ACIP{AA}. */
+            return "";
+        } else
+            return ((EWTSNotACIP)
+                    ? WYLIE_aVOWEL : "A" /* hard-coded ACIP constant */);
    }

    private static String unambiguousPostAVowelTranslit(boolean EWTSNotACIP,
@ -929,7 +932,7 @@ public class TibTextUtils implements THDLWylieConstants {
 * EWTSNotACIP is true, or the ACIP otherwise.
 * @param EWTSNotACIP true if you want THDL Extended Wylie, false if
 * you want ACIP
-* @param dcs an array of glyphs
+* @param dcs an array of TMW glyphs
 * @param noSuch an array which will not be touched if this is
 * successful; however, if there is no THDL Extended Wylie/ACIP
 * corresponding to these glyphs, then noSuch[0] will be set to true
@ -959,9 +962,9 @@ public class TibTextUtils implements THDLWylieConstants {
    // DLC FIXME: {H}, U+0F7F, is part of a grapheme cluster!
    // David Chapman and I both need a comprehensive list of these
    // guys.  Get it from Unicode 4.0 spec?
-    /** Scans the glyphs in glyphList and creates the returned list of
-        grapheme clusters based on them.  A grapheme cluster is a
-        consonant or consonant stack with optional adornment or a
+    /** Scans the TMW glyphs in glyphList and creates the returned
+        list of grapheme clusters based on them.  A grapheme cluster
+        is a consonant or consonant stack with optional adornment or a
        number (possibly super- or subscribed) or some other glyph
        alone. */
    private static TGCList breakTshegBarIntoGraphemeClusters(java.util.List glyphList,
@ -986,7 +989,12 @@ public class TibTextUtils implements THDLWylieConstants {
            String wylie = TibetanMachineWeb.getWylieForGlyph(dc, noSuchWylie);
            boolean buildingUpSanskritNext = false;
            if ((buildingUpSanskritNext
-                 = TibetanMachineWeb.isWylieSanskritConsonantStack(wylie))
+                 = (TibetanMachineWeb.isWylieSanskritConsonantStack(wylie)
+                    ||
+                    /* U+0FAD, which should become ACIP "V", not "W",
+                       though the EWTS is "w" just as it is for
+                       TMW(fontNum==1).53: */
+                    (8 == dc.getFontNum() && 69 == dc.getCharNum())))
                || TibetanMachineWeb.isWylieTibetanConsonantOrConsonantStack(wylie)) {
                if (buildingUpVowel.length() > 0 || null != nonVowelWylie) {
                    gcs.add(new TGCPair(nonVowelWylie,
@ -1612,7 +1620,7 @@ public class TibTextUtils implements THDLWylieConstants {
        ArrayList glyphList = new ArrayList();
        StringBuffer translitBuffer = new StringBuffer();

-        // DLC FIXME: "    " should become " ", and test with ACIP # and *.
+        // DLC FIXME: "    " should become " " for ACIP
        for (int i=0; i<dcs.length; i++) {
            char ch = dcs[i].getCharacter();
            int k = dcs[i].getCharNum();
@ -1650,13 +1658,18 @@ public class TibTextUtils implements THDLWylieConstants {
                                                             ((i+1<dcs.length)
                                                              ? dcs[i+1]
                                                              : null),
+                                                             ((i+2<dcs.length)
+                                                              ? dcs[i+2]
+                                                              : null),
                                                             noSuch,
                                                             howManyConsumed);
                    if (howManyConsumed[0] == 1) {
                        // nothing to do
-                    } else {
-                        ThdlDebug.verify(howManyConsumed[0] == 2);
+                    } else if (howManyConsumed[0] == 2) {
                        ++i;
+                    } else {
+                        ThdlDebug.verify(howManyConsumed[0] == 3);
+                        ++i; ++i;
                    }
                }
                if (TibetanMachineWeb.isWyliePunc(wylie)
@ -1683,8 +1696,9 @@ public class TibTextUtils implements THDLWylieConstants {
                warnings.append("The stretch of Tibetan ended without final punctuation.");
        }

-        if (translitBuffer.length() > 0)
+        if (translitBuffer.length() > 0) {
            return translitBuffer.toString();
+        }
        else
            return null;
    }
--- a/source/org/thdl/tib/text/TibetanMachineWeb.java
+++ b/source/org/thdl/tib/text/TibetanMachineWeb.java
@ -966,9 +966,12 @@ public static boolean isWylieTibetanConsonantOrConsonantStack(String s) {
 }

 /**
-* Returns true if and only if s is the THDL Extended Wylie for a
-* Sanskrit multi-consonant stack.
-*/
+* Returns true if and only if s is necessarily the THDL Extended Wylie
+* for a Sanskrit (non-Tibetan, to be more correct) multi-consonant
+* stack.  If s is "w", then it might be the EWTS for TMW7.69, and that
+* glyph is only used in non-Tibetan stacks, but "w" also stands for
+* TMW.53, which is Tibetan, so this will return false for such a
+* glyph. */
 public static boolean isWylieSanskritConsonantStack(String s) {
    return sanskritStackSet.contains(s);
 }
@ -1909,11 +1912,18 @@ public static String wylieForGlyph(String hashKey) {
    return sb.toString();
 }

-    // DLC DOC
+/** Returns the ACIP transliteration for a glyph with hash key
+    hashKey, or returns null if there is none. */
 private static String acipForGlyph(String hashKey) {
-    String ACIP // DLC FIXME: test this.
-        = org.thdl.tib.scanner.Manipulate.wylieToAcip(hashKey);
-    return ACIP;
+    if (1 == hashKey.length()
+        // ~X is a special case because the EWTS is 2 characters in
+        // length
+        || "~X".equals(hashKey)) // hard-coded EWTS value
+        return org.thdl.tib.text.ttt.ACIPRules.getACIPForEWTS(hashKey);
+    else
+        // else we are not able to use it because it's not smart
+        // about stacks (e.g., W+W)
+        return org.thdl.tib.scanner.Manipulate.wylieToAcip(hashKey);
 }

 /** Error that appears in a document when some TMW cannot be
@ -1927,15 +1937,15 @@ private static String getTMWToWylieErrorString(DuffCode dc) {
 }

 /** Error that appears in a document when some TMW cannot be
- *  transcribed in ACIP.  This error message is
- *  documented in www/htdocs/TMW_RTF_TO_THDL_WYLIE.html (DLC NOT YET), so change
- *  them both when you change this. */
-static String getTMWToACIPErrorString(String it) {
-    return "[# JSKAD_TMW_TO_ACIP_ERROR_NO_SUCH_ACIP: Cannot convert " + it + " to ACIP.  Please transcribe this yourself.]";
+ *  transcribed in ACIP.  This error message is documented in
+ *  www/htdocs/TMW_or_TM_To_X_Converters.html, so change them both
+ *  when you change this. */
+static String getTMWToACIPErrorString(String it, String explanation) {
+    return "[# JSKAD_TMW_TO_ACIP_ERROR_NO_SUCH_ACIP: Cannot convert " + it + " to ACIP" + explanation + ".  Please transcribe this yourself.]";
 }

-private static String getTMWToACIPErrorString(DuffCode dc) {
-    return getTMWToACIPErrorString(dc.toString(true));
+private static String getTMWToACIPErrorString(DuffCode dc, String explanation) {
+    return getTMWToACIPErrorString(dc.toString(true), explanation);
 }

 /**
@ -1979,65 +1989,103 @@ public static String getWylieForGlyph(DuffCode dc, boolean noSuchWylie[]) {
 }

 /** Returns ACIP transliteration or an error message stating why no
-    ACIP transliteration exists for the sole glyph dc or the two
-    glyphs dc and optionalNextDC as a whole.  noSuchACIP[0] will be
-    set (to true) if and only if there is no ACIP representation for
-    dc; in that case, an error message is returned rather than valid
-    ACIP.  optionalNextDC should be null if there is no context
-    information available (such as if dc is the last DuffCode being
-    converted from TMW to ACIP) or the DuffCode following dc
-    otherwise.  If the ACIP (or error message) returned captures both
-    dc and the nonnull optionalNextDC, then howManyGlyphsUsed[0] will
-    be set to 2, otherwise it will be set to 1.
+    ACIP transliteration exists for one, two, or three TMW glyphs.
+    This gobbles up three TMW glyphs only when "#" is returned (for
+    the EWTS sequence "@", "#", "#"); two when "#" is returned for
+    "@#", "#" or when "*" is returned for "@", "#"; and one TMW glyph
+    otherwise.  The number gobbled is stored into
+    howManyGlyphsUsed[0].  Always pass in as many glyphs as possible.
+
+    <p>noSuchACIP[0] will be set (to true) if and only if there is no
+    ACIP representation; in that case, an error message is returned
+    rather than valid ACIP.  dc2 and/or dc3 should be null if there is
+    no context information available (i.e., if dc1 or dc2 is the last
+    DuffCode being converted from TMW to ACIP).  Otherwise, dc2 should
+    be the DuffCode following dc1 and dc3 should be the DuffCode
+    following dc2.  If the ACIP (or error message) returned captures
+    both dc1 and the (nonnull) dc2 and the (nonnull) dc3, then
+    howManyGlyphsUsed[0] will be set to 3.  If the ACIP (or error
+    message) returned captures both dc1 and the nonnull dc2, then
+    howManyGlyphsUsed[0] will be set to 2.  Otherwise it will be set
+    to 1.

    <p>This would be more straightforward if it were not the case that
    a TMW-&gt;ACIP conversion requires context information in the case
-    of U+0F04 and U+0F05.  Because it does, two DuffCodes, not one,
+    of U+0F04 and U+0F05.  Because it does, three DuffCodes, not one,
    must be passed in whenever possible.

-    <p>We opt to treat a lone U+0F05 as an error in TMW-&gt;ACIP
-    conversions rather than return the pseudo-ACIP Unicode character
-    escape for U+0F05.  After all, the conversion is TMW-&gt;ACIP, not
-    TMW-&gt;pseudo-ACIP.
+    <p>We opt to treat a lone U+0F05 or U+0F04 as an error in
+    TMW-&gt;ACIP conversions rather than return the pseudo-ACIP
+    Unicode character escape.  After all, the conversion is
+    TMW-&gt;ACIP, not TMW-&gt;pseudo-ACIP.

    @return error message or valid ACIP, never pseudo-ACIP like
    Unicode character escapes
-    @param dc the leftmost DuffCode if optionalNextDC is nonnull, or
-    the sole DuffCode
-    @param optionalNextDC null if dc is the last (rightmost) DuffCode
-    in the sequence, or the DuffCode following dc.  If you pass in dc
-    equal to the DuffCode for U+0F04, and optionalNextDC null, then
-    "*" will be returned, so don't leave this out unless dc is the
-    rightmost DuffCode.
+    @param dc1 the leftmost TMW DuffCode if dc2 is nonnull,
+    or the sole TMW DuffCode
+    @param dc2 null if dc1 is the last (rightmost) TMW DuffCode in the
+    sequence, or the TMW DuffCode following dc1.  If you pass in dc1
+    equal to the TMW DuffCode for U+0F04, and dc2 null, then "*" will
+    be returned, so don't leave this out unless dc1 is the rightmost
+    TMW DuffCode.
+    @param dc3 null if dc2 is null or is the last (rightmost) TMW
+    DuffCode in the sequence, or the TMW DuffCode following dc2
+    otherwise.
    @param noSuchACIP an array whose first element will be set to true
    if and only if an error message is returned instead of valid ACIP;
    the first element is never set to false, so nominally caller will
    initialize the first element to false
    @param howManyGlyphsUsed an array whose first element will be set
-    to 2 if valid ACIP that describes both dc and optionalNextDC is
-    returned, or 1 otherwise */
-public static String getACIPForGlyph(DuffCode dc,
-                                     DuffCode optionalNextDC,
+    to 3 if valid ACIP that describes dc1, dc2, and dc3 is returned, to
+    2 if valid ACIP that describes both dc1 and dc2 is returned, or to
+    1 otherwise */
+public static String getACIPForGlyph(DuffCode dc1,
+                                     DuffCode dc2,
+                                     DuffCode dc3,
                                     boolean noSuchACIP[],
                                     int howManyGlyphsUsed[]) {
-    String hashKey = getHashKeyForGlyph(dc);
+
+    // DLC FIXME: TMW.53 is probably going to come out all wrong (VA
+    // vs. WA) from this function, but
+    // ACIPRules.getACIPForEWTS(String) seems to come through... will
+    // it always?
+
+    String hashKey = getHashKeyForGlyph(dc1);
    if (null != hashKey && hashKey.equals("@")) { // hard-coded EWTS value
        String nextHashKey
-            = ((null == optionalNextDC)
-               ? null : getHashKeyForGlyph(optionalNextDC));
+            = ((null == dc2)
+               ? null : getHashKeyForGlyph(dc2));
        if (null != nextHashKey && nextHashKey.equals("#")) { // hard-coded EWTS value
+            String nextNextHashKey
+                = ((null == dc3)
+                   ? null : getHashKeyForGlyph(dc3));
+            if (null != nextNextHashKey && nextNextHashKey.equals("#")) { // hard-coded EWTS value
+                howManyGlyphsUsed[0] = 3;
+                return "#"; // hard-coded ACIP value
+            }
            howManyGlyphsUsed[0] = 2;
-            return "#"; // hard-coded ACIP value
-        } else {
-            howManyGlyphsUsed[0] = 1;
            return "*"; // hard-coded ACIP value
-        }
+        } // else fall through
    }
+    if (null != hashKey && hashKey.equals("@#")) { // hard-coded EWTS value
+        String nextHashKey
+            = ((null == dc2)
+               ? null : getHashKeyForGlyph(dc2));
+        if (null != nextHashKey && nextHashKey.equals("#")) { // hard-coded EWTS value
+            howManyGlyphsUsed[0] = 2; // not 3
+            return "#"; // hard-coded ACIP value
+        }
+        howManyGlyphsUsed[0] = 1; // not 2
+        return "*"; // hard-coded ACIP value
+    }
+
    howManyGlyphsUsed[0] = 1;
    String ans = (hashKey == null) ? null : acipForGlyph(hashKey);
-    if (hashKey == null || ans == null) {
+    if (null == ans) {
        noSuchACIP[0] = true;
-        return getTMWToACIPErrorString(dc);
+        if (null != hashKey && hashKey.startsWith("R+"))
+            return getTMWToACIPErrorString(dc1, " because the ACIP R+... could imply the short superscribed form, but this most likely intends the full form (i.e., Unicode character U+0F6A)");
+        return getTMWToACIPErrorString(dc1, "");
    }
    return ans;
 }
--- a/source/org/thdl/tib/text/tibwn.ini
+++ b/source/org/thdl/tib/text/tibwn.ini
@ -23,6 +23,8 @@
 // glyphs from TMW.  0F6A is not listed here (DLC FIXME: should it be?),
 // but the glyph for it is the glyph for 0F62.
 //
+// The EWTS is not a unique key -- see "r", for example.
+//
 // DuffPaneTest ensures that the na-ro column truly contains na-ros,
 // by the way.
 //
@ -70,7 +72,8 @@ __TILDE__X~102,5~~9,102~~~~~~~0F35
 // though, and we let it become U+0F7E when you convert TMW->Unicode.
 // That is, we treat them as interchangeable except for in TMW->TM
 // mappings, where [8,91] does not map to any TM glyph (though you
-// could argue that it should become what [8,90] becomes).
+// could argue that it should become what [8,90] becomes -- DLC
+// FIXME).
 M~~~8,91~~~~~~~0F7E
 __TILDE__M~241,1~~8,94~~~~~~~0F83

--- a/source/org/thdl/tib/text/ttt/ACIPConverter.java
+++ b/source/org/thdl/tib/text/ttt/ACIPConverter.java
@ -628,9 +628,16 @@ public class ACIPConverter {
                                        tdocLocation[0] += s.getText().length();
                                        continue; // FIXME: this means the unicode above doesn't go into the output if null != writer && null != tdoc?
                                    } else {
-                                        String wy = ACIPRules.getWylieForACIPOther(s.getText());
-                                        if (null == wy) throw new Error("No wylie for ACIP " + s.getText());
-                                        duff = new Object[] { TibetanMachineWeb.getGlyph(wy) };
+                                        if ("#".equals(s.getText())) { // hard-coded ACIP value
+                                            duff = new Object[] {
+                                                TibetanMachineWeb.getGlyph("@#"),
+                                                TibetanMachineWeb.getGlyph("#")
+                                            }; // hard-coded EWTS values
+                                        } else {
+                                            String wy = ACIPRules.getWylieForACIPOther(s.getText());
+                                            if (null == wy) throw new Error("No wylie for ACIP " + s.getText());
+                                            duff = new Object[] { TibetanMachineWeb.getGlyph(wy) };
+                                        }
                                    }
                                }
                            }
--- a/source/org/thdl/tib/text/ttt/ACIPRules.java
+++ b/source/org/thdl/tib/text/ttt/ACIPRules.java
@ -157,6 +157,9 @@ public class ACIPRules {
        getWylieForACIPOther(null);
        getWylieForACIPVowel(null);
        String ans = (String)wylieToACIP.get(EWTS);
+        boolean useCapitalW = false;
+        if (EWTS.startsWith("w"))
+            useCapitalW = true; // We want W+NA, not V+NA; we want WA, not VA.
        if (null == ans) {
            StringBuffer finalAns = new StringBuffer(EWTS.length());
            StringTokenizer sTok = new StringTokenizer(EWTS, "-+", true);
@ -182,9 +185,14 @@ public class ACIPRules {
                if (null == part) return null;
                finalAns.append(part);
            }
+            if (useCapitalW)
+                finalAns.setCharAt(0, 'W');
            return finalAns.toString();
        }
-        return ans;
+        if (useCapitalW)
+            return "W" + ans.substring(1);
+        else
+            return ans;
    }

    /** Registers acip->wylie mappings in toWylie; registers
@ -193,6 +201,12 @@ public class ACIPRules {
        toWylie.put(ACIP, EWTS);
        if (null == wylieToACIP) {
            wylieToACIP = new HashMap(75);
+
+            // We don't want to put "/" in toWylie:
+            wylieToACIP.put("(", "/");
+            wylieToACIP.put(")", "/");
+            wylieToACIP.put("?", "\\");
+
            wylieToACIP.put("_", " "); // oddball.
            wylieToACIP.put("o'i", "O'I"); // oddball for TMW9.61.
        }
@ -307,14 +321,20 @@ public class ACIPRules {
        if (acipOther2wylie == null) {
            acipOther2wylie = new HashMap(20);

+            // don't use putMapping for this.  We don't want TMW->ACIP
+            // to produce "." for a U+0F0C because ACIP doesn't say
+            // that "." means U+0F0C.  It just seems to in practice
+            // for ACIP Release IV texts.
+            acipOther2wylie.put(".", "*");
+
+            putMapping(acipOther2wylie, "m", "M");
+            putMapping(acipOther2wylie, ":", "H");
            putMapping(acipOther2wylie, ",", "/");
            putMapping(acipOther2wylie, " ", " ");
-            putMapping(acipOther2wylie, ".", "*");
-            putMapping(acipOther2wylie, "|", "|");
+            putMapping(acipOther2wylie, ";", "|");
            putMapping(acipOther2wylie, "`", "!");
-            putMapping(acipOther2wylie, ";", ";");
-            putMapping(acipOther2wylie, "*", "@");
-            putMapping(acipOther2wylie, "#", "@#");
+            putMapping(acipOther2wylie, "*", "@#");
+            // There is no glyph in TMW with the EWTS @##, so we don't do this: putMapping(acipOther2wylie, "#", "@##");
            putMapping(acipOther2wylie, "%", "~X");
            putMapping(acipOther2wylie, "o", "X");
            putMapping(acipOther2wylie, "&", "&");
--- a/source/org/thdl/tib/text/ttt/TParseTree.java
+++ b/source/org/thdl/tib/text/ttt/TParseTree.java
@ -359,6 +359,7 @@ class TParseTree {
                        }
                    }
                    if (stackSize > 1 && tp.getLeft() != null && tp.getLeft().length() > 1) {
+                        // DLC FIXME: gives a false positive warning for Rsh
                        hasAmbiguousConsonant = true;
                    }
                }