ACIP % {MTHAR%} and o {Ko} and ^ {^GONG SA} are now supported. A % always causes a warning.

2003-11-11 03:43:11 +00:00 · 2003-11-11 03:43:11 +00:00 · 4e6a9c299f
commit 4e6a9c299f
parent 2cb90bd231
4 changed files with 119 additions and 43 deletions
--- a/source/org/thdl/tib/text/ttt/ACIPConverter.java
+++ b/source/org/thdl/tib/text/ttt/ACIPConverter.java
@ -309,8 +309,7 @@ public class ACIPConverter {
            TString s = (TString)scan.get(i);
            int stype = s.getType();
            if (stype == TString.ERROR) {
-                lastGuyWasNonPunct = false;
-                lastGuy = null;
+                // leave lastGuyWasNonPunct and lastGuy alone; WARNINGs and ERRORs are invisible.
                hasErrors = true;
                String text = "[#ERROR CONVERTING ACIP DOCUMENT: Lexical error: " + s.getText() + "]";
                if (null != writer) writer.write(text);
@ -348,8 +347,7 @@ public class ACIPConverter {
                lastGuyWasNonPunct = true; // this stuff is not really punctuation
                lastGuy = null;
            } else if (stype == TString.WARNING) {
-                lastGuyWasNonPunct = false;
-                lastGuy = null;
+                // leave lastGuyWasNonPunct and lastGuy alone; WARNINGs and ERRORs are invisible.
                if (writeWarningsToOut) {
                    String text = "[#WARNING CONVERTING ACIP DOCUMENT: Lexical warning: " + s.getText() + "]";
                    if (null != writer) writer.write(text);
@ -646,3 +644,5 @@ public class ACIPConverter {
        }
    }
 }
+
+
--- a/source/org/thdl/tib/text/ttt/ACIPRules.java
+++ b/source/org/thdl/tib/text/ttt/ACIPRules.java
@ -296,6 +296,7 @@ public class ACIPRules {
            putMapping(acipOther2wylie, "*", "@");
            putMapping(acipOther2wylie, "#", "@#");
            putMapping(acipOther2wylie, "%", "~X");
+            putMapping(acipOther2wylie, "o", "X");
            putMapping(acipOther2wylie, "&", "&");

            putMapping(acipOther2wylie, "0", "0");
--- a/source/org/thdl/tib/text/ttt/ACIPTshegBarScanner.java
+++ b/source/org/thdl/tib/text/ttt/ACIPTshegBarScanner.java
@ -30,8 +30,12 @@ import org.thdl.util.ThdlDebug;
 * comments, and the like are segregated (so that consumers can ensure
 * that they remain in Latin), and Tibetan passages are broken up into
 * tsheg bars.
-* @author David Chandler
-*/
+*
+* <p><b>FIXME:</b> We should be handling {KA\n\nKHA} vs. {KA\nKHA} in
+* the parser, not here in the lexical analyzer.  That'd be cleaner,
+* and more like how you'd do things if you used lex and yacc.
+*
+* @author David Chandler */
 public class ACIPTshegBarScanner {
    /** Useful for testing.  Gives error messages on standard output
     *  about why we can't scan the document perfectly and exits with
@ -103,6 +107,19 @@ public class ACIPTshegBarScanner {
        return scan(s.toString(), errors, maxErrors);
    }

+    /** Helper.  Here because ACIP {MTHAR%\nKHA} should be treated the
+        same w.r.t. tsheg insertion regardless of the lex errors and
+        lex warnings found.
+
+        <p>Scans <code>al</code> backward from its last element,
+        skipping every element whose type is TString.WARNING or
+        TString.ERROR, and inspects the first element that is neither.
+
+        @param al the tokens scanned so far; every element is cast to
+        TString, so it must contain only TStrings
+        @return true if and only if such an element exists and its
+        type is TString.TIBETAN_NON_PUNCTUATION or
+        TString.TSHEG_BAR_ADORNMENT; false when <code>al</code> is
+        empty or holds nothing but warnings and errors */
+    private static boolean lastNonExceptionalThingWasNonPunctish(ArrayList al) {
+        int i = al.size() - 1; // index of the most recently added token (-1 if al is empty)
+        while (i >= 0 && (((TString)al.get(i)).getType() == TString.WARNING
+                          || ((TString)al.get(i)).getType() == TString.ERROR))
+            --i; // step over a warning/error token
+        return (i >= 0 && // FIXME: or maybe i < 0 || ... (i.e., answer true when only warnings/errors, or nothing, precede)
+                (((TString)al.get(i)).getType() == TString.TIBETAN_NON_PUNCTUATION
+                 || ((TString)al.get(i)).getType() == TString.TSHEG_BAR_ADORNMENT));
+    }
+
    /** Returns a list of {@link TString TStrings} corresponding
     *  to s, possibly the empty list (when the empty string is the
     *  input).  Each String is either a Latin comment, some Latin
@ -771,6 +788,7 @@ public class ACIPTshegBarScanner {
            case '%':
            case 'x':
            case 'o':
+            case '^':

                boolean legalTshegBarAdornment = false;
                // The tsheg bar ends here; new token.
@ -788,8 +806,7 @@ public class ACIPTshegBarScanner {
                if (('\r' == ch
                     || ('\n' == ch && i > 0 && s.charAt(i - 1) != '\r'))
                    && !al.isEmpty()
-                    && (((TString)al.get(al.size() - 1)).getType() == TString.TIBETAN_NON_PUNCTUATION
-                        || ((TString)al.get(al.size() - 1)).getType() == TString.TSHEG_BAR_ADORNMENT)) {
+                    && lastNonExceptionalThingWasNonPunctish(al)) {
                    al.add(new TString(" ", TString.TIBETAN_PUNCTUATION));
                }

@ -797,8 +814,7 @@ public class ACIPTshegBarScanner {
                if (('\r' == ch
                     || ('\n' == ch && i > 0 && s.charAt(i - 1) != '\r'))
                    && !al.isEmpty()
-                    && (((TString)al.get(al.size() - 1)).getType() == TString.TIBETAN_PUNCTUATION
-                        || ((TString)al.get(al.size() - 1)).getType() == TString.TSHEG_BAR_ADORNMENT)
+                    && lastNonExceptionalThingWasNonPunctish(al)
                    && ((TString)al.get(al.size() - 1)).getText().equals(",")
                    && s.charAt(i-1) == ','
                    && (i + (('\r' == ch) ? 2 : 1) < sl
@ -806,33 +822,61 @@ public class ACIPTshegBarScanner {
                    al.add(new TString(" ", TString.TIBETAN_PUNCTUATION));
                }

-                // Don't add in a "\r\n" or "\n" unless there's a
-                // blank line.
-                boolean rn = false;
-                boolean realNewline = false;
-                if (('\n' != ch && '\r' != ch)
-                    || (realNewline
-                        = ((rn = ('\n' == ch && i >= 3 && s.charAt(i-3) == '\r' && s.charAt(i-2) == '\n' && s.charAt(i-1) == '\r'))
-                           || ('\n' == ch && i >= 1 && s.charAt(i-1) == '\n')))) {
-                    for (int h = 0; h < (realNewline ? 2 : 1); h++) {
-                        if (isTshegBarAdornment(ch) && !legalTshegBarAdornment) {
-                            al.add(new TString("The ACIP " + ch + " must be glued to the end of a tsheg bar, but this one was not",
-                                               TString.ERROR));
-                        } else {
-                            al.add(new TString(rn ? s.substring(i - 1, i+1) : s.substring(i, i+1),
-                                               (legalTshegBarAdornment
-                                                ? TString.TSHEG_BAR_ADORNMENT
-                                                : TString.TIBETAN_PUNCTUATION)));
+                if ('^' == ch) {
+                    // "^ GONG SA" is the same as "^GONG SA" or
+                    // "^\r\nGONG SA".  But "^\n\nGONG SA" is
+                    // different -- that has a true line break in the
+                    // output between ^ and GONG.  We give an error if
+                    // ^ isn't followed by an alphabetical character.
+                    
+                    boolean bad = false;
+                    if (i + 1 < sl && isAlpha(s.charAt(i+1))) {
+                        // leave i alone
+                    } else if (i + 2 < sl && (' ' == s.charAt(i+1)
+                                              || '\r' == s.charAt(i+1)
+                                              || '\n' == s.charAt(i+1))
+                               && isAlpha(s.charAt(i+2))) {
+                        ++i;
+                    } else if (i + 3 < sl && '\r' == s.charAt(i+1)
+                               && '\n' == s.charAt(i+2)
+                               && isAlpha(s.charAt(i+3))) {
+                        i += 2;
+                    } else {
+                        bad = true;
+                    }
+                    if (!bad)
+                        al.add(new TString("^", TString.TIBETAN_PUNCTUATION));
+                    else
+                        al.add(new TString("The ACIP {^} must precede a tsheg bar.", TString.ERROR));
+                } else {
+                    // Don't add in a "\r\n" or "\n" unless there's a
+                    // blank line.
+                    boolean rn = false;
+                    boolean realNewline = false;
+                    if (('\n' != ch && '\r' != ch)
+                        || (realNewline
+                            = ((rn = ('\n' == ch && i >= 3 && s.charAt(i-3) == '\r' && s.charAt(i-2) == '\n' && s.charAt(i-1) == '\r'))
+                               || ('\n' == ch && i >= 1 && s.charAt(i-1) == '\n')))) {
+                        for (int h = 0; h < (realNewline ? 2 : 1); h++) {
+                            if (isTshegBarAdornment(ch) && !legalTshegBarAdornment) {
+                                al.add(new TString("The ACIP " + ch + " must be glued to the end of a tsheg bar, but this one was not",
+                                                   TString.ERROR));
+                            } else {
+                                al.add(new TString(rn ? s.substring(i - 1, i+1) : s.substring(i, i+1),
+                                                   (legalTshegBarAdornment
+                                                    ? TString.TSHEG_BAR_ADORNMENT
+                                                    : TString.TIBETAN_PUNCTUATION)));
+                            }
                        }
                    }
-                }
-                if ('%' == ch) {
-                    al.add(new TString("The ACIP {%} is treated by this converter as U+0F35, but sometimes might represent U+0F14 in practice",
-                                       TString.WARNING));
+                    if ('%' == ch) {
+                        al.add(new TString("The ACIP {%} is treated by this converter as U+0F35, but sometimes might represent U+0F14 in practice",
+                                           TString.WARNING));
+                    }
                }
                startOfString = i+1;
                currentType = TString.ERROR;
-                break; // end TIBETAN_PUNCTUATION case
+                break; // end TIBETAN_PUNCTUATION | TSHEG_BAR_ADORNMENT case

            default:
                if (!bracketTypeStack.empty()) {
@ -843,8 +887,9 @@ public class ACIPTshegBarScanner {
                    }
                }
                if (i+1 == sl && 26 == (int)ch)
-                    // Silently allow the last character to be ^Z,
-                    // which just marks end of file.
+                    // Silently allow the last character to be
+                    // control-Z (sometimes printed as ^Z), which just
+                    // marks end of file.
                    break;
                if (!(isNumeric(ch) || isAlpha(ch))) {
                    if (startOfString < i) {
@ -935,6 +980,7 @@ public class ACIPTshegBarScanner {
    /** Returns true if and only if ch is one of the ACIP tsheg bar post-adornment characters '%', 'o', or 'x'. */
    private static boolean isTshegBarAdornment(char ch) {
        return (ch == '%' || ch == 'o' || ch == 'x');
+        // '^' is deliberately not listed: ^ is a pre-adornment; these are post-adornments.
    }

    /** See implementation. */
@ -944,7 +990,6 @@ public class ACIPTshegBarScanner {
            // combining punctuation, vowels:
            || ch == 'm'
            || ch == ':'
-            || ch == '^'
            // FIXME: we must treat this guy like a vowel, a special vowel that numerals can take on.  Until then, warn.  See bug 838588          || ch == '\\'

            || ch == '-'
--- a/source/org/thdl/tib/text/ttt/PackageTest.java
+++ b/source/org/thdl/tib/text/ttt/PackageTest.java
@ -7206,10 +7206,30 @@ tstHelper("ZUR");
              "[TIBETAN_NON_PUNCTUATION:{PAS}, TIBETAN_PUNCTUATION:{.}, TIBETAN_PUNCTUATION:{.}, TIBETAN_PUNCTUATION:{.}, WARNING:{A non-breaking tsheg, '.', appeared, but not like \"...,\" or \".,\" or \".dA\" or \".DA\".}, TIBETAN_PUNCTUATION:{ }, TIBETAN_NON_PUNCTUATION:{LA}]");
        shelp("^GONG SA,",
              "",
-              "[TIBETAN_NON_PUNCTUATION:{^GONG}, TIBETAN_PUNCTUATION:{ }, TIBETAN_NON_PUNCTUATION:{SA}, TIBETAN_PUNCTUATION:{,}]");
+              "[TIBETAN_PUNCTUATION:{^}, TIBETAN_NON_PUNCTUATION:{GONG}, TIBETAN_PUNCTUATION:{ }, TIBETAN_NON_PUNCTUATION:{SA}, TIBETAN_PUNCTUATION:{,}]");
        shelp("^ GONG SA,",
              "",
-              "[TIBETAN_NON_PUNCTUATION:{^}, TIBETAN_PUNCTUATION:{ }, TIBETAN_NON_PUNCTUATION:{GONG}, TIBETAN_PUNCTUATION:{ }, TIBETAN_NON_PUNCTUATION:{SA}, TIBETAN_PUNCTUATION:{,}]");
+              "[TIBETAN_PUNCTUATION:{^}, TIBETAN_NON_PUNCTUATION:{GONG}, TIBETAN_PUNCTUATION:{ }, TIBETAN_NON_PUNCTUATION:{SA}, TIBETAN_PUNCTUATION:{,}]");
+        shelp("^\nGONG SA,",
+              "",
+              "[TIBETAN_PUNCTUATION:{^}, TIBETAN_NON_PUNCTUATION:{GONG}, TIBETAN_PUNCTUATION:{ }, TIBETAN_NON_PUNCTUATION:{SA}, TIBETAN_PUNCTUATION:{,}]");
+        shelp("^\rGONG SA,",
+              "",
+              "[TIBETAN_PUNCTUATION:{^}, TIBETAN_NON_PUNCTUATION:{GONG}, TIBETAN_PUNCTUATION:{ }, TIBETAN_NON_PUNCTUATION:{SA}, TIBETAN_PUNCTUATION:{,}]");
+        shelp("^\r\nGONG SA,",
+              "",
+              "[TIBETAN_PUNCTUATION:{^}, TIBETAN_NON_PUNCTUATION:{GONG}, TIBETAN_PUNCTUATION:{ }, TIBETAN_NON_PUNCTUATION:{SA}, TIBETAN_PUNCTUATION:{,}]");
+
+        // FIXME: what should this be?  We treat {^ GONG SA} (one
+        // space) like {^GONG SA}, but {^  GONG SA} (two spaces) isn't
+        // so obvious.  We give an error.
+        shelp("^  GONG SA,",
+              "",
+              "[ERROR:{The ACIP {^} must precede a tsheg bar.}, TIBETAN_PUNCTUATION:{ }, TIBETAN_PUNCTUATION:{ }, TIBETAN_NON_PUNCTUATION:{GONG}, TIBETAN_PUNCTUATION:{ }, TIBETAN_NON_PUNCTUATION:{SA}, TIBETAN_PUNCTUATION:{,}]");
+
+        shelp("^\n\nGONG SA,",
+              "",
+              "[ERROR:{The ACIP {^} must precede a tsheg bar.}, TIBETAN_PUNCTUATION:{\n}, TIBETAN_PUNCTUATION:{\n}, TIBETAN_NON_PUNCTUATION:{GONG}, TIBETAN_PUNCTUATION:{ }, TIBETAN_NON_PUNCTUATION:{SA}, TIBETAN_PUNCTUATION:{,}]");
        shelp("", "", "[]");
        shelp("[DD]", "");
        shelp("[",
@ -7280,11 +7300,11 @@ tstHelper("ZUR");
              "[TIBETAN_NON_PUNCTUATION:{K}, ERROR:{Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly.  Sorry!  Please do complain to the maintainers.}, TIBETAN_PUNCTUATION:{,}]");


-        shelp("MTHAR%", "", "[TIBETAN_NON_PUNCTUATION:{MTHAR}, TSHEG_BAR_ADORNMENT:{%}]");
+        shelp("MTHAR%", "", "[TIBETAN_NON_PUNCTUATION:{MTHAR}, TSHEG_BAR_ADORNMENT:{%}, WARNING:{The ACIP {%} is treated by this converter as U+0F35, but sometimes might represent U+0F14 in practice}]");
        shelp("MTHARo", "", "[TIBETAN_NON_PUNCTUATION:{MTHAR}, TSHEG_BAR_ADORNMENT:{o}]");
        shelp("MTHARx", "", "[TIBETAN_NON_PUNCTUATION:{MTHAR}, TSHEG_BAR_ADORNMENT:{x}]");

-        shelp("MTHAR\n%", "", "[TIBETAN_NON_PUNCTUATION:{MTHAR}, TIBETAN_PUNCTUATION:{ }, ERROR:{The ACIP % must be glued to the end of a tsheg bar, but this one was not}]");
+        shelp("MTHAR\n%", "", "[TIBETAN_NON_PUNCTUATION:{MTHAR}, TIBETAN_PUNCTUATION:{ }, ERROR:{The ACIP % must be glued to the end of a tsheg bar, but this one was not}, WARNING:{The ACIP {%} is treated by this converter as U+0F35, but sometimes might represent U+0F14 in practice}]");
        shelp("MTHAR x", "", "[TIBETAN_NON_PUNCTUATION:{MTHAR}, TIBETAN_PUNCTUATION:{ }, ERROR:{The ACIP x must be glued to the end of a tsheg bar, but this one was not}]");

        shelp("PHYIR;", "", "[TIBETAN_NON_PUNCTUATION:{PHYIR}, TIBETAN_PUNCTUATION:{;}]");
@ -7397,6 +7417,12 @@ MNA'
 M+NA
 */
        uhelp("BNA", "[#WARNING CONVERTING ACIP DOCUMENT: Warning: We're going with {B+NA}, but only because our knowledge of prefix rules says that {B}{NA} is not a legal Tibetan tsheg bar (\"syllable\")]\u0f56\u0fa3");
+        uhelp("^GONG SA", "\u0f38\u0f42\u0f7c\u0f44\u0f0b\u0f66");
+        uhelp("^ GONG SA", "\u0f38\u0f42\u0f7c\u0f44\u0f0b\u0f66");
+        uhelp("^\rGONG SA", "\u0f38\u0f42\u0f7c\u0f44\u0f0b\u0f66");
+        uhelp("^\r\nGONG SA", "\u0f38\u0f42\u0f7c\u0f44\u0f0b\u0f66");
+        uhelp("^\nGONG SA", "\u0f38\u0f42\u0f7c\u0f44\u0f0b\u0f66");
+        uhelp("^  GONG SA", "[#ERROR CONVERTING ACIP DOCUMENT: Lexical error: The ACIP {^} must precede a tsheg bar.]        \u0f42\u0f7c\u0f44\u0f0b\u0f66");
        uhelp("BGLA", "\u0f56\u0f42\u0fb3");
        uhelp("BLCAG", "\u0f56\u0f63\u0f95\u0f42");
        uhelp("DBA", "[#WARNING CONVERTING ACIP DOCUMENT: The ACIP DBA has been interpreted as two stacks, not one, but you may wish to confirm that the original text had two stacks as it would be an easy mistake to make to see one stack and forget to input it with '+' characters.]\u0f51\u0f56");
@ -7411,7 +7437,7 @@ M+NA
    public void testACIPConversion() {
        uhelp("x", "[#ERROR CONVERTING ACIP DOCUMENT: Lexical error: The ACIP x must be glued to the end of a tsheg bar, but this one was not]");
        uhelp("o", "[#ERROR CONVERTING ACIP DOCUMENT: Lexical error: The ACIP o must be glued to the end of a tsheg bar, but this one was not]");
-        uhelp("%", "[#ERROR CONVERTING ACIP DOCUMENT: Lexical error: The ACIP % must be glued to the end of a tsheg bar, but this one was not]");
+        uhelp("%", "[#ERROR CONVERTING ACIP DOCUMENT: Lexical error: The ACIP % must be glued to the end of a tsheg bar, but this one was not][#WARNING CONVERTING ACIP DOCUMENT: Lexical warning: The ACIP {%} is treated by this converter as U+0F35, but sometimes might represent U+0F14 in practice]");
        uhelp(":", "[#ERROR CONVERTING ACIP DOCUMENT: THE TSHEG BAR (\"SYLLABLE\") : HAS THESE ERRORS: Cannot convert ACIP : because : is not an ACIP consonant]");
        uhelp("m", "[#ERROR CONVERTING ACIP DOCUMENT: THE TSHEG BAR (\"SYLLABLE\") m HAS THESE ERRORS: Cannot convert ACIP m because m is not an ACIP consonant]");

@ -7432,9 +7458,13 @@ M+NA

        uhelp("THAG PA", "\u0f50\u0f42\u0f0b\u0f54");
        uhelp("KA \nKHA\n\nGA", "\u0f40\u0f0b\u0f41\u0f0b\n\n\u0f42");
-        uhelp("KA%\nKHA", "\u0f40\u0f35\u0f0b\u0f41");
-        uhelp("KA%", "\u0f40\u0f35");
-        uhelp("KAo", "\u0f40[#ERROR CONVERTING ACIP DOCUMENT: This converter cannot yet convert o because the converter's author is unclear what the result should be.]");
+        uhelp("KA%\nKHA", "\u0f40\u0f35[#WARNING CONVERTING ACIP DOCUMENT: Lexical warning: The ACIP {%} is treated by this converter as U+0F35, but sometimes might represent U+0F14 in practice]\u0f0b\u0f41");
+        uhelp("KA%", "\u0f40\u0f35[#WARNING CONVERTING ACIP DOCUMENT: Lexical warning: The ACIP {%} is treated by this converter as U+0F35, but sometimes might represent U+0F14 in practice]");
+        uhelp("KAo", "\u0f40\u0f37");
+        uhelp("KAo\n\nKA", "\u0f40\u0f37\u0f0b\n\n\u0f40");
+        uhelp("KAo\nKHA", "\u0f40\u0f37\u0f0b\u0f41");
+        uhelp("KAo KHA", "\u0f40\u0f37\u0f0b\u0f41");
+        uhelp("KA KAo KHA", "\u0f40\u0f0b\u0f40\u0f37\u0f0b\u0f41");
        uhelp("KAx", "\u0f40[#ERROR CONVERTING ACIP DOCUMENT: This converter cannot yet convert x because the converter's author is unclear what the result should be.]");
        uhelp("G+DHA", "\u0f42\u0fa2");
        uhelp("P'EE", "\u0f54\u0f71\u0f7b");