I've improved the ACIP tsheg bar scanner to handle a lot of illegal

constructions that occur in practice.
2003-08-16 16:13:53 +00:00 · 2003-08-16 16:13:53 +00:00 · 0b91ed0beb
commit 0b91ed0beb
parent 2a57439516
1 changed files with 266 additions and 35 deletions
--- a/source/org/thdl/tib/text/ttt/ACIPTshegBarScanner.java
+++ b/source/org/thdl/tib/text/ttt/ACIPTshegBarScanner.java
@ -33,6 +33,41 @@ import org.thdl.util.ThdlDebug;
 * @author David Chandler
 */
 public class ACIPTshegBarScanner {
    /**
     * Command-line entry point.  Expects exactly one argument, the
     * path of an ACIP input file, and scans that file with
     * {@link #scanFile(String, StringBuffer)}.  Prints the collected
     * error messages and exits with status 1 if the scan reported any
     * errors (or if the argument count is wrong); otherwise prints a
     * success message and exits with status 0.
     *
     * @param args the command-line arguments; must hold just the
     * ACIP file's path
     * @throws IOException if scanFile throws it while reading the
     * file
     */
    public static void main(String[] args) throws IOException {
        if (1 != args.length) {
            System.out.println("Bad args!  Need just the ACIP file's path.");
            System.exit(1);
        }
        // Only the error report matters here; the scanned list that
        // scanFile returns is not used by this driver.
        StringBuffer problems = new StringBuffer();
        scanFile(args[0], problems);
        boolean scanWasClean = (0 == problems.length());
        if (!scanWasClean) {
            System.out.println("Errors scanning ACIP input file: ");
            System.out.println(problems);
            System.out.println("Exiting; please fix input file and try again.");
            System.exit(1);
        }
        System.out.println("Good scan!");
        System.exit(0);
    }
    /**
     * Reads the whole file named fname into memory and hands its
     * contents to {@link #scan(String, StringBuffer)}.  The file is
     * decoded with the platform's default character encoding, and the
     * underlying reader is always closed before this method returns,
     * whether the read succeeds or fails.
     *
     * @param fname the path of the ACIP input file to read
     * @param errors passed straight through to scan, which appends
     * its error messages to it when it is non-null
     * @return the list that scan produces for the file's contents
     * @throws IOException if the file cannot be opened, read, or
     * closed
     */
    // DLC FIXME: not so efficient; copies the whole file into memory first
    public static ArrayList scanFile(String fname, StringBuffer errors) throws IOException {
        StringBuffer s = new StringBuffer();
        char ch[] = new char[8192];
        BufferedReader in
            = new BufferedReader(new InputStreamReader(new FileInputStream(fname))); // DLC FIXME: specify encoding.
        try {
            int amt;
            while (-1 != (amt = in.read(ch))) {
                s.append(ch, 0, amt);
            }
        } finally {
            // Release the file handle even when read() throws;
            // closing the BufferedReader closes the wrapped streams.
            in.close();
        }
        return scan(s.toString(), errors);
    }
    /** Returns a list of {@link ACIPString ACIPStrings} corresponding
     *  to s, possibly the empty list (when the empty string is the
     *  input).  Each String is either a Latin comment, some Latin
@ -41,9 +76,13 @@ public class ACIPTshegBarScanner {
     *
     *  <p>This not only scans; it finds all the errors a parser would
     *  too, like "NYA x" and "(" and ")" and "/NYA" etc.  It puts
-     *  those in as ACIPStrings with type {@link ACIPString#ERROR}.
+     *  those in as ACIPStrings with type {@link ACIPString#ERROR},
     *  and also, if errors is non-null, appends helpful messages to
     *  errors, each followed by a '\n'.  There is at least one case
     *  where no ERROR ACIPString will appear but errors will be
     *  modified.
    */
-    public static ArrayList scan(String s) {
+    public static ArrayList scan(String s, StringBuffer errors) {
        // the size depends on whether it's mostly Tibetan or mostly
        // Latin and a number of other factors.  This is meant to be
@ -60,9 +99,18 @@ public class ACIPTshegBarScanner {
            if (i < startOfString) throw new Error("bad reset");
            char ch;
            ch = s.charAt(i);
-            if (ACIPString.COMMENT == currentType && ch != ']')
+            if (ACIPString.COMMENT == currentType && ch != ']') {
                if ('[' == ch) {
                    al.add(new ACIPString("Found an open square bracket, [, within a [#COMMENT]-style comment.  Square brackets may not appear in comments.\n",
                                          ACIPString.ERROR));
                    if (null != errors)
                        errors.append("Offset " + i + ": "
                                      + "Found an open square bracket, [, within a [#COMMENT]-style comment.  Square brackets may not appear in comments.\n");
                }
                continue;
            }
            switch (ch) {
            case '}':
            case ']':
                if (bracketTypeStack.empty()) {
                    // Error.
@ -71,6 +119,9 @@ public class ACIPTshegBarScanner {
                                              currentType));
                    }
                    al.add(new ACIPString(s.substring(i, i+1), ACIPString.ERROR));
                    if (null != errors)
                        errors.append("Offset " + i + ": "
                                      + "Found a closing square bracket, ], without a matching open square bracket, [.  Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.\n");
                    startOfString = i+1;
                    currentType = ACIPString.ERROR;
                } else {
@ -89,8 +140,11 @@ public class ACIPTshegBarScanner {
                    startOfString = i+1;
                    currentType = ACIPString.ERROR;
                }
-                break;
+                break; // end ']','}' case
            case '{': // NOTE WELL: KX0016I.ACT, KD0095M.ACT, and a
                      // host of other ACIP files use {} brackets like
                      // [] brackets.  I treat both the same.
            case '[':
                // This definitely indicates a new token.
                if (startOfString < i) {
@ -102,37 +156,107 @@ public class ACIPTshegBarScanner {
                String thingy = null;
                if (i + "[DD]".length() <= sl
-                    && s.substring(i, i + "[DD]".length()).equals("[DD]")) {
+                    && (s.substring(i, i + "[DD]".length()).equals("[DD]")
                        || s.substring(i, i + "[DD]".length()).equals("{DD}"))) {
                    thingy = "[DD]";
                    currentType = ACIPString.DD;
                } else if (i + "[DD1]".length() <= sl
-                           && s.substring(i, i + "[DD1]".length()).equals("[DD1]")) {
+                           && (s.substring(i, i + "[DD1]".length()).equals("[DD1]")
                               || s.substring(i, i + "[DD1]".length()).equals("{DD1}"))) {
                    thingy = "[DD1]";
                    currentType = ACIPString.DD;
                } else if (i + "[DD2]".length() <= sl
-                           && s.substring(i, i + "[DD2]".length()).equals("[DD2]")) {
+                           && (s.substring(i, i + "[DD2]".length()).equals("[DD2]")
                               || s.substring(i, i + "[DD2]".length()).equals("{DD2}"))) {
                    thingy = "[DD2]";
                    currentType = ACIPString.DD;
                } else if (i + "[DDD]".length() <= sl
-                           && s.substring(i, i + "[DDD]".length()).equals("[DDD]")) {
+                           && (s.substring(i, i + "[DDD]".length()).equals("[DDD]")
                               || s.substring(i, i + "[DDD]".length()).equals("{DDD}"))) {
                    thingy = "[DDD]";
                    currentType = ACIPString.DD;
                } else if (i + "[DR]".length() <= sl
-                           && s.substring(i, i + "[DR]".length()).equals("[DR]")) {
+                           && (s.substring(i, i + "[DR]".length()).equals("[DR]")
                               || s.substring(i, i + "[DR]".length()).equals("{DR}"))) {
                    thingy = "[DR]";
                    currentType = ACIPString.DR;
                } else if (i + "[LS]".length() <= sl
-                           && s.substring(i, i + "[LS]".length()).equals("[LS]")) {
+                           && (s.substring(i, i + "[LS]".length()).equals("[LS]")
                               || s.substring(i, i + "[LS]".length()).equals("{LS}"))) {
                    thingy = "[LS]";
                    currentType = ACIPString.LS;
                } else if (i + "[BP]".length() <= sl
-                           && s.substring(i, i + "[BP]".length()).equals("[BP]")) {
+                           && (s.substring(i, i + "[BP]".length()).equals("[BP]")
                               || s.substring(i, i + "[BP]".length()).equals("{BP}"))) {
                    thingy = "[BP]";
                    currentType = ACIPString.BP;
                } else if (i + "[ BP ]".length() <= sl
                           && (s.substring(i, i + "[ BP ]".length()).equals("[ BP ]")
                               || s.substring(i, i + "[ BP ]".length()).equals("{ BP }"))) {
                    thingy = "{ BP }"; // found in TD3790E2.ACT
                    currentType = ACIPString.BP;
                } else if (i + "[ DD ]".length() <= sl
                           && (s.substring(i, i + "[ DD ]".length()).equals("[ DD ]")
                               || s.substring(i, i + "[ DD ]".length()).equals("{ DD }"))) {
                    thingy = "{ DD }"; // found in TD3790E2.ACT
                    currentType = ACIPString.DD;
                } else if (i + "[?]".length() <= sl
-                           && s.substring(i, i + "[?]".length()).equals("[?]")) {
+                           && (s.substring(i, i + "[?]".length()).equals("[?]")
                               || s.substring(i, i + "[?]".length()).equals("{?}"))) {
                    thingy = "[?]";
                    currentType = ACIPString.QUESTION;
                } else {
                    //  We see comments appear not as [#COMMENT], but
                    //  as [COMMENT] sometimes.  We make special cases
                    //  for some English comments.  DLC FIXME: put
                    //  these in a config file.
                    String[] englishComments = new String[] {
                        "FIRST", "SECOND", // S5274I.ACT
                        "Additional verses added by Khen Rinpoche here are", // S0216M.ACT
                        "ADDENDUM: The text of", // S0216M.ACT
                        "END OF ADDENDUM", // S0216M.ACT
                        "Some of the verses added here by Khen Rinpoche include:", // S0216M.ACT
                        "Note that, in the second verse, the {YUL LJONG} was orignally {GANG LJONG},\nand is now recited this way since the ceremony is not only taking place in Tibet.", // S0216M.ACT
                        "Note that, in the second verse, the {YUL LJONG} was orignally {GANG LJONG},\r\nand is now recited this way since the ceremony is not only taking place in Tibet.", // S0216M.ACT
                        "text missing", // S6954E1.ACT
                        "INCOMPLETE", // TD3817I.INC
                        "MISSING PAGE", // S0935m.act
                        "MISSING FOLIO", // S0975I.INC
                        "UNCLEAR LINE", // S0839D1I.INC
                        "THE FOLLOWING TEXT HAS INCOMPLETE SECTIONS, WHICH ARE ON ORDER", // SE6260A.INC
                        "@DATA INCOMPLETE HERE", // SE6260A.INC
                        "@DATA MISSING HERE", // SE6260A.INC
                        "DATA INCOMPLETE HERE", // TD4226I2.INC
                        "DATA MISSING HERE", // just being consistent
                        "FOLLOWING SECTION WAS NOT AVAILABLE WHEN THIS EDITION WAS\nPRINTED, AND IS SUPPLIED FROM ANOTHER, PROBABLY THE ORIGINAL:", // S0018N.ACT
                        "FOLLOWING SECTION WAS NOT AVAILABLE WHEN THIS EDITION WAS\r\nPRINTED, AND IS SUPPLIED FROM ANOTHER, PROBABLY THE ORIGINAL:", // S0018N.ACT
                        "THESE PAGE NUMBERS RESERVED IN THIS EDITION FOR PAGES\nMISSING FROM ORIGINAL ON WHICH IT WAS BASED", // S0018N.ACT
                        "THESE PAGE NUMBERS RESERVED IN THIS EDITION FOR PAGES\r\nMISSING FROM ORIGINAL ON WHICH IT WAS BASED", // S0018N.ACT
                        "PAGE NUMBERS RESERVED FROM THIS EDITION FOR MISSING\nSECTION SUPPLIED BY PRECEDING", // S0018N.ACT
                        "PAGE NUMBERS RESERVED FROM THIS EDITION FOR MISSING\r\nSECTION SUPPLIED BY PRECEDING", // S0018N.ACT
                        "SW: OK", // S0057M.ACT
                        "m:ok", // S0057M.ACT
                        "A FIRST ONE\nMISSING HERE?", // S0057M.ACT
                        "A FIRST ONE\r\nMISSING HERE?", // S0057M.ACT
                        "THE INITIAL PART OF THIS TEXT WAS INPUT BY THE SERA MEY LIBRARY IN\nTIBETAN FONT AND NEEDS TO BE REDONE BY DOUBLE INPUT", // S0195A1.INC
                        "THE INITIAL PART OF THIS TEXT WAS INPUT BY THE SERA MEY LIBRARY IN\r\nTIBETAN FONT AND NEEDS TO BE REDONE BY DOUBLE INPUT", // S0195A1.INC
                    };
                    boolean foundOne = false;
                    for (int ec = 0; ec < englishComments.length; ec++) {
                        if (i + 2 + englishComments[ec].length() <= sl
                            && (s.substring(i, i + 2 + englishComments[ec].length()).equals("[" + englishComments[ec] + "]")
                                || s.substring(i, i + 2 + englishComments[ec].length()).equals("[" + englishComments[ec] + "]"))) {
                            al.add(new ACIPString("[#" + englishComments[ec] + "]",
                                                  ACIPString.COMMENT));
                            startOfString = i + 2 + englishComments[ec].length();
                            i = startOfString - 1;
                            foundOne = true;
                            break;
                        }
                    }
                    if (foundOne)
                        break;
                }
                if (null != thingy) {
                    al.add(new ACIPString(thingy,
@ -157,10 +281,18 @@ public class ACIPTshegBarScanner {
                    // "... [" could cause this too.
                    al.add(new ACIPString(s.substring(i, i+1),
                                          ACIPString.ERROR));
                    if (null != errors) {
                        String inContext = s.substring(i, i+Math.min(sl-i, 10));
                        if (sl-i > 10) {
                            inContext = inContext + "...";
                        }
                        errors.append("Offset " + i + ": "
                                      + "Found an illegal open square bracket, [ (in context, this is " + inContext + ").  Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open square bracket?\n");
                    }
                    startOfString = i + 1;
                    currentType = ACIPString.ERROR;
                }
-                break; // end '[' case
+                break; // end '[','{' case
            case '@':
                // This definitely indicates a new token.
@ -185,8 +317,31 @@ public class ACIPTshegBarScanner {
                            }
                        }
                        if (allAreNumeric) {
-                            al.add(new ACIPString(s.substring(i, i+numdigits+2), ACIPString.FOLIO_MARKER));
+                            al.add(new ACIPString(s.substring(i, i+numdigits+2),
                                                  ACIPString.FOLIO_MARKER));
                            startOfString = i+numdigits+2;
                            i = startOfString - 1;
                            currentType = ACIPString.ERROR;
                            break;
                        }
                    }
                    //                    System.out.println("DLC NOW HERE xxx y:" + (i+numdigits+3 < sl) + " z:" + s.charAt(i+1) + s.charAt(i+numdigits+2) + s.charAt(i+numdigits+3));
                    if (i+numdigits+3 < sl
                        && s.charAt(i+1) == '[' && s.charAt(i+numdigits+3) == ']'
                        && (s.charAt(i+numdigits+2) == 'A' || s.charAt(i+numdigits+2) == 'B')) {
                        boolean allAreNumeric = true;
                        for (int k = 1; k <= numdigits; k++) {
                            if (!isNumeric(s.charAt(i+1+k))) {
                                allAreNumeric = false;
                                break;
                            }
                        }
                        if (allAreNumeric) {
                            al.add(new ACIPString(s.substring(i, i+numdigits+4),
                                                  ACIPString.FOLIO_MARKER));
                            startOfString = i+numdigits+4;
                            i = startOfString - 1;
                            currentType = ACIPString.ERROR;
                            break;
                        }
@ -194,6 +349,9 @@ public class ACIPTshegBarScanner {
                }
                if (startOfString == i) {
                    al.add(new ACIPString(s.substring(i, i+1), ACIPString.ERROR));
                    if (null != errors)
                        errors.append("Offset " + i + ": "
                                      + "Found an illegal at sign, @.  @012B is an example of a legal folio marker.\n");
                    startOfString = i+1;
                    currentType = ACIPString.ERROR;
                }
@ -209,13 +367,15 @@ public class ACIPTshegBarScanner {
                }
                if (startSlashIndex >= 0) {
-                    al.add(new ACIPString(s.substring(i, i+1), ACIPString.END_SLASH));
+                    al.add(new ACIPString(s.substring(i, i+1),
                                          ACIPString.END_SLASH));
                    startOfString = i+1;
                    currentType = ACIPString.ERROR;
                    startSlashIndex = -1;
                } else {
                    startSlashIndex = i;
-                    al.add(new ACIPString(s.substring(i, i+1), ACIPString.START_SLASH));
+                    al.add(new ACIPString(s.substring(i, i+1),
                                          ACIPString.START_SLASH));
                    startOfString = i+1;
                    currentType = ACIPString.ERROR;
                }
@ -234,26 +394,68 @@ public class ACIPTshegBarScanner {
                // DLC support nesting like (NYA (BA))?
                if (startParenIndex >= 0) {
-                    if (ch == '(')
+                    if (ch == '(') {
                        al.add(new ACIPString("Nesting of parentheses () is not allowed", ACIPString.ERROR));
-                    else {
+                        if (null != errors)
                            errors.append("Offset " + i + ": "
                                          + "Found an illegal open parenthesis, (.  Nesting of parentheses is not allowed.\n");
                    } else {
                        al.add(new ACIPString(s.substring(i, i+1), ACIPString.END_PAREN));
                        startParenIndex = -1;
                    }
                    startOfString = i+1;
                    currentType = ACIPString.ERROR;
                } else {
-                    if (ch == ')')
+                    if (ch == ')') {
                        al.add(new ACIPString("Unexpected closing parenthesis )", ACIPString.ERROR));
-                    else {
+                        if (null != errors)
                            errors.append("Offset " + i + ": "
                                          + "Unexpected closing parenthesis, ), found.\n");
                    } else {
                        startParenIndex = i;
                        al.add(new ACIPString(s.substring(i, i+1), ACIPString.START_PAREN));
                    }
                    startOfString = i+1;
                    currentType = ACIPString.ERROR;
                }
-                break; // end '/' case
+                break; // end '(',')' case
            case '?':
                if (bracketTypeStack.empty()) {
                    // The tsheg bar ends here; new token.
                    if (startOfString < i) {
                        al.add(new ACIPString(s.substring(startOfString, i),
                                              currentType));
                    }
                    al.add(new ACIPString(s.substring(i, i+1),
                                          ACIPString.QUESTION));
                    startOfString = i+1;
                    currentType = ACIPString.ERROR;
                } // else this is [*TR'A ?] or the like.
                break; // end '?' case
            case '.':
                // This definitely indicates a new token.
                if (startOfString < i) {
                    al.add(new ACIPString(s.substring(startOfString, i),
                                          currentType));
                    startOfString = i;
                    currentType = ACIPString.ERROR;
                }
                // . is used for a non-breaking tsheg, such as in {NGO.,} and {....,DAM}.  We give an error unless , or . follows '.'.
                if (i + 1 < sl && (s.charAt(i+1) == '.' || s.charAt(i+1) == ',')) {
                    al.add(new ACIPString(s.substring(i, i+1),
                                          ACIPString.TIBETAN_PUNCTUATION));
                } else {
                    al.add(new ACIPString("A non-breaking tsheg, '.', appeared, but not like \"...,\" or \".,\".",
                                          ACIPString.ERROR));
                    if (null != errors)
                        errors.append("Offset " + i + ": "
                                      + "A non-breaking tsheg, '.', appeared, but not like \"...,\" or \".,\".\n");
                }
                break; // end '.' case
            // Classic tsheg bar enders:
            case ' ':
@ -277,6 +479,13 @@ public class ACIPTshegBarScanner {
                break; // end TIBETAN_PUNCTUATION case
            default:
                if (!bracketTypeStack.empty()) {
                    int stackTop = ((Integer)bracketTypeStack.peek()).intValue();
                    if (ACIPString.CORRECTION_START == stackTop && '?' == ch) {
                        // allow it through...
                        break;
                    }
                }
                if (!(isNumeric(ch) || isAlpha(ch))) {
                    if (startOfString < i) {
                        al.add(new ACIPString(s.substring(startOfString, i),
@ -284,6 +493,9 @@ public class ACIPTshegBarScanner {
                    }
                    al.add(new ACIPString(s.substring(i, i+1),
                                          ACIPString.ERROR));
                    if (null != errors)
                        errors.append("Offset " + i + ": "
                                      + "Found an illegal character, " + ch + "\n");
                    startOfString = i+1;
                    currentType = ACIPString.ERROR;
                } else {
@ -297,18 +509,33 @@ public class ACIPTshegBarScanner {
        if (startOfString < sl) {
            al.add(new ACIPString(s.substring(startOfString, sl),
                                  currentType));
        }
        if (!bracketTypeStack.empty()) {
            al.add(new ACIPString("UNEXPECTED END OF INPUT",
                                  ACIPString.ERROR));
            if (null != errors) {
                if (ACIPString.COMMENT == currentType) {
                    errors.append("Offset END: "
                                  + "Unmatched open square bracket, [, found.  A comment does not terminate.\n");
                } else {
                    errors.append("Offset END: "
                                  + "Unmatched open square bracket, [, found.  A correction does not terminate.\n");
                }
            }
        }
        if (startSlashIndex >= 0) {
            al.add(new ACIPString("Slashes are supposed to occur in pairs, but the input had an unmatched '/' character.",
                                  ACIPString.ERROR));
            if (null != errors)
                errors.append("Offset END: "
                              + "Slashes are supposed to occur in pairs, but the input had an unmatched '/' character.\n");
        }
        if (startParenIndex >= 0) {
            al.add(new ACIPString("Parentheses are supposed to occur in pairs, but the input had an unmatched parenthesis.",
                                  ACIPString.ERROR));
-            }
+            if (null != errors)
                errors.append("Offset END: "
                              + "Unmatched open parenthesis, (, found.\n");
        }
        return al;
    }
@ -320,12 +547,16 @@ public class ACIPTshegBarScanner {
    /** See implementation. */
    private static boolean isAlpha(char ch) {
-        return ch == '\''
+        return ch == '\'' // 23rd consonant
-            // combining punctuation:
+            // combining punctuation, vowels:
            || ch == '%'
            || ch == 'o'
            || ch == 'x'
            || ch == ':'
            || ch == '-'
            || ch == '+'
            || (ch >= 'A' && ch <= 'Z')
            || (ch >= 'a' && ch <= 'z');