I now have a function that takes as input a String of ACIP and breaks up that String into tsheg bars, punctuation, etc., while finding errors. I've tested it some, but I'm not yet committing the tests. Next step: a converter that takes an ACIP file as input and outputs TMW+Latin.
2003-08-14 05:10:47 +00:00 · 2003-08-14 05:10:47 +00:00 · 2b59d9838d
commit 2b59d9838d
parent d4732938a3
2 changed files with 458 additions and 0 deletions
--- a/source/org/thdl/tib/text/ttt/ACIPString.java
+++ b/source/org/thdl/tib/text/ttt/ACIPString.java
@ -0,0 +1,125 @@
+/*
+The contents of this file are subject to the THDL Open Community License
+Version 1.0 (the "License"); you may not use this file except in compliance
+with the License. You may obtain a copy of the License on the THDL web site 
+(http://www.thdl.org/).
+
+Software distributed under the License is distributed on an "AS IS" basis, 
+WITHOUT WARRANTY OF ANY KIND, either express or implied. See the 
+License for the specific terms governing rights and limitations under the 
+License. 
+
+The Initial Developer of this software is the Tibetan and Himalayan Digital
+Library (THDL). Portions created by the THDL are Copyright 2003 THDL.
+All Rights Reserved. 
+
+Contributor(s): ______________________________________.
+*/
+
+package org.thdl.tib.text.ttt;
+
+/**
+* An ACIPString is some Latin text and a type, the type stating
+* whether said text is Latin (usually English) or transliteration of
+* Tibetan and which particular kind.  Scanning errors are also encoded
+* as ACIPStrings using a special type.
+*
+* @author David Chandler
+*/
+public class ACIPString {
+    private int type;
+    private String text;
+
+    /** For [#COMMENTS] */
+    public static final int COMMENT = 0;
+    /** For Folio markers like @012B */
+    public static final int FOLIO_MARKER = 1;
+    /** For Tibetan letters and numbers etc. */
+    public static final int TIBETAN_NON_PUNCTUATION = 2;
+    /** For tshegs, whitespace and the like, but not combining
+     *  punctutation like %, o, :, m, and x */
+    public static final int TIBETAN_PUNCTUATION = 3;
+    /** For the start of a [*probable correction] or [*possible correction?] */
+    public static final int CORRECTION_START = 5;
+    /** Denotes the end of a [*probable correction] */
+    public static final int PROBABLE_CORRECTION = 6;
+    /** Denotes the end of a [*possible correction?] */
+    public static final int POSSIBLE_CORRECTION = 7;
+    /** For [BP] -- blank page */
+    public static final int BP = 8;
+    /** For [LS] -- Lanycha script on page */
+    public static final int LS = 9;
+    /** For [DR] -- picture (without caption) on page */
+    public static final int DR = 10;
+    /** For [DD], [DDD], [DD1], [DD2], etc. -- picture with caption on page */
+    public static final int DD = 11;
+    /** For [?] */
+    public static final int QUESTION = 12;
+    /** For the first / in /NYA/ */
+    public static final int START_SLASH = 13;
+    /** For the last / in /NYA/ */
+    public static final int END_SLASH = 14;
+    /** For the opening ( in (NYA) */
+    public static final int START_PAREN = 15;
+    /** For the closing ) in (NYA) */
+    public static final int END_PAREN = 16;
+    /** For things that are not legal syntax, such as a file that
+     * contains just "[# HALF A COMMEN" */
+    public static final int ERROR = 17; /* DLC let the user know. */
+
+    /** Returns true if and only if this string is Latin (usually
+     *  English).  Returns false if this string is transliteration of
+     *  Tibetan. */
+    public int getType() {
+        return type;
+    }
+
+    /** Returns the non-null, non-empty String of text associated with
+     *  this string. */
+    public String getText() {
+        return text;
+    }
+
+    private void setType(int t) {
+        if (t < COMMENT || t > ERROR)
+            throw new IllegalArgumentException("Bad type");
+        type = t;
+    }
+
+    private void setText(String t) {
+        if (t == null || "".equals(t))
+            throw new IllegalArgumentException("null or empty text, DD should have text [DD] e.g.");
+        text = t;
+    }
+
+    /** Don't instantiate me. */
+    private ACIPString() { }
+
+    /** Creates a new ACIPString with source text <i>text</i> and type
+     *  <i>type</i> being a characterization like {@link DD}. */
+    public ACIPString(String text, int type) {
+        setType(type);
+        setText(text);
+    }
+    public String toString() {
+        String typeString = "HUH?????";
+        if (type == COMMENT) typeString = "COMMENT";
+        if (type == FOLIO_MARKER) typeString = "FOLIO_MARKER";
+        if (type == TIBETAN_NON_PUNCTUATION) typeString = "TIBETAN_NON_PUNCTUATION";
+        if (type == TIBETAN_PUNCTUATION) typeString = "TIBETAN_PUNCTUATION";
+        if (type == CORRECTION_START) typeString = "CORRECTION_START";
+        if (type == PROBABLE_CORRECTION) typeString = "PROBABLE_CORRECTION";
+        if (type == POSSIBLE_CORRECTION) typeString = "POSSIBLE_CORRECTION";
+        if (type == BP) typeString = "BP";
+        if (type == LS) typeString = "LS";
+        if (type == DR) typeString = "DR";
+        if (type == DD) typeString = "DD";
+        if (type == QUESTION) typeString = "QUESTION";
+        if (type == START_SLASH) typeString = "START_SLASH";
+        if (type == END_SLASH) typeString = "END_SLASH";
+        if (type == START_PAREN) typeString = "START_PAREN";
+        if (type == END_PAREN) typeString = "END_PAREN";
+        if (type == ERROR) typeString = "ERROR";
+        return typeString + ":\"" + getText() + "\"";
+    }
+}
--- a/source/org/thdl/tib/text/ttt/ACIPTshegBarScanner.java
+++ b/source/org/thdl/tib/text/ttt/ACIPTshegBarScanner.java
@ -0,0 +1,333 @@
+/*
+The contents of this file are subject to the THDL Open Community License
+Version 1.0 (the "License"); you may not use this file except in compliance
+with the License. You may obtain a copy of the License on the THDL web site 
+(http://www.thdl.org/).
+
+Software distributed under the License is distributed on an "AS IS" basis, 
+WITHOUT WARRANTY OF ANY KIND, either express or implied. See the 
+License for the specific terms governing rights and limitations under the 
+License. 
+
+The Initial Developer of this software is the Tibetan and Himalayan Digital
+Library (THDL). Portions created by the THDL are Copyright 2003 THDL.
+All Rights Reserved. 
+
+Contributor(s): ______________________________________.
+*/
+
+package org.thdl.tib.text.ttt;
+
+import java.io.*;
+import java.util.ArrayList;
+import java.util.Stack;
+
+import org.thdl.util.ThdlDebug;
+
+/**
+* This class is able to break up Strings of ACIP text (for example, an
+* entire sutra file) into tsheg bars, comments, etc. Folio markers,
+* comments, and the like are segregated (so that consumers can ensure
+* that they remain in Latin), and Tibetan passages are broken up into
+* tsheg bars.
+* @author David Chandler
+*/
+public class ACIPTshegBarScanner {
+    /** Returns a list of {@link ACIPString ACIPStrings} corresponding
+     *  to s, possibly the empty list (when the empty string is the
+     *  input).  Each String is either a Latin comment, some Latin
+     *  text, a tsheg bar (minus the tsheg or shad or whatever), a
+     *  String of inter-tsheg-bar punctuation, etc.
+     *
+     *  <p>This not only scans; it finds all the errors a parser would
+     *  too, like "NYA x" and "(" and ")" and "/NYA" etc.  It puts
+     *  those in as ACIPStrings with type {@link ACIPString#ERROR}.
+    */
+    public static ArrayList scan(String s) {
+
+        // the size depends on whether it's mostly Tibetan or mostly
+        // Latin and a number of other factors.  This is meant to be
+        // an underestimate, but not too much of an underestimate.
+        ArrayList al = new ArrayList(s.length() / 10);
+        
+        int sl = s.length();
+        int currentType = ACIPString.ERROR;
+        int startOfString = 0;
+        Stack bracketTypeStack = new Stack();
+        int startSlashIndex = -1;
+        int startParenIndex = -1;
+        for (int i = 0; i < sl; i++) {
+            if (i < startOfString) throw new Error("bad reset");
+            char ch;
+            ch = s.charAt(i);
+            if (ACIPString.COMMENT == currentType && ch != ']')
+                continue;
+            switch (ch) {
+            case ']':
+                if (bracketTypeStack.empty()) {
+                    // Error.
+                    if (startOfString < i) {
+                        al.add(new ACIPString(s.substring(startOfString, i),
+                                              currentType));
+                    }
+                    al.add(new ACIPString(s.substring(i, i+1), ACIPString.ERROR));
+                    startOfString = i+1;
+                    currentType = ACIPString.ERROR;
+                } else {
+                    int stackTop = ((Integer)bracketTypeStack.pop()).intValue();
+
+                    String text = s.substring(startOfString, i+1);
+                    if (ACIPString.CORRECTION_START == stackTop) {
+                        char prevCh = s.charAt(i-1);
+                        if ('?' != prevCh) {
+                            currentType = ACIPString.PROBABLE_CORRECTION;
+                        } else {
+                            currentType = ACIPString.POSSIBLE_CORRECTION;
+                        }
+                    }
+                    al.add(new ACIPString(text, currentType));
+                    startOfString = i+1;
+                    currentType = ACIPString.ERROR;
+                }
+                break;
+
+            case '[':
+                // This definitely indicates a new token.
+                if (startOfString < i) {
+                    al.add(new ACIPString(s.substring(startOfString, i),
+                                          currentType));
+                    startOfString = i;
+                    currentType = ACIPString.ERROR;
+                }
+                String thingy = null;
+
+                if (i + "[DD]".length() <= sl
+                    && s.substring(i, i + "[DD]".length()).equals("[DD]")) {
+                    thingy = "[DD]";
+                    currentType = ACIPString.DD;
+                } else if (i + "[DD1]".length() <= sl
+                           && s.substring(i, i + "[DD1]".length()).equals("[DD1]")) {
+                    thingy = "[DD1]";
+                    currentType = ACIPString.DD;
+                } else if (i + "[DD2]".length() <= sl
+                           && s.substring(i, i + "[DD2]".length()).equals("[DD2]")) {
+                    thingy = "[DD2]";
+                    currentType = ACIPString.DD;
+                } else if (i + "[DDD]".length() <= sl
+                           && s.substring(i, i + "[DDD]".length()).equals("[DDD]")) {
+                    thingy = "[DDD]";
+                    currentType = ACIPString.DD;
+                } else if (i + "[DR]".length() <= sl
+                           && s.substring(i, i + "[DR]".length()).equals("[DR]")) {
+                    thingy = "[DR]";
+                    currentType = ACIPString.DR;
+                } else if (i + "[LS]".length() <= sl
+                           && s.substring(i, i + "[LS]".length()).equals("[LS]")) {
+                    thingy = "[LS]";
+                    currentType = ACIPString.LS;
+                } else if (i + "[BP]".length() <= sl
+                           && s.substring(i, i + "[BP]".length()).equals("[BP]")) {
+                    thingy = "[BP]";
+                    currentType = ACIPString.BP;
+                } else if (i + "[?]".length() <= sl
+                           && s.substring(i, i + "[?]".length()).equals("[?]")) {
+                    thingy = "[?]";
+                    currentType = ACIPString.QUESTION;
+                }
+                if (null != thingy) {
+                    al.add(new ACIPString(thingy,
+                                          currentType));
+                    startOfString = i + thingy.length();
+                    i = startOfString - 1;
+                } else {
+                    if (i + 1 < sl) {
+                        char nextCh = s.charAt(i+1);
+                        if ('*' == nextCh) {
+                            currentType = ACIPString.CORRECTION_START;
+                            bracketTypeStack.push(new Integer(currentType));
+                            break;
+                        } else if ('#' == nextCh) {
+                            currentType = ACIPString.COMMENT;
+                            bracketTypeStack.push(new Integer(currentType));
+                            break;
+                        }
+                    }
+                    // This is an error.  DLC FIXME: in practice
+                    // [COMMENTS APPEAR WITHOUT # MARKS].  Though
+                    // "... [" could cause this too.
+                    al.add(new ACIPString(s.substring(i, i+1),
+                                          ACIPString.ERROR));
+                    startOfString = i + 1;
+                    currentType = ACIPString.ERROR;
+                }
+                break; // end '[' case
+
+            case '@':
+                // This definitely indicates a new token.
+                if (startOfString < i) {
+                    al.add(new ACIPString(s.substring(startOfString, i),
+                                          currentType));
+                    startOfString = i;
+                    currentType = ACIPString.ERROR;
+                }
+
+                // We look for @N[AB], @NN[AB], @NNN[AB], @NNNN[AB],
+                // @NNNNN[AB], and @NNNNNN[AB] only, that is from one
+                // to six digits.
+                for (int numdigits = 1; numdigits <= 5; numdigits++) {
+                    if (i+numdigits+1 < sl
+                        && (s.charAt(i+numdigits+1) == 'A' || s.charAt(i+numdigits+1) == 'B')) {
+                        boolean allAreNumeric = true;
+                        for (int k = 1; k <= numdigits; k++) {
+                            if (!isNumeric(s.charAt(i+k))) {
+                                allAreNumeric = false;
+                                break;
+                            }
+                        }
+                        if (allAreNumeric) {
+                            al.add(new ACIPString(s.substring(i, i+numdigits+2), ACIPString.FOLIO_MARKER));
+                            startOfString = i+numdigits+2;
+                            currentType = ACIPString.ERROR;
+                            break;
+                        }
+                    }
+                }
+                if (startOfString == i) {
+                    al.add(new ACIPString(s.substring(i, i+1), ACIPString.ERROR));
+                    startOfString = i+1;
+                    currentType = ACIPString.ERROR;
+                }
+                break; // end '@' case
+
+            case '/':
+                // This definitely indicates a new token.
+                if (startOfString < i) {
+                    al.add(new ACIPString(s.substring(startOfString, i),
+                                          currentType));
+                    startOfString = i;
+                    currentType = ACIPString.ERROR;
+                }
+
+                if (startSlashIndex >= 0) {
+                    al.add(new ACIPString(s.substring(i, i+1), ACIPString.END_SLASH));
+                    startOfString = i+1;
+                    currentType = ACIPString.ERROR;
+                    startSlashIndex = -1;
+                } else {
+                    startSlashIndex = i;
+                    al.add(new ACIPString(s.substring(i, i+1), ACIPString.START_SLASH));
+                    startOfString = i+1;
+                    currentType = ACIPString.ERROR;
+                }
+                break; // end '/' case
+
+            case '(':
+            case ')':
+                // This definitely indicates a new token.
+                if (startOfString < i) {
+                    al.add(new ACIPString(s.substring(startOfString, i),
+                                          currentType));
+                    startOfString = i;
+                    currentType = ACIPString.ERROR;
+                }
+
+                // DLC support nesting like (NYA (BA))?
+
+                if (startParenIndex >= 0) {
+                    if (ch == '(')
+                        al.add(new ACIPString("Nesting of parentheses () is not allowed", ACIPString.ERROR));
+                    else {
+                        al.add(new ACIPString(s.substring(i, i+1), ACIPString.END_PAREN));
+                        startParenIndex = -1;
+                    }
+                    startOfString = i+1;
+                    currentType = ACIPString.ERROR;
+                } else {
+                    if (ch == ')')
+                        al.add(new ACIPString("Unexpected closing parenthesis )", ACIPString.ERROR));
+                    else {
+                        startParenIndex = i;
+                        al.add(new ACIPString(s.substring(i, i+1), ACIPString.START_PAREN));
+                    }
+                    startOfString = i+1;
+                    currentType = ACIPString.ERROR;
+                }
+                break; // end '/' case
+
+
+            // Classic tsheg bar enders:
+            case ' ':
+            case '\t':
+            case '\r':
+            case '\n':
+            case ',':
+            case '*':
+            case ';':
+            case '`':
+            case '#':
+                // The tsheg bar ends here; new token.
+                if (startOfString < i) {
+                    al.add(new ACIPString(s.substring(startOfString, i),
+                                          currentType));
+                }
+                al.add(new ACIPString(s.substring(i, i+1),
+                                      ACIPString.TIBETAN_PUNCTUATION));
+                startOfString = i+1;
+                currentType = ACIPString.ERROR;
+                break; // end TIBETAN_PUNCTUATION case
+
+            default:
+                if (!(isNumeric(ch) || isAlpha(ch))) {
+                    if (startOfString < i) {
+                        al.add(new ACIPString(s.substring(startOfString, i),
+                                              currentType));
+                    }
+                    al.add(new ACIPString(s.substring(i, i+1),
+                                          ACIPString.ERROR));
+                    startOfString = i+1;
+                    currentType = ACIPString.ERROR;
+                } else {
+                    // Continue through the loop.
+                    if (ACIPString.ERROR == currentType)
+                        currentType = ACIPString.TIBETAN_NON_PUNCTUATION;
+                }
+                break; // end default case
+            }
+        }
+        if (startOfString < sl) {
+            al.add(new ACIPString(s.substring(startOfString, sl),
+                                  currentType));
+            if (!bracketTypeStack.empty()) {
+                al.add(new ACIPString("UNEXPECTED END OF INPUT",
+                                      ACIPString.ERROR));
+            }
+            if (startSlashIndex >= 0) {
+                al.add(new ACIPString("Slashes are supposed to occur in pairs, but the input had an unmatched '/' character.",
+                                      ACIPString.ERROR));
+            }
+            if (startParenIndex >= 0) {
+                al.add(new ACIPString("Parentheses are supposed to occur in pairs, but the input had an unmatched parenthesis.",
+                                      ACIPString.ERROR));
+            }
+        }
+        return al;
+    }
+    
+    /** See implementation. */
+    private static boolean isNumeric(char ch) {
+        return ch >= '0' && ch <= '9';
+    }
+
+    /** See implementation. */
+    private static boolean isAlpha(char ch) {
+        return ch == '\''
+
+            // combining punctuation:
+            || ch == '%'
+            || ch == 'o'
+            || ch == 'x'
+            
+            || (ch >= 'A' && ch <= 'Z')
+            || (ch >= 'a' && ch <= 'z');
+    }
+}