Improved the ACIP scanner (the part of the converter that says, "This is a correction, that's a comment, this is Tibetan, that's Latin (English), that's Tibetan inter-tsheg-bar punctuation," etc.). It now accepts more real-world ACIP files, i.e., it handles illegal constructs. The error checking is more user-friendly. There are now tests. Added to the tests some tsheg bars that Peter E. Hauer of Linguasoft sent me. Many thanks, Peter. I still need to implement rules that say, "This is not Tibetan, it must be Sanskrit, because that letter doesn't take a MA prefix."
2003-08-17 01:45:55 +00:00 · 2003-08-17 01:45:55 +00:00 · 4581a2d8ab
commit 4581a2d8ab
parent 0b91ed0beb
3 changed files with 2049 additions and 46 deletions
--- a/source/org/thdl/tib/text/ttt/ACIPString.java
+++ b/source/org/thdl/tib/text/ttt/ACIPString.java
@@ -34,11 +34,14 @@ public class ACIPString {
    public static final int COMMENT = 0;
    /** For Folio markers like @012B */
    public static final int FOLIO_MARKER = 1;
+    /** For Latin letters and numbers etc.  [*LINE BREAK?] uses this,
+     *  for example. */
+    public static final int LATIN = 2;
    /** For Tibetan letters and numbers etc. */
-    public static final int TIBETAN_NON_PUNCTUATION = 2;
+    public static final int TIBETAN_NON_PUNCTUATION = 3;
    /** For tshegs, whitespace and the like, but not combining
     *  punctuation like %, o, :, m, and x */
-    public static final int TIBETAN_PUNCTUATION = 3;
+    public static final int TIBETAN_PUNCTUATION = 4;
    /** For the start of a [*probable correction] or [*possible correction?] */
    public static final int CORRECTION_START = 5;
    /** Denotes the end of a [*probable correction] */
@@ -65,7 +68,7 @@ public class ACIPString {
    public static final int END_PAREN = 16;
    /** For things that are not legal syntax, such as a file that
     * contains just "[# HALF A COMMEN" */
-    public static final int ERROR = 17; /* DLC let the user know. */
+    public static final int ERROR = 17;

    /** Returns true if and only if this string is Latin (usually
     *  English).  Returns false if this string is transliteration of
@@ -105,6 +108,7 @@ public class ACIPString {
        String typeString = "HUH?????";
        if (type == COMMENT) typeString = "COMMENT";
        if (type == FOLIO_MARKER) typeString = "FOLIO_MARKER";
+        if (type == LATIN) typeString = "LATIN";
        if (type == TIBETAN_NON_PUNCTUATION) typeString = "TIBETAN_NON_PUNCTUATION";
        if (type == TIBETAN_PUNCTUATION) typeString = "TIBETAN_PUNCTUATION";
        if (type == CORRECTION_START) typeString = "CORRECTION_START";
@@ -120,6 +124,6 @@ public class ACIPString {
        if (type == START_PAREN) typeString = "START_PAREN";
        if (type == END_PAREN) typeString = "END_PAREN";
        if (type == ERROR) typeString = "ERROR";
-        return typeString + ":\"" + getText() + "\"";
+        return typeString + ":{" + getText() + "}";
    }
 }