Improved the ACIP scanner (the part of the converter that says, "This

is a correction, that's a comment, this is Tibetan, that's Latin
(English), that's Tibetan inter-tsheg-bar punctuation, etc.").  It now
accepts more real-world ACIP files, i.e. it handles illegal
constructs.  The error checking is more user-friendly.  There are now
tests.

Added some tsheg bars that Peter E. Hauer of Linguasoft sent me to the
tests.  Many thanks, Peter.  I still need to implement rules that say,
"This is not Tibetan, it must be Sanskrit, because that letter doesn't
take a MA prefix."
This commit is contained in:
dchandler 2003-08-17 01:45:55 +00:00
parent 0b91ed0beb
commit 4581a2d8ab
3 changed files with 2049 additions and 46 deletions

View file

@ -34,11 +34,14 @@ public class ACIPString {
public static final int COMMENT = 0;
/** For Folio markers like @012B */
public static final int FOLIO_MARKER = 1;
/** For Latin letters and numbers etc. [*LINE BREAK?] uses this,
* for example. */
public static final int LATIN = 2;
/** For Tibetan letters and numbers etc. */
public static final int TIBETAN_NON_PUNCTUATION = 2;
public static final int TIBETAN_NON_PUNCTUATION = 3;
/** For tshegs, whitespace and the like, but not combining
* punctuation like %, o, :, m, and x */
public static final int TIBETAN_PUNCTUATION = 3;
public static final int TIBETAN_PUNCTUATION = 4;
/** For the start of a [*probable correction] or [*possible correction?] */
public static final int CORRECTION_START = 5;
/** Denotes the end of a [*probable correction] */
@ -65,7 +68,7 @@ public class ACIPString {
public static final int END_PAREN = 16;
/** For things that are not legal syntax, such as a file that
* contains just "[# HALF A COMMEN" */
public static final int ERROR = 17; /* DLC let the user know. */
public static final int ERROR = 17;
/** Returns true if and only if this string is Latin (usually
* English). Returns false if this string is transliteration of
@ -105,6 +108,7 @@ public class ACIPString {
String typeString = "HUH?????";
if (type == COMMENT) typeString = "COMMENT";
if (type == FOLIO_MARKER) typeString = "FOLIO_MARKER";
if (type == LATIN) typeString = "LATIN";
if (type == TIBETAN_NON_PUNCTUATION) typeString = "TIBETAN_NON_PUNCTUATION";
if (type == TIBETAN_PUNCTUATION) typeString = "TIBETAN_PUNCTUATION";
if (type == CORRECTION_START) typeString = "CORRECTION_START";
@ -120,6 +124,6 @@ public class ACIPString {
if (type == START_PAREN) typeString = "START_PAREN";
if (type == END_PAREN) typeString = "END_PAREN";
if (type == ERROR) typeString = "ERROR";
return typeString + ":\"" + getText() + "\"";
return typeString + ":{" + getText() + "}";
}
}