I now have a function that takes as input a String of ACIP and breaks
up that String into tsheg bars, punctuation, etc., while finding errors. I've tested it some, but I'm not yet committing the tests. Next step: a converter that takes an ACIP file as input and outputs TMW+Latin.
This commit is contained in:
parent
d4732938a3
commit
2b59d9838d
2 changed files with 458 additions and 0 deletions
125
source/org/thdl/tib/text/ttt/ACIPString.java
Normal file
125
source/org/thdl/tib/text/ttt/ACIPString.java
Normal file
|
@ -0,0 +1,125 @@
|
|||
/*
|
||||
The contents of this file are subject to the THDL Open Community License
|
||||
Version 1.0 (the "License"); you may not use this file except in compliance
|
||||
with the License. You may obtain a copy of the License on the THDL web site
|
||||
(http://www.thdl.org/).
|
||||
|
||||
Software distributed under the License is distributed on an "AS IS" basis,
|
||||
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
|
||||
License for the specific terms governing rights and limitations under the
|
||||
License.
|
||||
|
||||
The Initial Developer of this software is the Tibetan and Himalayan Digital
|
||||
Library (THDL). Portions created by the THDL are Copyright 2003 THDL.
|
||||
All Rights Reserved.
|
||||
|
||||
Contributor(s): ______________________________________.
|
||||
*/
|
||||
|
||||
package org.thdl.tib.text.ttt;
|
||||
|
||||
/**
|
||||
* An ACIPString is some Latin text and a type, the type stating
|
||||
* whether said text is Latin (usually English) or transliteration of
|
||||
* Tibetan and which particular kind. Scanning errors are also encoded
|
||||
* as ACIPStrings using a special type.
|
||||
*
|
||||
* @author David Chandler
|
||||
*/
|
||||
public class ACIPString {
|
||||
private int type;
|
||||
private String text;
|
||||
|
||||
/** For [#COMMENTS] */
|
||||
public static final int COMMENT = 0;
|
||||
/** For Folio markers like @012B */
|
||||
public static final int FOLIO_MARKER = 1;
|
||||
/** For Tibetan letters and numbers etc. */
|
||||
public static final int TIBETAN_NON_PUNCTUATION = 2;
|
||||
/** For tshegs, whitespace and the like, but not combining
|
||||
* punctutation like %, o, :, m, and x */
|
||||
public static final int TIBETAN_PUNCTUATION = 3;
|
||||
/** For the start of a [*probable correction] or [*possible correction?] */
|
||||
public static final int CORRECTION_START = 5;
|
||||
/** Denotes the end of a [*probable correction] */
|
||||
public static final int PROBABLE_CORRECTION = 6;
|
||||
/** Denotes the end of a [*possible correction?] */
|
||||
public static final int POSSIBLE_CORRECTION = 7;
|
||||
/** For [BP] -- blank page */
|
||||
public static final int BP = 8;
|
||||
/** For [LS] -- Lanycha script on page */
|
||||
public static final int LS = 9;
|
||||
/** For [DR] -- picture (without caption) on page */
|
||||
public static final int DR = 10;
|
||||
/** For [DD], [DDD], [DD1], [DD2], etc. -- picture with caption on page */
|
||||
public static final int DD = 11;
|
||||
/** For [?] */
|
||||
public static final int QUESTION = 12;
|
||||
/** For the first / in /NYA/ */
|
||||
public static final int START_SLASH = 13;
|
||||
/** For the last / in /NYA/ */
|
||||
public static final int END_SLASH = 14;
|
||||
/** For the opening ( in (NYA) */
|
||||
public static final int START_PAREN = 15;
|
||||
/** For the closing ) in (NYA) */
|
||||
public static final int END_PAREN = 16;
|
||||
/** For things that are not legal syntax, such as a file that
|
||||
* contains just "[# HALF A COMMEN" */
|
||||
public static final int ERROR = 17; /* DLC let the user know. */
|
||||
|
||||
/** Returns true if and only if this string is Latin (usually
|
||||
* English). Returns false if this string is transliteration of
|
||||
* Tibetan. */
|
||||
public int getType() {
|
||||
return type;
|
||||
}
|
||||
|
||||
/** Returns the non-null, non-empty String of text associated with
|
||||
* this string. */
|
||||
public String getText() {
|
||||
return text;
|
||||
}
|
||||
|
||||
private void setType(int t) {
|
||||
if (t < COMMENT || t > ERROR)
|
||||
throw new IllegalArgumentException("Bad type");
|
||||
type = t;
|
||||
}
|
||||
|
||||
private void setText(String t) {
|
||||
if (t == null || "".equals(t))
|
||||
throw new IllegalArgumentException("null or empty text, DD should have text [DD] e.g.");
|
||||
text = t;
|
||||
}
|
||||
|
||||
/** Don't instantiate me. */
|
||||
private ACIPString() { }
|
||||
|
||||
/** Creates a new ACIPString with source text <i>text</i> and type
|
||||
* <i>type</i> being a characterization like {@link DD}. */
|
||||
public ACIPString(String text, int type) {
|
||||
setType(type);
|
||||
setText(text);
|
||||
}
|
||||
public String toString() {
|
||||
String typeString = "HUH?????";
|
||||
if (type == COMMENT) typeString = "COMMENT";
|
||||
if (type == FOLIO_MARKER) typeString = "FOLIO_MARKER";
|
||||
if (type == TIBETAN_NON_PUNCTUATION) typeString = "TIBETAN_NON_PUNCTUATION";
|
||||
if (type == TIBETAN_PUNCTUATION) typeString = "TIBETAN_PUNCTUATION";
|
||||
if (type == CORRECTION_START) typeString = "CORRECTION_START";
|
||||
if (type == PROBABLE_CORRECTION) typeString = "PROBABLE_CORRECTION";
|
||||
if (type == POSSIBLE_CORRECTION) typeString = "POSSIBLE_CORRECTION";
|
||||
if (type == BP) typeString = "BP";
|
||||
if (type == LS) typeString = "LS";
|
||||
if (type == DR) typeString = "DR";
|
||||
if (type == DD) typeString = "DD";
|
||||
if (type == QUESTION) typeString = "QUESTION";
|
||||
if (type == START_SLASH) typeString = "START_SLASH";
|
||||
if (type == END_SLASH) typeString = "END_SLASH";
|
||||
if (type == START_PAREN) typeString = "START_PAREN";
|
||||
if (type == END_PAREN) typeString = "END_PAREN";
|
||||
if (type == ERROR) typeString = "ERROR";
|
||||
return typeString + ":\"" + getText() + "\"";
|
||||
}
|
||||
}
|
333
source/org/thdl/tib/text/ttt/ACIPTshegBarScanner.java
Normal file
333
source/org/thdl/tib/text/ttt/ACIPTshegBarScanner.java
Normal file
|
@ -0,0 +1,333 @@
|
|||
/*
|
||||
The contents of this file are subject to the THDL Open Community License
|
||||
Version 1.0 (the "License"); you may not use this file except in compliance
|
||||
with the License. You may obtain a copy of the License on the THDL web site
|
||||
(http://www.thdl.org/).
|
||||
|
||||
Software distributed under the License is distributed on an "AS IS" basis,
|
||||
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
|
||||
License for the specific terms governing rights and limitations under the
|
||||
License.
|
||||
|
||||
The Initial Developer of this software is the Tibetan and Himalayan Digital
|
||||
Library (THDL). Portions created by the THDL are Copyright 2003 THDL.
|
||||
All Rights Reserved.
|
||||
|
||||
Contributor(s): ______________________________________.
|
||||
*/
|
||||
|
||||
package org.thdl.tib.text.ttt;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Stack;
|
||||
|
||||
import org.thdl.util.ThdlDebug;
|
||||
|
||||
/**
|
||||
* This class is able to break up Strings of ACIP text (for example, an
|
||||
* entire sutra file) into tsheg bars, comments, etc. Folio markers,
|
||||
* comments, and the like are segregated (so that consumers can ensure
|
||||
* that they remain in Latin), and Tibetan passages are broken up into
|
||||
* tsheg bars.
|
||||
* @author David Chandler
|
||||
*/
|
||||
public class ACIPTshegBarScanner {
|
||||
/** Returns a list of {@link ACIPString ACIPStrings} corresponding
|
||||
* to s, possibly the empty list (when the empty string is the
|
||||
* input). Each String is either a Latin comment, some Latin
|
||||
* text, a tsheg bar (minus the tsheg or shad or whatever), a
|
||||
* String of inter-tsheg-bar punctuation, etc.
|
||||
*
|
||||
* <p>This not only scans; it finds all the errors a parser would
|
||||
* too, like "NYA x" and "(" and ")" and "/NYA" etc. It puts
|
||||
* those in as ACIPStrings with type {@link ACIPString#ERROR}.
|
||||
*/
|
||||
public static ArrayList scan(String s) {
|
||||
|
||||
// the size depends on whether it's mostly Tibetan or mostly
|
||||
// Latin and a number of other factors. This is meant to be
|
||||
// an underestimate, but not too much of an underestimate.
|
||||
ArrayList al = new ArrayList(s.length() / 10);
|
||||
|
||||
int sl = s.length();
|
||||
int currentType = ACIPString.ERROR;
|
||||
int startOfString = 0;
|
||||
Stack bracketTypeStack = new Stack();
|
||||
int startSlashIndex = -1;
|
||||
int startParenIndex = -1;
|
||||
for (int i = 0; i < sl; i++) {
|
||||
if (i < startOfString) throw new Error("bad reset");
|
||||
char ch;
|
||||
ch = s.charAt(i);
|
||||
if (ACIPString.COMMENT == currentType && ch != ']')
|
||||
continue;
|
||||
switch (ch) {
|
||||
case ']':
|
||||
if (bracketTypeStack.empty()) {
|
||||
// Error.
|
||||
if (startOfString < i) {
|
||||
al.add(new ACIPString(s.substring(startOfString, i),
|
||||
currentType));
|
||||
}
|
||||
al.add(new ACIPString(s.substring(i, i+1), ACIPString.ERROR));
|
||||
startOfString = i+1;
|
||||
currentType = ACIPString.ERROR;
|
||||
} else {
|
||||
int stackTop = ((Integer)bracketTypeStack.pop()).intValue();
|
||||
|
||||
String text = s.substring(startOfString, i+1);
|
||||
if (ACIPString.CORRECTION_START == stackTop) {
|
||||
char prevCh = s.charAt(i-1);
|
||||
if ('?' != prevCh) {
|
||||
currentType = ACIPString.PROBABLE_CORRECTION;
|
||||
} else {
|
||||
currentType = ACIPString.POSSIBLE_CORRECTION;
|
||||
}
|
||||
}
|
||||
al.add(new ACIPString(text, currentType));
|
||||
startOfString = i+1;
|
||||
currentType = ACIPString.ERROR;
|
||||
}
|
||||
break;
|
||||
|
||||
case '[':
|
||||
// This definitely indicates a new token.
|
||||
if (startOfString < i) {
|
||||
al.add(new ACIPString(s.substring(startOfString, i),
|
||||
currentType));
|
||||
startOfString = i;
|
||||
currentType = ACIPString.ERROR;
|
||||
}
|
||||
String thingy = null;
|
||||
|
||||
if (i + "[DD]".length() <= sl
|
||||
&& s.substring(i, i + "[DD]".length()).equals("[DD]")) {
|
||||
thingy = "[DD]";
|
||||
currentType = ACIPString.DD;
|
||||
} else if (i + "[DD1]".length() <= sl
|
||||
&& s.substring(i, i + "[DD1]".length()).equals("[DD1]")) {
|
||||
thingy = "[DD1]";
|
||||
currentType = ACIPString.DD;
|
||||
} else if (i + "[DD2]".length() <= sl
|
||||
&& s.substring(i, i + "[DD2]".length()).equals("[DD2]")) {
|
||||
thingy = "[DD2]";
|
||||
currentType = ACIPString.DD;
|
||||
} else if (i + "[DDD]".length() <= sl
|
||||
&& s.substring(i, i + "[DDD]".length()).equals("[DDD]")) {
|
||||
thingy = "[DDD]";
|
||||
currentType = ACIPString.DD;
|
||||
} else if (i + "[DR]".length() <= sl
|
||||
&& s.substring(i, i + "[DR]".length()).equals("[DR]")) {
|
||||
thingy = "[DR]";
|
||||
currentType = ACIPString.DR;
|
||||
} else if (i + "[LS]".length() <= sl
|
||||
&& s.substring(i, i + "[LS]".length()).equals("[LS]")) {
|
||||
thingy = "[LS]";
|
||||
currentType = ACIPString.LS;
|
||||
} else if (i + "[BP]".length() <= sl
|
||||
&& s.substring(i, i + "[BP]".length()).equals("[BP]")) {
|
||||
thingy = "[BP]";
|
||||
currentType = ACIPString.BP;
|
||||
} else if (i + "[?]".length() <= sl
|
||||
&& s.substring(i, i + "[?]".length()).equals("[?]")) {
|
||||
thingy = "[?]";
|
||||
currentType = ACIPString.QUESTION;
|
||||
}
|
||||
if (null != thingy) {
|
||||
al.add(new ACIPString(thingy,
|
||||
currentType));
|
||||
startOfString = i + thingy.length();
|
||||
i = startOfString - 1;
|
||||
} else {
|
||||
if (i + 1 < sl) {
|
||||
char nextCh = s.charAt(i+1);
|
||||
if ('*' == nextCh) {
|
||||
currentType = ACIPString.CORRECTION_START;
|
||||
bracketTypeStack.push(new Integer(currentType));
|
||||
break;
|
||||
} else if ('#' == nextCh) {
|
||||
currentType = ACIPString.COMMENT;
|
||||
bracketTypeStack.push(new Integer(currentType));
|
||||
break;
|
||||
}
|
||||
}
|
||||
// This is an error. DLC FIXME: in practice
|
||||
// [COMMENTS APPEAR WITHOUT # MARKS]. Though
|
||||
// "... [" could cause this too.
|
||||
al.add(new ACIPString(s.substring(i, i+1),
|
||||
ACIPString.ERROR));
|
||||
startOfString = i + 1;
|
||||
currentType = ACIPString.ERROR;
|
||||
}
|
||||
break; // end '[' case
|
||||
|
||||
case '@':
|
||||
// This definitely indicates a new token.
|
||||
if (startOfString < i) {
|
||||
al.add(new ACIPString(s.substring(startOfString, i),
|
||||
currentType));
|
||||
startOfString = i;
|
||||
currentType = ACIPString.ERROR;
|
||||
}
|
||||
|
||||
// We look for @N[AB], @NN[AB], @NNN[AB], @NNNN[AB],
|
||||
// @NNNNN[AB], and @NNNNNN[AB] only, that is from one
|
||||
// to six digits.
|
||||
for (int numdigits = 1; numdigits <= 5; numdigits++) {
|
||||
if (i+numdigits+1 < sl
|
||||
&& (s.charAt(i+numdigits+1) == 'A' || s.charAt(i+numdigits+1) == 'B')) {
|
||||
boolean allAreNumeric = true;
|
||||
for (int k = 1; k <= numdigits; k++) {
|
||||
if (!isNumeric(s.charAt(i+k))) {
|
||||
allAreNumeric = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (allAreNumeric) {
|
||||
al.add(new ACIPString(s.substring(i, i+numdigits+2), ACIPString.FOLIO_MARKER));
|
||||
startOfString = i+numdigits+2;
|
||||
currentType = ACIPString.ERROR;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (startOfString == i) {
|
||||
al.add(new ACIPString(s.substring(i, i+1), ACIPString.ERROR));
|
||||
startOfString = i+1;
|
||||
currentType = ACIPString.ERROR;
|
||||
}
|
||||
break; // end '@' case
|
||||
|
||||
case '/':
|
||||
// This definitely indicates a new token.
|
||||
if (startOfString < i) {
|
||||
al.add(new ACIPString(s.substring(startOfString, i),
|
||||
currentType));
|
||||
startOfString = i;
|
||||
currentType = ACIPString.ERROR;
|
||||
}
|
||||
|
||||
if (startSlashIndex >= 0) {
|
||||
al.add(new ACIPString(s.substring(i, i+1), ACIPString.END_SLASH));
|
||||
startOfString = i+1;
|
||||
currentType = ACIPString.ERROR;
|
||||
startSlashIndex = -1;
|
||||
} else {
|
||||
startSlashIndex = i;
|
||||
al.add(new ACIPString(s.substring(i, i+1), ACIPString.START_SLASH));
|
||||
startOfString = i+1;
|
||||
currentType = ACIPString.ERROR;
|
||||
}
|
||||
break; // end '/' case
|
||||
|
||||
case '(':
|
||||
case ')':
|
||||
// This definitely indicates a new token.
|
||||
if (startOfString < i) {
|
||||
al.add(new ACIPString(s.substring(startOfString, i),
|
||||
currentType));
|
||||
startOfString = i;
|
||||
currentType = ACIPString.ERROR;
|
||||
}
|
||||
|
||||
// DLC support nesting like (NYA (BA))?
|
||||
|
||||
if (startParenIndex >= 0) {
|
||||
if (ch == '(')
|
||||
al.add(new ACIPString("Nesting of parentheses () is not allowed", ACIPString.ERROR));
|
||||
else {
|
||||
al.add(new ACIPString(s.substring(i, i+1), ACIPString.END_PAREN));
|
||||
startParenIndex = -1;
|
||||
}
|
||||
startOfString = i+1;
|
||||
currentType = ACIPString.ERROR;
|
||||
} else {
|
||||
if (ch == ')')
|
||||
al.add(new ACIPString("Unexpected closing parenthesis )", ACIPString.ERROR));
|
||||
else {
|
||||
startParenIndex = i;
|
||||
al.add(new ACIPString(s.substring(i, i+1), ACIPString.START_PAREN));
|
||||
}
|
||||
startOfString = i+1;
|
||||
currentType = ACIPString.ERROR;
|
||||
}
|
||||
break; // end '/' case
|
||||
|
||||
|
||||
// Classic tsheg bar enders:
|
||||
case ' ':
|
||||
case '\t':
|
||||
case '\r':
|
||||
case '\n':
|
||||
case ',':
|
||||
case '*':
|
||||
case ';':
|
||||
case '`':
|
||||
case '#':
|
||||
// The tsheg bar ends here; new token.
|
||||
if (startOfString < i) {
|
||||
al.add(new ACIPString(s.substring(startOfString, i),
|
||||
currentType));
|
||||
}
|
||||
al.add(new ACIPString(s.substring(i, i+1),
|
||||
ACIPString.TIBETAN_PUNCTUATION));
|
||||
startOfString = i+1;
|
||||
currentType = ACIPString.ERROR;
|
||||
break; // end TIBETAN_PUNCTUATION case
|
||||
|
||||
default:
|
||||
if (!(isNumeric(ch) || isAlpha(ch))) {
|
||||
if (startOfString < i) {
|
||||
al.add(new ACIPString(s.substring(startOfString, i),
|
||||
currentType));
|
||||
}
|
||||
al.add(new ACIPString(s.substring(i, i+1),
|
||||
ACIPString.ERROR));
|
||||
startOfString = i+1;
|
||||
currentType = ACIPString.ERROR;
|
||||
} else {
|
||||
// Continue through the loop.
|
||||
if (ACIPString.ERROR == currentType)
|
||||
currentType = ACIPString.TIBETAN_NON_PUNCTUATION;
|
||||
}
|
||||
break; // end default case
|
||||
}
|
||||
}
|
||||
if (startOfString < sl) {
|
||||
al.add(new ACIPString(s.substring(startOfString, sl),
|
||||
currentType));
|
||||
if (!bracketTypeStack.empty()) {
|
||||
al.add(new ACIPString("UNEXPECTED END OF INPUT",
|
||||
ACIPString.ERROR));
|
||||
}
|
||||
if (startSlashIndex >= 0) {
|
||||
al.add(new ACIPString("Slashes are supposed to occur in pairs, but the input had an unmatched '/' character.",
|
||||
ACIPString.ERROR));
|
||||
}
|
||||
if (startParenIndex >= 0) {
|
||||
al.add(new ACIPString("Parentheses are supposed to occur in pairs, but the input had an unmatched parenthesis.",
|
||||
ACIPString.ERROR));
|
||||
}
|
||||
}
|
||||
return al;
|
||||
}
|
||||
|
||||
/** See implementation. */
|
||||
private static boolean isNumeric(char ch) {
|
||||
return ch >= '0' && ch <= '9';
|
||||
}
|
||||
|
||||
/** See implementation. */
|
||||
private static boolean isAlpha(char ch) {
|
||||
return ch == '\''
|
||||
|
||||
// combining punctuation:
|
||||
|| ch == '%'
|
||||
|| ch == 'o'
|
||||
|| ch == 'x'
|
||||
|
||||
|| (ch >= 'A' && ch <= 'Z')
|
||||
|| (ch >= 'a' && ch <= 'z');
|
||||
}
|
||||
}
|
Loading…
Reference in a new issue