I really hesitate to commit this because I'm not sure what it brings to the
table exactly and I fear that it makes the ACIP->Tibetan converter code a lot uglier. The TODO(DLC)[EWTS->Tibetan] comments littered throughout are part of the ugliness; they point to the ugliness. If each were addressed, cleanliness could perhaps be achieved. I've largely forgotten exactly what this change does, but it attempts to improve EWTS->Tibetan conversion. The lexer is probably really, really primitive. I concentrate here on converting a single tsheg bar rather than a whole document. Eclipse was used during part of my journey here and some imports were reorganized merely because I could. :) (Eclipse was needed when the usual ant build failed to run a new test EWTSTest. And I wanted its debugger.) Next steps: end-to-end EWTS tests should bring many problems to light. Fix those. Triage all the TODO comments. I don't know that I'll ever really trust the implementation. The tests are valuable, though. A clean implementation of EWTS->Tibetan in Jython might hold enough interest for me; I'd like to learn Python.
This commit is contained in:
parent
f64bae8ea6
commit
7198f23361
45 changed files with 1666 additions and 695 deletions
|
@ -18,6 +18,7 @@ Contributor(s): ______________________________________.
|
|||
|
||||
package org.thdl.tib.text.ttt;
|
||||
|
||||
import java.math.BigInteger;
|
||||
import java.util.ArrayList;
|
||||
|
||||
/**
|
||||
|
@ -31,16 +32,130 @@ import java.util.ArrayList;
|
|||
*
|
||||
* @author David Chandler */
|
||||
class EWTSTshegBarScanner extends TTshegBarScanner {
|
||||
|
||||
/** Returns true iff ch can appear within an EWTS tsheg bar. */
|
||||
protected static boolean isValidInsideTshegBar(char ch) {
|
||||
// '\\' is absent, but should it be? TODO(DLC)[EWTS->Tibetan]
|
||||
return ((ch >= '0' && ch <= '9')
|
||||
|| (ch >= '\u0f71' && ch <= '\u0f84')
|
||||
|| EWTSTraits.instance().isUnicodeConsonant(ch)
|
||||
|| EWTSTraits.instance().isUnicodeWowel(ch)
|
||||
|| (ch >= '\u0f20' && ch <= '\u0f33')
|
||||
|| "khgncjytdpbmtstdzwzz'rlafvTDNSWYReuioIAUMHX?^\u0f39\u0f35\u0f37.+~'`-\u0f19\u0f18\u0f3f\u0f3e\u0f86\u0f87\u0f88".indexOf(ch) >= 0);
|
||||
}
|
||||
|
||||
/** See the comment in TTshegBarScanner. This does not find
|
||||
errors and warnings that you'd think of a parser finding (DLC
|
||||
errors and warnings that you'd think of a parser finding (TODO(DLC)[EWTS->Tibetan]:
|
||||
DOES IT?). */
|
||||
public ArrayList scan(String s, StringBuffer errors, int maxErrors,
|
||||
public ArrayList scan(String s, StringBuffer errors, int maxErrors, // TODO(DLC)[EWTS->Tibetan]: ignored
|
||||
boolean shortMessages, String warningLevel) {
|
||||
// the size depends on whether it's mostly Tibetan or mostly
|
||||
// Latin and a number of other factors. This is meant to be
|
||||
// an underestimate, but not too much of an underestimate.
|
||||
ArrayList al = new ArrayList(s.length() / 10);
|
||||
throw new Error("DLC unimplemented");
|
||||
|
||||
// TODO(DLC)[EWTS->Tibetan]: use jflex, javacc or something similar
|
||||
|
||||
// TODO(DLC)[EWTS->Tibetan]: what about Unicode escapes like \u0f20? When do you do that? Immediately like Java source files? I think so and then we can say that oddballs like \u0f19 are valid within tsheg bars.
|
||||
|
||||
StringBuffer sb = new StringBuffer(s);
|
||||
ExpandEscapeSequences(sb);
|
||||
int sl = sb.length();
|
||||
for (int i = 0; i < sl; i++) {
|
||||
if (isValidInsideTshegBar(sb.charAt(i))) {
|
||||
StringBuffer tbsb = new StringBuffer();
|
||||
for (; i < sl; i++) {
|
||||
if (isValidInsideTshegBar(sb.charAt(i)))
|
||||
tbsb.append(sb.charAt(i));
|
||||
else {
|
||||
--i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
al.add(new TString("EWTS", tbsb.toString(),
|
||||
TString.TIBETAN_NON_PUNCTUATION));
|
||||
} else {
|
||||
if (" /;|!:=_@#$%<>()\r\n\t".indexOf(sb.charAt(i)) >= 0)
|
||||
al.add(new TString("EWTS", sb.substring(i, i+1),
|
||||
TString.TIBETAN_PUNCTUATION));
|
||||
else
|
||||
al.add(new TString("EWTS", "ERROR TODO(DLC)[EWTS->Tibetan]: this character is illegal in EWTS: " + sb.substring(i, i+1),
|
||||
TString.ERROR));
|
||||
}
|
||||
}
|
||||
return al;
|
||||
}
|
||||
|
||||
/** Modifies the EWTS in sb such that Unicode escape sequences are
|
||||
* expanded. */
|
||||
public static void ExpandEscapeSequences(StringBuffer sb) {
|
||||
int sl;
|
||||
for (int i = 0; i < (sl = sb.length()); i++) {
|
||||
if (i + "\\u00000000".length() <= sl) {
|
||||
if (sb.charAt(i) == '\\' && sb.charAt(i + 1) == 'u' || sb.charAt(i + 1) == 'U') {
|
||||
boolean isEscape = true;
|
||||
for (int j = 0; j < "00000000".length(); j++) {
|
||||
char ch = sb.charAt(i + "\\u".length() + j);
|
||||
if (!((ch <= '9' && ch >= '0')
|
||||
|| (ch <= 'F' && ch >= 'A')
|
||||
|| (ch <= 'f' && ch >= 'a'))) {
|
||||
isEscape = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (isEscape) {
|
||||
long x = -1;
|
||||
try {
|
||||
BigInteger bigx = new java.math.BigInteger(sb.substring(i+2, i+10), 16);
|
||||
x = bigx.longValue();
|
||||
if (!(bigx.compareTo(new BigInteger("0", 16)) >= 0
|
||||
&& bigx.compareTo(new BigInteger("FFFFFFFF", 16)) <= 0))
|
||||
x = -1;
|
||||
} catch (NumberFormatException e) {
|
||||
// leave x == -1
|
||||
}
|
||||
if (x >= 0 && x <= 0xFFFF) {
|
||||
sb.replace(i, i + "\\uXXXXyyyy".length(), new String(new char[] { (char)x }));
|
||||
continue;
|
||||
} else if (x >= 0x00000000L
|
||||
&& x <= 0xFFFFFFFFL) {
|
||||
// TODO(DLC)[EWTS->Tibetan]: do nothing? test errors al.add(new TString("EWTS", "Sorry, we don't yet support Unicode escape sequences above 0x0000FFFF! File a bug.",
|
||||
//TString.ERROR));
|
||||
i += "uXXXXYYYY".length();
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (i + "\\u0000".length() <= sl) {
|
||||
if (sb.charAt(i) == '\\' && sb.charAt(i + 1) == 'u' || sb.charAt(i + 1) == 'U') {
|
||||
boolean isEscape = true;
|
||||
for (int j = 0; j < "0000".length(); j++) {
|
||||
char ch = sb.charAt(i + "\\u".length() + j);
|
||||
if (!((ch <= '9' && ch >= '0')
|
||||
|| (ch <= 'F' && ch >= 'A')
|
||||
|| (ch <= 'f' && ch >= 'a'))) {
|
||||
isEscape = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (isEscape) {
|
||||
int x = -1;
|
||||
try {
|
||||
if (!((x = Integer.parseInt(sb.substring(i+2, i+6), 16)) >= 0x0000
|
||||
&& x <= 0xFFFF))
|
||||
x = -1;
|
||||
} catch (NumberFormatException e) {
|
||||
// leave x == -1
|
||||
}
|
||||
if (x >= 0) {
|
||||
sb.replace(i, i + "\\uXXXX".length(), new String(new char[] { (char)x }));
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/** non-public because this is a singleton */
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue