I really hesitate to commit this because I'm not sure what it brings to the table exactly and I fear that it makes the ACIP->Tibetan converter code a lot uglier. The TODO(DLC)[EWTS->Tibetan] comments littered throughout are part of the ugliness; they point to the ugliness. If each were addressed, cleanliness could perhaps be achieved. I've largely forgotten exactly what this change does, but it attempts to improve EWTS->Tibetan conversion. The lexer is probably really, really primitive. I concentrate here on converting a single tsheg bar rather than a whole document. Eclipse was used during part of my journey here and some imports were reorganized merely because I could. :) (Eclipse was needed when the usual ant build failed to run a new test EWTSTest. And I wanted its debugger.) Next steps: end-to-end EWTS tests should bring many problems to light. Fix those. Triage all the TODO comments. I don't know that I'll ever really trust the implementation. The tests are valuable, though. A clean implementation of EWTS->Tibetan in Jython might hold enough interest for me; I'd like to learn Python.
2005-06-20 06:18:00 +00:00 · 2005-06-20 06:18:00 +00:00 · 7198f23361
commit 7198f23361
parent f64bae8ea6
45 changed files with 1666 additions and 695 deletions
--- a/source/org/thdl/tib/text/ttt/EWTSTshegBarScanner.java
+++ b/source/org/thdl/tib/text/ttt/EWTSTshegBarScanner.java
@@ -18,6 +18,7 @@ Contributor(s): ______________________________________.

 package org.thdl.tib.text.ttt;

+import java.math.BigInteger;
 import java.util.ArrayList;

 /**
@@ -31,16 +32,130 @@ import java.util.ArrayList;
 *
 * @author David Chandler */
 class EWTSTshegBarScanner extends TTshegBarScanner {
+
+    /** Decides whether ch is one of the characters allowed inside a
+     *  single EWTS tsheg bar: an ASCII digit, a code point in the
+     *  ranges U+0F71..U+0F84 or U+0F20..U+0F33, anything that
+     *  EWTSTraits reports as a Unicode consonant or wowel, or a
+     *  member of a fixed list of transliteration letters and marks.
+     *  @param ch the character to classify
+     *  @return true iff ch can appear within an EWTS tsheg bar */
+    protected static boolean isValidInsideTshegBar(char ch) {
+        // '\\' is absent, but should it be?  TODO(DLC)[EWTS->Tibetan]
+        if (ch >= '0' && ch <= '9')
+            return true;
+        if (ch >= '\u0f71' && ch <= '\u0f84')
+            return true;
+        if (EWTSTraits.instance().isUnicodeConsonant(ch))
+            return true;
+        if (EWTSTraits.instance().isUnicodeWowel(ch))
+            return true;
+        if (ch >= '\u0f20' && ch <= '\u0f33')
+            return true;
+        // Everything else that is legal is enumerated explicitly.
+        final String otherLegalChars
+            = "khgncjytdpbmtstdzwzz'rlafvTDNSWYReuioIAUMHX?^\u0f39\u0f35\u0f37.+~'`-\u0f19\u0f18\u0f3f\u0f3e\u0f86\u0f87\u0f88";
+        return otherLegalChars.indexOf(ch) >= 0;
+    }
+
    /** See the comment in TTshegBarScanner.  This does not find
-        errors and warnings that you'd think of a parser finding (DLC
+        errors and warnings that you'd think of a parser finding (TODO(DLC)[EWTS->Tibetan]:
        DOES IT?). */
-    public ArrayList scan(String s, StringBuffer errors, int maxErrors,
+    public ArrayList scan(String s, StringBuffer errors, int maxErrors, // TODO(DLC)[EWTS->Tibetan]: ignored
                          boolean shortMessages, String warningLevel) {
        // the size depends on whether it's mostly Tibetan or mostly
        // Latin and a number of other factors.  This is meant to be
        // an underestimate, but not too much of an underestimate.
        ArrayList al = new ArrayList(s.length() / 10);
-        throw new Error("DLC unimplemented");
+
+        // TODO(DLC)[EWTS->Tibetan]: use jflex, javacc or something similar
+
+        // TODO(DLC)[EWTS->Tibetan]: what about Unicode escapes like \u0f20?  When do you do that?  Immediately like Java source files?  I think so and then we can say that oddballs like \u0f19 are valid within tsheg bars.
+
+        StringBuffer sb = new StringBuffer(s); // mutable copy of s, so escapes can be expanded in place
+        ExpandEscapeSequences(sb); // rewrites sb; everything below scans the expanded text
+        int sl = sb.length(); // length after expansion
+        for (int i = 0; i < sl; i++) {
+        	if (isValidInsideTshegBar(sb.charAt(i))) { // start of a run of tsheg-bar characters
+        		StringBuffer tbsb = new StringBuffer();
+        		for (; i < sl; i++) { // gather the maximal run of valid characters
+        			if (isValidInsideTshegBar(sb.charAt(i)))
+        				tbsb.append(sb.charAt(i));
+        			else {
+        				--i; // back up so the outer loop's i++ lands on the character that ended the run
+        				break;
+        			}
+        		}
+        		al.add(new TString("EWTS", tbsb.toString(),
+        				TString.TIBETAN_NON_PUNCTUATION)); // one TString per run
+        	} else {
+        		if (" /;|!:=_@#$%<>()\r\n\t".indexOf(sb.charAt(i)) >= 0) // listed whitespace and punctuation: one TString per character
+        			al.add(new TString("EWTS", sb.substring(i, i+1),
+        					TString.TIBETAN_PUNCTUATION));
+        		else
+        			al.add(new TString("EWTS", "ERROR TODO(DLC)[EWTS->Tibetan]: this character is illegal in EWTS: " + sb.substring(i, i+1),
+        					TString.ERROR)); // any other character is reported as an ERROR TString
+        	}
+        }
+        return al;
+    }
+    
+    /** Returns true iff sb holds, starting at index start, a
+     *  backslash, then 'u' or 'U', then digitCount ASCII hexadecimal
+     *  digits.  Returns false when sb is too short to hold all of
+     *  that. */
+    private static boolean isEscapeAt(StringBuffer sb, int start, int digitCount) {
+        if (start + 2 + digitCount > sb.length()
+            || sb.charAt(start) != '\\') {
+            return false;
+        }
+        // The backslash is required for both spellings of the marker;
+        // testing it separately avoids the "a && b || c" precedence
+        // trap that would accept any character followed by 'U'.
+        char marker = sb.charAt(start + 1);
+        if (marker != 'u' && marker != 'U') {
+            return false;
+        }
+        for (int j = 0; j < digitCount; j++) {
+            char ch = sb.charAt(start + 2 + j);
+            boolean isHex = (ch >= '0' && ch <= '9')
+                || (ch >= 'A' && ch <= 'F')
+                || (ch >= 'a' && ch <= 'f');
+            if (!isHex) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    /** Expands, in place, the Unicode escape sequences in the EWTS
+     *  text held by sb.
+     *
+     *  <p>An escape is a backslash, then 'u' or 'U', then either
+     *  eight or four hexadecimal digits; the eight-digit form is
+     *  tried first at each position.  An escape whose value is at
+     *  most 0xFFFF is replaced by the single char with that value,
+     *  and the replacement is not itself rescanned.  An eight-digit
+     *  escape with a larger value is left in the text unchanged and
+     *  skipped over.  Text that is not an escape is left alone.
+     *
+     *  @param sb the EWTS text to rewrite; must not be null */
+    public static void ExpandEscapeSequences(StringBuffer sb) {
+        final int longLen = "\\uXXXXyyyy".length();
+        final int shortLen = "\\uXXXX".length();
+        // sb shrinks whenever an escape is replaced, so its length is
+        // re-read on every iteration.
+        for (int i = 0; i < sb.length(); i++) {
+            if (isEscapeAt(sb, i, longLen - 2)) {
+                // Eight validated hex digits always fit in a long.
+                long x = Long.parseLong(sb.substring(i + 2, i + longLen), 16);
+                if (x <= 0xFFFFL) {
+                    sb.replace(i, i + longLen, String.valueOf((char)x));
+                } else {
+                    // TODO(DLC)[EWTS->Tibetan]: escapes above 0x0000FFFF
+                    // are not yet supported and no error is reported
+                    // here; the escape is skipped as-is.  The loop's
+                    // i++ supplies the final step past it.
+                    i += longLen - 1;
+                }
+                continue;
+            }
+            if (isEscapeAt(sb, i, shortLen - 2)) {
+                // Four validated hex digits are always in 0..0xFFFF.
+                int x = Integer.parseInt(sb.substring(i + 2, i + shortLen), 16);
+                sb.replace(i, i + shortLen, String.valueOf((char)x));
+            }
+        }
    }

    /** non-public because this is a singleton */