A reverter that converts Unicode to computer-friendly (but not yet human-friendly) EWTS is here in alpha mode. It probably doesn't deal well with non-Tibetan text.
2005-08-01 05:54:20 +00:00 · 2005-08-01 05:54:20 +00:00 · 5788416629
commit 5788416629
parent 00afd75362
13 changed files with 496 additions and 47 deletions
--- a/source/org/thdl/tib/text/reverter/Converter.java
+++ b/source/org/thdl/tib/text/reverter/Converter.java
@ -18,6 +18,16 @@ Contributor(s): ______________________________________.

 package org.thdl.tib.text.reverter;

+import java.text.BreakIterator;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Locale;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.thdl.tib.text.tshegbar.UnicodeUtils;
+
 /** Static methods for converting Unicode to EWTS and
 *  (TODO(dchandler): ACIP).
 *  @author David Chandler
@ -28,11 +38,110 @@ public class Converter {
        throw new Error("There's no point in instantiating this class.");
    }

-    /** Converts Tibetan Unicode to EWTS transliteration.  If errors
-     *  is non-null, error messages are appended to it.  (Errors are
-     *  always inline.) */
-    public static String convertToEwts(String unicode,
-                                       StringBuffer errors /* DLC: use it */) {
-        throw new Error("DLC not yet");
+    /** Finds combining character sequences.  This is a character
+     *  BreakIterator for the "bo" locale; SplitGC points it at a new
+     *  string with setText on every call.
+     *  NOTE(review): this single instance is shared, mutable static
+     *  state, so concurrent callers would presumably interfere with
+     *  each other -- confirm whether that matters for this class. */
+    private static BreakIterator breaker
+    = BreakIterator.getCharacterInstance(new Locale("bo"));
+
+
+    private static final boolean debug = false; // when true, SplitGC traces its work to System.out
+
+    // TODO(dchandler): use this to create LegalTshegBar objects, it's
+    // unused right now.
+    private static Pattern mightBeLegalTshegBarRegex = Pattern.compile(
+            "^"
+            + "([\u0f42\u0f51\u0f56\u0f58\u0f60])?" // optional single leading letter (presumably the prefix -- confirm)
+            // root stack: consonant w/ optional wowels:
+            + "(" + GC.consonantStackRegexString + ")"
+            + "(([\u0f42\u0f51\u0f56\u0f58\u0f60\u0f44\u0f53\u0f62\u0f63\u0f66][\u0f51\u0f66]?)" // optional tail: one letter, then optionally U+0F51 or U+0F66 ...
+            +  "|(\u0f60[\u0f72\u0f74\u0f7c\u0f44\u0f58])+)?" // ... or one or more two-character units that each start with U+0F60
+            + "$");
+
+    /** Splits nfthdl into grapheme clusters.  Let's define a grapheme
+     *  cluster as something an end user would say cannot be
+     *  decomposed into two separate pieces sensibly.  For the most
+     *  part this is just figuring out the <em>combining character
+     *  sequences</em> as defined by Unicode, but (U+0F04 U+0F05*) is
+     *  an example of a grapheme cluster that is not a combining
+     *  character sequence (TODO(dchandler): (0f04 0f05*), is it
+     *  really worth it?  We don't handle it right now, might be good
+     *  for Unicode->ACIP anyway.)
+     *
+     *  <p>How it works: the shared breaker is pointed at nfthdl and
+     *  each segment between consecutive boundaries normally becomes
+     *  its own GC.  A segment is instead appended to the previous GC
+     *  (which is replaced in the list) in two cases: (1) the segment
+     *  starts with U+0F7F and at least one GC has already been
+     *  collected; (2) the previous segment ended with U+0F7F and this
+     *  segment starts with a character whose type is
+     *  NON_SPACING_MARK.  A U+0F7F at the very start of the input
+     *  has nothing to attach to and becomes a GC of its own.
+     *
+     *  <p>When debug is true, the input and every merge are printed
+     *  to System.out.
+     *  @param nfthdl Unicode in NFTHDL decomposition form
+     *  @return List of GC objects */
+    private static List/*<GC>*/ SplitGC(String nfthdl) {
+        
+        if (debug) {
+            System.out.println("debug: "
+                               + UnicodeUtils.unicodeStringToPrettyString(nfthdl));
+        }
+        ArrayList al = new ArrayList();
+        breaker.setText(nfthdl);
+        int start = breaker.first();
+        boolean just_saw_0f7f = false; // true when the previous segment ended with U+0F7F
+        for (int end = breaker.next();
+             end != BreakIterator.DONE;
+             start = end, end = breaker.next()) {
+            if ((just_saw_0f7f
+                 && (Character.getType(nfthdl.charAt(start))
+                     == Character.NON_SPACING_MARK))
+                || (end > start && '\u0f7f' == nfthdl.charAt(start)
+                    && !al.isEmpty())) {
+                // U+0F7F is a COMBINING_SPACING_MARK, not a
+                // NON_SPACING_MARK, but we want to treat it like a
+                // NON_SPACING_MARK.
+                GC gc = new GC(((GC)al.get(al.size() - 1)).getNfthdl()
+                               + nfthdl.substring(start,end)); // previous cluster's text plus this segment
+                if (debug) {
+                    System.out.println("debug: setting last el, "
+                                       + al.get(al.size() - 1) + " to " + gc);
+                }
+                al.set(al.size() - 1, gc); // the merged cluster replaces the previous one
+            } else {
+                al.add(new GC(nfthdl.substring(start,end)));
+            }
+            just_saw_0f7f
+                = (end > start && '\u0f7f' == nfthdl.charAt(end - 1)); // remembered for the next segment's merge test
+        }
+        return al;
+    }
+
+    /** Converts Tibetan Unicode to computer-friendly EWTS
+     *  transliteration.  Computer-friendly is not human-friendly but
+     *  hopefully even poorly written EWTS->Tibetan converters could
+     *  handle the output.  If errors is non-null, error messages are
+     *  appended to it.  (Errors are always inline.)
+     *
+     *  <p>The input is first normalized with
+     *  UnicodeUtils.toMostlyDecomposedUnicode (NORM_NFTHDL) and then
+     *  split into grapheme clusters by SplitGC.  Each cluster's
+     *  getEwtsForComputers() result is appended to the output in
+     *  order; a cluster for which that call returns null contributes
+     *  an "[#ERROR 301: ...]" message in its place.
+     *  @param unicode the text to convert
+     *  @param errors if non-null, receives a copy of each error
+     *  message, each followed by a newline; may be null
+     *  @return the transliteration, with any error messages embedded
+     *  inline where the offending cluster occurred */
+    public static String convertToEwtsForComputers(String unicode,
+                                                   StringBuffer errors) {
+
+        // First, normalize as much as we can to reduce the number of
+        // cases we must handle.
+        String decomposed
+            = UnicodeUtils.toMostlyDecomposedUnicode(unicode,
+                                                     UnicodeUtils.NORM_NFTHDL);
+
+        // TODO(dchandler): optionally warn if we see
+        // "\u0f40\u0f74\u0f71" which is in the wrong order.
+
+        List gcs = SplitGC(decomposed);
+
+        StringBuffer sb = new StringBuffer();
+        for (Iterator it = gcs.iterator(); it.hasNext(); ) {
+            GC gc = (GC)it.next();
+            StringBuffer ewts = gc.getEwtsForComputers();
+            if (null == ewts) { // null is treated as "no transliteration for this cluster"
+                // TODO(dchandler): use ErrorsAndWarnings?
+                ewts = new StringBuffer("[#ERROR 301: The Unicode '"
+                                        + gc.getNfthdl()
+                                        + "' (has no EWTS transliteration]"); // NOTE(review): the '(' is never closed -- looks like a typo, left alone because it is runtime text
+                if (null != errors) {
+                    errors.append(ewts);
+                    errors.append('\n');
+                }
+            }
+            sb.append(ewts); // the error text, when present, goes inline in the output too
+        }
+        return sb.toString();
     }
 }
+
+// TODO(dchandler): give a mode where an error is given if non-Tibetan
+// or at least non-EWTS (think U+534D, e.g.) is found