A reverter that converts Unicode to computer-friendly (but not yet
human-friendly) EWTS is here in alpha mode. It probably doesn't deal well with non-Tibetan text.
2005-08-01 05:54:20 +00:00 · 2005-08-01 05:54:20 +00:00 · 5788416629
commit 5788416629
parent 00afd75362
13 changed files with 496 additions and 47 deletions
--- a/source/org/thdl/tib/text/reverter/Converter.java
+++ b/source/org/thdl/tib/text/reverter/Converter.java
@ -18,6 +18,16 @@ Contributor(s): ______________________________________.

 package org.thdl.tib.text.reverter;

+import java.text.BreakIterator;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Locale;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.thdl.tib.text.tshegbar.UnicodeUtils;
+
 /** Static methods for converting Unicode to EWTS and
 *  (TODO(dchandler): ACIP).
 *  @author David Chandler
@ -28,11 +38,110 @@ public class Converter {
        throw new Error("There's no point in instantiating this class.");
    }

-    /** Converts Tibetan Unicode to EWTS transliteration.  If errors
-     *  is non-null, error messages are appended to it.  (Errors are
-     *  always inline.) */
-    public static String convertToEwts(String unicode,
-                                       StringBuffer errors /* DLC: use it */) {
-        throw new Error("DLC not yet");
+    /** Finds combining character sequences.  This is a character
+     *  BreakIterator obtained for the "bo" locale.  It is a single
+     *  static instance that SplitGC re-targets with setText() on
+     *  every call. */
+    private static BreakIterator breaker
+    = BreakIterator.getCharacterInstance(new Locale("bo"));
+
+
+    private static final boolean debug = false;  // true makes SplitGC print trace lines to System.out
+
+    // TODO(dchandler): use this to create LegalTshegBar objects, it's
+    // unused right now.
+    private static Pattern mightBeLegalTshegBarRegex = Pattern.compile(
+            "^"
+            + "([\u0f42\u0f51\u0f56\u0f58\u0f60])?"  // optional single leading letter (presumably the prefix -- TODO confirm)
+            // root stack: consonant w/ optional wowels:
+            + "(" + GC.consonantStackRegexString + ")"
+            + "(([\u0f42\u0f51\u0f56\u0f58\u0f60\u0f44\u0f53\u0f62\u0f63\u0f66][\u0f51\u0f66]?)"  // one trailing letter plus an optional second (presumably suffix and postsuffix -- TODO confirm)
+            +  "|(\u0f60[\u0f72\u0f74\u0f7c\u0f44\u0f58])+)?"  // or one or more two-character groups that each start with U+0F60
+            + "$");
+
+    /** Breaks NFTHDL-normalized text into grapheme clusters, i.e.
+     *  pieces that an end user would not sensibly split any further.
+     *  Boundaries come from the shared character BreakIterator, with
+     *  one adjustment: a piece that starts with U+0F7F, and a
+     *  NON_SPACING_MARK that directly follows a piece ending in
+     *  U+0F7F, is glued onto the cluster before it instead of
+     *  starting a new one.  (TODO(dchandler): (U+0F04 U+0F05*) is a
+     *  grapheme cluster that is not a combining character sequence;
+     *  it is not merged here.  Is it really worth it?  Might be good
+     *  for Unicode->ACIP anyway.)
+     *  @param nfthdl Unicode in NFTHDL decomposition form
+     *  @return List of GC objects, in text order */
+    private static List/*<GC>*/ SplitGC(String nfthdl) {
+        if (debug) {
+            System.out.println("debug: "
+                               + UnicodeUtils.unicodeStringToPrettyString(nfthdl));
+        }
+        ArrayList clusters = new ArrayList();
+        breaker.setText(nfthdl);
+        // True when the piece handled on the previous pass ended in
+        // U+0F7F.
+        boolean prevEndedIn0f7f = false;
+        int from = breaker.first();
+        int to = breaker.next();
+        while (BreakIterator.DONE != to) {
+            String piece = nfthdl.substring(from, to);
+            // U+0F7F is a COMBINING_SPACING_MARK, not a
+            // NON_SPACING_MARK, but we want to treat it like a
+            // NON_SPACING_MARK, so it (and a mark right after it)
+            // joins the cluster that is currently last in the list.
+            boolean markAfter0f7f
+                = prevEndedIn0f7f
+                && (Character.NON_SPACING_MARK
+                    == Character.getType(nfthdl.charAt(from)));
+            boolean attach
+                = markAfter0f7f
+                || (to > from && nfthdl.charAt(from) == '\u0f7f'
+                    && !clusters.isEmpty());
+            if (attach) {
+                int last = clusters.size() - 1;
+                GC merged
+                    = new GC(((GC)clusters.get(last)).getNfthdl() + piece);
+                if (debug) {
+                    System.out.println("debug: setting last el, "
+                                       + clusters.get(last) + " to " + merged);
+                }
+                clusters.set(last, merged);
+            } else {
+                clusters.add(new GC(piece));
+            }
+            prevEndedIn0f7f
+                = (to > from && nfthdl.charAt(to - 1) == '\u0f7f');
+            from = to;
+            to = breaker.next();
+        }
+        return clusters;
+    }
+
+    /** Converts Tibetan Unicode to computer-friendly EWTS
+     *  transliteration.  Computer-friendly is not human-friendly but
+     *  hopefully even poorly written EWTS->Tibetan converters could
+     *  handle the output.  A grapheme cluster that has no EWTS is
+     *  replaced inline by an "[#ERROR 301: ...]" message.
+     *  @param unicode Tibetan Unicode text to convert
+     *  @param errors if non-null, each error message is also
+     *  appended here, followed by a newline
+     *  @return the EWTS text, including any inline error messages */
+    public static String convertToEwtsForComputers(String unicode,
+                                                   StringBuffer errors) {
+
+        // First, normalize as much as we can to reduce the number of
+        // cases we must handle.
+        String decomposed
+            = UnicodeUtils.toMostlyDecomposedUnicode(unicode,
+                                                     UnicodeUtils.NORM_NFTHDL);
+
+        // TODO(dchandler): optionally warn if we see
+        // "\u0f40\u0f74\u0f71" which is in the wrong order.
+
+        List gcs = SplitGC(decomposed);
+
+        StringBuffer sb = new StringBuffer();
+        for (Iterator it = gcs.iterator(); it.hasNext(); ) {
+            GC gc = (GC)it.next();
+            StringBuffer ewts = gc.getEwtsForComputers();
+            if (null == ewts) {
+                // TODO(dchandler): use ErrorsAndWarnings?
+                // The message used to carry a stray, never-closed
+                // "(" before "has"; it is dropped here.
+                ewts = new StringBuffer("[#ERROR 301: The Unicode '"
+                                        + gc.getNfthdl()
+                                        + "' has no EWTS transliteration]");
+                if (null != errors) {
+                    errors.append(ewts);
+                    errors.append('\n');
+                }
+            }
+            // The error text, when there is one, stays inline in the
+            // output at the position of the offending cluster.
+            sb.append(ewts);
+        }
+        return sb.toString();
    }
 }
+
+// TODO(dchandler): give a mode where an error is given if non-Tibetan
+// or at least non-EWTS (think U+534D, e.g.) is found
--- a/source/org/thdl/tib/text/reverter/ConverterTest.java
+++ b/source/org/thdl/tib/text/reverter/ConverterTest.java
@ -20,8 +20,9 @@ package org.thdl.tib.text.reverter;

 import junit.framework.TestCase;

-import org.thdl.util.ThdlOptions;
+import org.thdl.tib.text.tshegbar.UnicodeUtils;
 import org.thdl.tib.text.ttt.ErrorsAndWarnings;
+import org.thdl.util.ThdlOptions;

 /** Tests the Converter class.
 *
@ -47,9 +48,112 @@ public class ConverterTest extends TestCase {
        ThdlOptions.setUserPreference("thdl.debug", true);
    }

+    /** Checks that converting s from Unicode to EWTS reports at
+     *  least one error.  When it does not, the unexpected EWTS and
+     *  the input are printed to stdout before the assertion fails. */
+    private void err(String s) {
+        StringBuffer errors = new StringBuffer();
+        String converted = Converter.convertToEwtsForComputers(s, errors);
+        if (errors.length() == 0) {
+            System.out.println("expected error but got EWTS '" + converted
+                               + "' for "
+                               + UnicodeUtils.unicodeStringToPrettyString(s));
+        }
+        assertTrue(errors.length() > 0);
+    }
+
+    /** Tests Converter.convertToEwtsForHumans.  Not implemented yet:
+     *  both arguments are ignored and only a TODO notice is printed,
+     *  so a call to this method never fails.
+     *  @param uni Unicode input (currently unused)
+     *  @param ewts presumably the expected human-friendly EWTS
+     *  (currently unused) */
+    private void hconv(String uni, String ewts) {
+        System.out.println("TODO(dchandler): DLC: implement me");
+    }
+
+    /** Runs uni through Converter.convertToEwtsForComputers and
+     *  asserts both that the result equals ewts and that no error
+     *  text was collected. */
+    private void conv(String uni, String ewts) {
+        StringBuffer errors = new StringBuffer();
+        String converted = Converter.convertToEwtsForComputers(uni, errors);
+        String failureMessage
+            = "Expected " + ewts + " but got " + converted + ":\n";
+        assertEquals(failureMessage, ewts, converted);
+        assertTrue(errors.length() == 0);
+    }
+
    public ConverterTest() { }  // empty no-argument constructor

    public void testUnicodeToEwts() {
-        assertEquals(Converter.convertToEwts("\u0f40", null), "ka");
+        conv("\u0f56\u0f62\u0f9f\u0f42\u0f66\u0f0b", "bar+tagasa ");  // conv() pins the exact EWTS and requires that no error was reported
+        conv("\u0f40", "ka");
+        // TODO(dchandler): DLC Tibetans use Arabic numerals and English punctuation.
+        // conv("0123456789.\u0f40", "0123456789.ka");
+        conv("\u0f40\u0f7b", "kai");
+        conv("\u0f40\u0f76", "k+r-i");
+        conv("\u0f40\u0020\u0f40", "ka_ka");
+        conv("\u0f40\n\u0f40\t\u0f40\r\n", "ka\nka\tka\r\n");
+        conv("\u0f04\u0f05\u0f40\u0f0c\u00a0\u0f42", "@#ka*_ga");
+        conv("\u0f42\u0f61", "gaya");
+        hconv("\u0f42\u0f61", "g.ya");  // hconv() is still a stub, so this line checks nothing yet
+        conv("\u0f42\u0fb1", "g+ya");
+        hconv("\u0f42\u0fb1", "gya");
+        conv("\u0f54\u0f7e", "paM");
+        conv("\u0f54\u0f71\u0f7e", "pAM");
+        conv("\u0f54\u0f7e", "paM");  // repeats the paM case two lines up
+        conv("\u0f54\u0f74\u0f7e", "puM");
+        conv("\u0f54\u0fc6", "p\\u0FC6");
+        conv("\u0f40\u0f72\u0f74", "ku+i");  // bottom-to-top
+        conv("\u0f40\u0f72\u0f74\u0f39", "k^u+i");  // 0f39 first
+        conv("\u0f40\u0f73", "kI");
+        conv("\u0f40\u0f71\u0f72", "kI");
+        conv("\u0f40\u0f72\u0f71", "kI");
+        conv("\u0f40\u0f73\u0f74", "kU+i");
+        err("\u0f48");  // err() only requires that some error text was reported
+        err("\u0f32\u0f39");
+        err("\u0f47\u0f98");
+        conv("\u0fcc", "\\u0FCC");
+        err("\u0fcd");
+        err("\u0f90");
+        err("\u0f90\u0fc6");
+        conv("\u0f0b\u0fc6", " \\u0FC6");  // ugly but legal...
+        err("\u0f0b\u0f90");
+        err("\u0f0b\u0f74");
+        err("\u0f0b\u0f7f");
+        err("\u0f0b\u0f3e");
+        conv("\u0f32\u0f18", "\\u0F32\\u0F18");
+        conv("\u0f54\u0fa4\u0f90", "p+p+ka");
+        // TODO(dchandler): warn("\u0f54\u0fa4\u0f90\u0f39"); (or do
+        // CCCVs work for this?)
+        if (false) {  // disabled branch: the behavior that is wanted eventually
+            // 0f39 could go with any of the three, so we give an error:
+            err("\u0f54\u0fa4\u0f90\u0f74\u0f39");
+        } else {
+            // TODO(dchandler): I want an error, not this:
+            conv("\u0f54\u0fa4\u0f90\u0f74\u0f39", "p+p+k^u");
+        }
+        conv("\u0f54\u0fa4\u0f90\u0f39", "p+p+k^a");
+        conv("\u0f55\u0f39", "fa");
+        conv("\u0f55\u0f74\u0f39", "fu");
+        conv("\u0f56\u0f39", "va");
+        conv("\u0f56\u0f74\u0f39", "vu");
+        conv("\u0f54\u0f39\u0fa4\u0f90", "p^+p+ka");
+        conv("\u0f40\u0f7e", "kaM");
+        conv("\u0f40\u0f83", "ka~M");
+        conv("\u0f40\u0f82", "ka~M`");
+        conv("\u0f40\u0f84", "ka?");
+        conv("\u0f40\u0f85\u0f40", "ka&ka");
+        err("\u0f7f");
+        conv("\u0f40\u0f7f", "kaH");
+        conv("\u0f40\u0f7f\u0f72", "kiH");
+        conv("\u0f40\u0f7f\u0f7f\u0f72\u0f7f", "kiHHH");
+        conv("\u0f40\u0f7f\u0f7e", "kaHM");
+        conv("\u0f40\u0f7e\u0f7f", "kaMH");
+        conv("\u0f40\u0f7f\u0f7e\u0f72", "kiHM");
+        conv("\u0f04\u0f05", "@#");
+        conv("\u0f04\u0f05\u0f05", "@##");
+        conv("\u0f04", "@");  // TODO(dchandler): Is this ever seen
+                              // alone?  warn/error otherwise.
+        conv("\u0f05", "#");  // TODO(dchandler): warn or error
    }
 }
+// TODO(dchandler): DLC: test all these round-trip, i.e. assert that
+// Uni->EWTS->Uni produces the same Uni.
+
+// TODO(dchandler): test with ZWSP or joiners or whatever weird crap
+// you can throw in legally to alter boundaries
--- a/source/org/thdl/tib/text/reverter/GC.java
+++ b/source/org/thdl/tib/text/reverter/GC.java
@ -0,0 +1,200 @@
+/*
+The contents of this file are subject to the THDL Open Community License
+Version 1.0 (the "License"); you may not use this file except in compliance
+with the License. You may obtain a copy of the License on the THDL web site 
+(http://www.thdl.org/).
+
+Software distributed under the License is distributed on an "AS IS" basis, 
+WITHOUT WARRANTY OF ANY KIND, either express or implied. See the 
+License for the specific terms governing rights and limitations under the 
+License. 
+
+The Initial Developer of this software is the Tibetan and Himalayan Digital
+Library (THDL). Portions created by the THDL are Copyright 2005 THDL.
+All Rights Reserved. 
+
+Contributor(s): ______________________________________.
+*/
+
+package org.thdl.tib.text.reverter;
+
+import java.util.regex.Pattern;
+import java.util.regex.Matcher;
+
+import org.thdl.util.ThdlDebug;
+import org.thdl.tib.text.THDLWylieConstants;
+import org.thdl.tib.text.tshegbar.UnicodeUtils;
+import org.thdl.tib.text.tshegbar.UnicodeCodepointToThdlWylie;
+
+/** Grapheme cluster backed by a String of Unicode.  For the most part
+ *  these are <em>combining character sequences</em> as defined by
+ *  Unicode, but (U+0F04 U+0F05+) [TODO(dchandler): not yet handled as
+ *  a single GC] is an example of a grapheme cluster that is not a
+ *  combining character sequence.
+ *  @author David Chandler
+ */
+class GC {
+    /** NFTHDL-decomposed Unicode */
+    private String nfthdl;
+
+    /** True if valid.  True for digits w/ digit combiners, character
+     *  stack plus optional wowels, a standalone mark.  False for
+     *  anything else, e.g. "\u0f0b\u0f90". */
+    private boolean valid;
+
+    /** Constructor that takes the NFTHDL-decomposed Unicode for the
+     *  grapheme cluster.
+     *  @param nfthdl non-empty NFTHDL-decomposed Unicode */
+    public GC(String nfthdl) {
+        setNfthdl(nfthdl);
+    }
+
+    /** A regex that matches the NFTHDL Unicode for a consonant stack
+     *  with optional wowels. */
+    public static String consonantStackRegexString
+    = "[\u0f40-\u0f47\u0f49-\u0f6a]"  // base consonant
+    +  "[\u0f90-\u0f97\u0f99-\u0fbc\u0f39]*"  // subjoined cons.
+    +  "\u0f71?"  // a-chung
+    +  "[\u0f72\u0f73\u0f74\u0f7a-\u0f7d\u0f80]*"  // vowel proper
+    +  "[\u0f35\u0f37\u0f7e\u0f7f\u0f82-\u0f84"  // wowels
+    +   "\u0f86\u0f87\u0fc6]*";
+
+    /** Decides validity of a whole grapheme cluster.  It is only
+     *  used with Matcher.matches(), which requires the entire input
+     *  to match, so each alternative is effectively anchored at both
+     *  ends even though "^" and "$" textually bind only to the first
+     *  and last alternative. */
+    private static Pattern validGcRegex = Pattern.compile(
+            "^"
+            // numeric:
+            + "([\u0f20-\u0f33][\u0f18\u0f19]*)|"
+
+            // consonant w/ optional wowels:
+            + "(" + consonantStackRegexString + ")|"
+
+            // other symbol with optional U+0FC6
+            + "([\u0f00-\u0f17\u0f1a-\u0f1f\u0f34\u0f36\u0f38"
+            +   "\u0f3a-\u0f3d\u0f85\u0f88-\u0f8b\u0fbe-\u0fc5"
+            +   "\u0fc7-\u0fcc\u0fcf-\u0fd1]\u0fc6?)|"
+
+            // other symbol that does not take U+0FC6.
+            // TODO(dchandler): include 0f0b etc. in this group?
+            + "([ \t\u00a0\n\r]{1,})"  // DLC handling of English... [0-9\\.:a-zA-Z] etc.  what to do?
+
+            + "$");
+
+    private static final boolean debug = false;
+
+    /** Stores the NFTHDL-decomposed Unicode representing this
+     *  grapheme cluster and records whether validGcRegex matches all
+     *  of it.
+     *  @param nfthdl non-empty NFTHDL-decomposed Unicode */
+    private void setNfthdl(String nfthdl) {
+        if (debug) {
+            System.out.println("debug: GC is "
+                               + UnicodeUtils.unicodeStringToPrettyString(nfthdl));
+        }
+        this.nfthdl = nfthdl;
+        assert (nfthdl.length() > 0);
+        // No special case is needed for the empty string when
+        // assertions are disabled: every alternative of validGcRegex
+        // needs at least one character, so "" is simply invalid.
+        valid = validGcRegex.matcher(nfthdl).matches();
+    }
+
+    /** Returns NFTHDL-decomposed Unicode representing this grapheme
+     *  cluster. */
+    public String getNfthdl() { return nfthdl; }
+
+    /** Returns true iff ch is a vowel proper, not a wowel */
+    private boolean isVowel(char ch) {
+        // (We won't see \u0f76 etc. in NFTHDL, but the handling of
+        // them is suspect.)
+        //
+        // The last range is 0f80-0f81, in agreement with
+        // consonantStackRegexString, which lists 0f80 among the
+        // vowels proper and 0f82 among the wowels.  Treating 0f82 as
+        // a vowel would put a stacking key in front of it when it
+        // follows a real vowel, and leaving out 0f80 would add a
+        // spurious implicit vowel in front of a wowel that follows
+        // it.
+        return ((ch >= '\u0f71' && ch <= '\u0f75')
+                || (ch >= '\u0f7a' && ch <= '\u0f7d')
+                || (ch >= '\u0f80' && ch <= '\u0f81'));
+    }
+
+    /** Returns true iff ch is a wowel that needs a vowel written
+     *  before it in EWTS, so that an implicit vowel has to be
+     *  emitted when the stack carries none. */
+    private boolean isWowelRequiringPrecedingVowel(char ch) {
+        // not 0f39 0f18 0f19 e.g.
+        return ("\u0f35\u0f37\u0f7e\u0f7f\u0f82\u0f83\u0f84\u0f86\u0f87".indexOf(ch) >= 0);
+
+        // NOTE: 0f7f is questionable 0fc6 too... we assume [k\\u0fc6]
+        // is good EWTS.
+    }
+
+    /** Returns EWTS that is valid but not beautiful.  It's better
+     *  suited for consumption by computer programs than by humans,
+     *  though it'll do in a pinch.  (Humans like to see [rnams] instead
+     *  of [r+namasa].)
+     *  @return null if this grapheme cluster has no valid EWTS
+     *  representation or valid-but-ugly EWTS otherwise */
+    public StringBuffer getEwtsForComputers() {
+        if (!valid) {
+            return null;
+        }
+        StringBuffer sb = new StringBuffer();
+        // We use ch after the loop.  Initialization is not really
+        // needed; it's just to avoid compiler errors.
+        char ch = 'X';
+        boolean seenVowel = false;
+        boolean added_aVOWEL = false;
+        for (int i = 0; i < nfthdl.length(); i++) {
+            ch = nfthdl.charAt(i);
+            String ewts
+                = UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(ch);
+            if (i + 1 < nfthdl.length()) {  // lookahead
+                // Each branch below consumes the next character too
+                // (++i) and replaces the pair by a single EWTS token.
+                // ch keeps the first character of the pair.
+                if (('\u0f55' == ch || '\u0fa5' == ch)
+                    && '\u0f39' == nfthdl.charAt(i + 1)) {
+                    ++i;
+                    ewts = "f";  // TODO(dchandler): hard-coded EWTS
+                } else if (('\u0f56' == ch || '\u0fa6' == ch)
+                           && '\u0f39' == nfthdl.charAt(i + 1)) {
+                    ++i;
+                    ewts = "v";  // TODO(dchandler): hard-coded EWTS
+                } else if ('\u0f71' == ch && '\u0f72' == nfthdl.charAt(i + 1)) {
+                    // Even computers want to see kI because the spec
+                    // isn't (or at least hasn't always been) crystal
+                    // clear that kA+i is equivalent to kI.
+                    ++i;
+                    ewts = THDLWylieConstants.I_VOWEL;
+                    // NOTE: we could normalize to 0f73 and 0f75 when
+                    // possible in NFTHDL.  That's closer to EWTS and
+                    // would avoid these two special cases.
+                } else if ('\u0f71' == ch && '\u0f74' == nfthdl.charAt(i + 1)) {
+                    ++i;
+                    ewts = THDLWylieConstants.U_VOWEL;
+                }
+            }
+            if (null == ewts && UnicodeUtils.isInTibetanRange(ch)) {
+                return null;
+            }
+            // NOTE(review): a null ewts for a character outside the
+            // Tibetan range falls through and is appended below as
+            // the text "null" -- confirm the lookup never returns
+            // null for the whitespace that validGcRegex admits.
+            if (UnicodeUtils.isSubjoinedConsonant(ch)
+                || (seenVowel && isVowel(ch)))
+                sb.append(THDLWylieConstants.WYLIE_SANSKRIT_STACKING_KEY);
+            if (isWowelRequiringPrecedingVowel(ch) && !seenVowel) {
+                if (!added_aVOWEL) {
+                    added_aVOWEL = true;
+                    sb.append(THDLWylieConstants.WYLIE_aVOWEL);  // paM, no pM
+                }
+            }
+            if (isVowel(ch)) {
+                seenVowel = true;
+            }
+            sb.append(ewts);
+        }
+        // A cluster whose last examined character is a consonant (or
+        // 0f39) carries no written vowel, so the implicit one is
+        // spelled out.
+        if (UnicodeUtils.isNonSubjoinedConsonant(ch)
+            || UnicodeUtils.isSubjoinedConsonant(ch)
+            || '\u0f39' == ch) {
+            ThdlDebug.verify(!added_aVOWEL);
+            sb.append(THDLWylieConstants.WYLIE_aVOWEL);
+        }
+        return sb;
+    }
+
+    /** Hash of the backing NFTHDL string, consistent with equals. */
+    public int hashCode() { return nfthdl.hashCode(); }
+
+    /** Two GCs are equal iff their NFTHDL strings are equal. */
+    public boolean equals(Object o) {
+        return (o instanceof GC && ((GC)o).getNfthdl().equals(getNfthdl()));
+    }
+
+    /** Quasi-XML for humans */
+    public String toString() {
+        return "<GC valid=" + valid + " pretty=\""
+            + UnicodeUtils.unicodeStringToPrettyString(getNfthdl())
+            + "\"/>";
+    }
+}
--- a/source/org/thdl/tib/text/reverter/UnicodeToTranslitForXslt.java
+++ b/source/org/thdl/tib/text/reverter/UnicodeToTranslitForXslt.java
@ -32,11 +32,12 @@ public class UnicodeToTranslitForXslt {
    }

    /** Converts Tibetan Unicode to EWTS transliteration. */
-    public static String unicodeToEwts(String unicode) {
-        return Converter.convertToEwts(unicode, null);
+    public static String unicodeToEwtsForComputers(String unicode) {
+        return Converter.convertToEwtsForComputers(unicode, null);  // null: no separate errors buffer is passed
    }
+
    /** Converts Tibetan Unicode to ACIP transliteration. */
    public static String unicodeToAcip(String unicode) {
-        throw new Error("DLC: not yet");
+        throw new Error("TODO(dchandler): not yet");  // not implemented: every call throws
    }
 }
--- a/source/org/thdl/tib/text/reverter/UnicodeToTranslitForXsltTest.java
+++ b/source/org/thdl/tib/text/reverter/UnicodeToTranslitForXsltTest.java
@ -50,12 +50,15 @@ public class UnicodeToTranslitForXsltTest extends TestCase {
    public UnicodeToTranslitForXsltTest() { }  // empty no-argument constructor

    public void testUnicodeToEwts() {
-        assertEquals(UnicodeToTranslitForXslt.unicodeToEwts("\u0f40"), "ka");
-        assertEquals(UnicodeToTranslitForXslt.unicodeToEwts("\u0f56\u0f62\u0f4f\u0f42\u0f66\u0f0b"), "brtags ");
+        assertEquals("ka", UnicodeToTranslitForXslt.unicodeToEwtsForComputers("\u0f40"));  // expected value comes first, actual second
+        assertEquals("g+ya", UnicodeToTranslitForXslt.unicodeToEwtsForComputers("\u0f42\u0fb1"));
+        // TODO(dchandler): assertEquals("brtags ", UnicodeToTranslitForXslt.unicodeToEwtsForHumans("\u0f56\u0f62\u0f9f\u0f42\u0f66\u0f0b"));
    }

    public void testUnicodeToAcip() {
-        assertEquals(UnicodeToTranslitForXslt.unicodeToEwts("\u0f40"), "KA");
-        assertEquals(UnicodeToTranslitForXslt.unicodeToEwts("\u0f56\u0f62\u0f4f\u0f42\u0f66\u0f0b"), "BRTAGS ");
+        if (false) {  // never runs, so this test currently checks nothing
+            assertEquals("KA", UnicodeToTranslitForXslt.unicodeToAcip("\u0f40"));
+            assertEquals("BRTAGS ", UnicodeToTranslitForXslt.unicodeToAcip("\u0f56\u0f62\u0f9f\u0f42\u0f66\u0f0b"));
+        }
    }
 }