A reverter that converts Unicode to computer-friendly (but not yet
human-friendly) EWTS is here in alpha mode. It probably doesn't deal well with non-Tibetan text.
2005-08-01 05:54:20 +00:00 · 2005-08-01 05:54:20 +00:00 · 5788416629
commit 5788416629
parent 00afd75362
13 changed files with 496 additions and 47 deletions
--- a/build.xml
+++ b/build.xml
@ -165,8 +165,8 @@ the jvm starting tomcat:
  <!-- Set this to 1.2 if you want J2SDK 1.4's default.  1.1 gives us
       more compatibility, but maybe there will be a performance hit
       or something.  -->
-  <property name="target.jvm" value="1.2"/>
+  <property name="target.jvm" value="1.4"/>
-  <property name="source.jvm" value="1.2"/>
+  <property name="source.jvm" value="1.4"/>
  <!-- Only the tt-servlet-compile target changes this.  Humans
       shouldn't mess with this. -->
--- a/junitbuild.xml
+++ b/junitbuild.xml
@ -73,10 +73,8 @@
      <formatter type="xml"/><!-- If not XML, then 'ant -buildfile
                                  build.xml check-report' will fail. -->
      <sysproperty key="java.awt.headless" value="true"/>
 <!-- TODO(dchandler): DLC: enable these
      <test name="org.thdl.tib.text.reverter.ConverterTest"/>
      <test name="org.thdl.tib.text.reverter.UnicodeToTranslitForXsltTest"/>
 -->
      <test name="org.thdl.tib.text.ttt.EwtsToUnicodeForXsltTest"/>
      <test name="org.thdl.tib.text.ttt.EWTSTest"/>
      <test name="org.thdl.tib.text.ttt.EWTStibwniniTest"/>
--- a/source/org/thdl/tib/input/TibetanConverter.java
+++ b/source/org/thdl/tib/input/TibetanConverter.java
@ -350,7 +350,10 @@ public class TibetanConverter implements FontConverterConstants {
                    uniText = s.toString();
                }
                StringBuffer errors = new StringBuffer();
-                String ewtsText = Converter.convertToEwts(uniText, errors);
+                // TODO(dchandler): DLC: use human-friendly EWTS, not
                // computer-friendly!
                String ewtsText = Converter.convertToEwtsForComputers(uniText,
                                                                      errors);
                // TODO(dchandler): is 51 the right choice?
                return (errors.length() > 0) ? 51 : 0;
            } catch (IOException e) {
--- a/source/org/thdl/tib/text/reverter/Converter.java
+++ b/source/org/thdl/tib/text/reverter/Converter.java
@ -18,6 +18,16 @@ Contributor(s): ______________________________________.
 package org.thdl.tib.text.reverter;
 import java.text.BreakIterator;
 import java.util.ArrayList;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Locale;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 import org.thdl.tib.text.tshegbar.UnicodeUtils;
 /** Static methods for converting Unicode to EWTS and
 *  (TODO(dchandler): ACIP).
 *  @author David Chandler
@ -28,11 +38,110 @@ public class Converter {
        throw new Error("There's no point in instantiating this class.");
    }
-    /** Converts Tibetan Unicode to EWTS transliteration.  If errors
+    /** Finds combining character sequences. */
-     *  is non-null, error messages are appended to it.  (Errors are
+    private static BreakIterator breaker
-     *  always inline.) */
+    = BreakIterator.getCharacterInstance(new Locale("bo"));
-    public static String convertToEwts(String unicode,
+
-                                       StringBuffer errors /* DLC: use it */) {
+
-        throw new Error("DLC not yet");
+    private static final boolean debug = false;
    // TODO(dchandler): use this to create LegalTshegBar objects, it's
    // unused right now.
    //
    // Whole-string pattern over NFTHDL Unicode with three parts, in
    // order: an optional single leading letter, one consonant stack
    // (as defined by GC.consonantStackRegexString), and an optional
    // tail.  NOTE(review): the leading letter, tail letters and
    // trailing letter presumably correspond to the Tibetan prefix,
    // suffix and postsuffix -- confirm against LegalTshegBar.
    private static Pattern mightBeLegalTshegBarRegex = Pattern.compile(
            "^"
            // optional leading letter (one of five codepoints):
            + "([\u0f42\u0f51\u0f56\u0f58\u0f60])?"
            // root stack: consonant w/ optional wowels:
            + "(" + GC.consonantStackRegexString + ")"
            // optional tail, first form: one of ten letters, optionally
            // followed by U+0F51 or U+0F66:
            + "(([\u0f42\u0f51\u0f56\u0f58\u0f60\u0f44\u0f53\u0f62\u0f63\u0f66][\u0f51\u0f66]?)"
            // optional tail, second form: one or more repetitions of
            // U+0F60 followed by one of U+0F72, U+0F74, U+0F7C, U+0F44
            // or U+0F58:
            +  "|(\u0f60[\u0f72\u0f74\u0f7c\u0f44\u0f58])+)?"
            + "$");
    /** Breaks nfthdl up into grapheme clusters.  A grapheme cluster
     *  is something an end user would say cannot sensibly be
     *  decomposed into two separate pieces.  Mostly these are the
     *  <em>combining character sequences</em> defined by Unicode, as
     *  found by the shared BreakIterator, but (U+0F04 U+0F05*) is an
     *  example of a grapheme cluster that is not a combining
     *  character sequence (TODO(dchandler): (0f04 0f05*), is it
     *  really worth it?  We don't handle it right now, might be good
     *  for Unicode->ACIP anyway.)
     *
     *  <p>On top of the iterator's boundaries, a piece is glued onto
     *  the previous cluster when it begins with U+0F7F (and a
     *  previous cluster exists), or when it begins with a
     *  non-spacing mark and the previous piece ended in U+0F7F.
     *  @param nfthdl Unicode in NFTHDL decomposition form
     *  @return List of GC objects */
    private static List/*<GC>*/ SplitGC(String nfthdl) {
        if (debug) {
            System.out.println("debug: "
                               + UnicodeUtils.unicodeStringToPrettyString(nfthdl));
        }
        ArrayList clusters = new ArrayList();
        breaker.setText(nfthdl);
        int from = breaker.first();
        int to = breaker.next();
        boolean previousEndedWith0f7f = false;
        while (BreakIterator.DONE != to) {
            // U+0F7F is a COMBINING_SPACING_MARK, not a
            // NON_SPACING_MARK, but we want to treat it like a
            // NON_SPACING_MARK, so decide whether this piece really
            // belongs to the cluster before it.
            boolean mergeIntoPrevious;
            if (previousEndedWith0f7f
                && (Character.NON_SPACING_MARK
                    == Character.getType(nfthdl.charAt(from)))) {
                mergeIntoPrevious = true;
            } else {
                mergeIntoPrevious = (to > from
                                     && '\u0f7f' == nfthdl.charAt(from)
                                     && !clusters.isEmpty());
            }

            if (mergeIntoPrevious) {
                int last = clusters.size() - 1;
                GC merged = new GC(((GC)clusters.get(last)).getNfthdl()
                                   + nfthdl.substring(from, to));
                if (debug) {
                    System.out.println("debug: setting last el, "
                                       + clusters.get(last) + " to " + merged);
                }
                clusters.set(last, merged);
            } else {
                clusters.add(new GC(nfthdl.substring(from, to)));
            }

            previousEndedWith0f7f
                = (to > from && '\u0f7f' == nfthdl.charAt(to - 1));
            from = to;
            to = breaker.next();
        }
        return clusters;
    }
    /** Converts Tibetan Unicode to computer-friendly EWTS
     *  transliteration.  Computer-friendly is not human-friendly but
     *  hopefully even poorly written EWTS->Tibetan converters could
     *  handle the output.
     *
     *  <p>The input is first normalized to NFTHDL and split into
     *  grapheme clusters; each cluster is then transliterated on its
     *  own.  A cluster that has no transliteration is replaced, in
     *  the returned text, by an inline "[#ERROR 301: ...]" message
     *  (errors are always inline).
     *  @param unicode the Unicode text to convert
     *  @param errors if non-null, every inline error message is also
     *  appended to it, one per line; callers treat a non-empty buffer
     *  as "conversion had errors"
     *  @return the EWTS text, including any inline error messages */
    public static String convertToEwtsForComputers(String unicode,
                                                   StringBuffer errors) {
        // First, normalize as much as we can to reduce the number of
        // cases we must handle.
        String decomposed
            = UnicodeUtils.toMostlyDecomposedUnicode(unicode,
                                                     UnicodeUtils.NORM_NFTHDL);
        // TODO(dchandler): optionally warn if we see
        // "\u0f40\u0f74\u0f71" which is in the wrong order.
        List gcs = SplitGC(decomposed);
        StringBuffer sb = new StringBuffer();
        for (Iterator it = gcs.iterator(); it.hasNext(); ) {
            GC gc = (GC)it.next();
            StringBuffer ewts = gc.getEwtsForComputers();
            if (null == ewts) {
                // TODO(dchandler): use ErrorsAndWarnings?
                // (The message used to contain a stray, unbalanced
                // "(" before "has".)
                ewts = new StringBuffer("[#ERROR 301: The Unicode '"
                                        + gc.getNfthdl()
                                        + "' has no EWTS transliteration]");
                if (null != errors) {
                    errors.append(ewts);
                    errors.append('\n');
                }
            }
            sb.append(ewts);
        }
        return sb.toString();
    }
 }
 // TODO(dchandler): give a mode where an error is given if non-Tibetan
 // or at least non-EWTS (think U+534D, e.g.) is found
--- a/source/org/thdl/tib/text/reverter/ConverterTest.java
+++ b/source/org/thdl/tib/text/reverter/ConverterTest.java
@ -20,8 +20,9 @@ package org.thdl.tib.text.reverter;
 import junit.framework.TestCase;
-import org.thdl.util.ThdlOptions;
+import org.thdl.tib.text.tshegbar.UnicodeUtils;
 import org.thdl.tib.text.ttt.ErrorsAndWarnings;
 import org.thdl.util.ThdlOptions;
 /** Tests the Converter class.
 *
@ -47,9 +48,112 @@ public class ConverterTest extends TestCase {
        ThdlOptions.setUserPreference("thdl.debug", true);
    }
    /** Fails the test unless converting s from Unicode to
     *  computer-friendly EWTS reports at least one error.  When no
     *  error is reported, the unexpected EWTS is printed first to
     *  make the failure easier to diagnose. */
    private void err(String s) {
        StringBuffer errors = new StringBuffer();
        String ewts = Converter.convertToEwtsForComputers(s, errors);
        if (errors.length() <= 0) {
            System.out.println("expected error but got EWTS '" + ewts
                               + "' for "
                               + UnicodeUtils.unicodeStringToPrettyString(s));
        }
        assertTrue(errors.length() > 0);
    }
    /** Placeholder for testing Converter.convertToEwtsForHumans.
     *  Currently asserts nothing: both arguments are ignored and a
     *  reminder is printed instead.
     *  @param uni the Unicode input (unused for now)
     *  @param ewts the expected human-friendly EWTS (unused for now) */
    private void hconv(String uni, String ewts) {
        System.out.println("TODO(dchandler): DLC: implement me");
    }
    /** Asserts that Converter.convertToEwtsForComputers turns uni
     *  into exactly ewts and reports no errors while doing so.
     *  @param uni the Unicode input
     *  @param ewts the expected computer-friendly EWTS */
    private void conv(String uni, String ewts) {
        StringBuffer errors = new StringBuffer();
        String actualEwts = Converter.convertToEwtsForComputers(uni, errors);
        String message = "Expected " + ewts + " but got " + actualEwts + ":\n";
        assertEquals(message, ewts, actualEwts);
        assertTrue(!(errors.length() > 0));
    }
    public ConverterTest() { }
    public void testUnicodeToEwts() {
-        assertEquals(Converter.convertToEwts("\u0f40", null), "ka");
+        conv("\u0f56\u0f62\u0f9f\u0f42\u0f66\u0f0b", "bar+tagasa ");
        conv("\u0f40", "ka");
        // TODO(dchandler): DLC Tibetans use Arabic numerals and English punctuation.
        // conv("0123456789.\u0f40", "0123456789.ka");
        conv("\u0f40\u0f7b", "kai");
        conv("\u0f40\u0f76", "k+r-i");
        conv("\u0f40\u0020\u0f40", "ka_ka");
        conv("\u0f40\n\u0f40\t\u0f40\r\n", "ka\nka\tka\r\n");
        conv("\u0f04\u0f05\u0f40\u0f0c\u00a0\u0f42", "@#ka*_ga");
        conv("\u0f42\u0f61", "gaya");
        hconv("\u0f42\u0f61", "g.ya");
        conv("\u0f42\u0fb1", "g+ya");
        hconv("\u0f42\u0fb1", "gya");
        conv("\u0f54\u0f7e", "paM");
        conv("\u0f54\u0f71\u0f7e", "pAM");
        conv("\u0f54\u0f7e", "paM");
        conv("\u0f54\u0f74\u0f7e", "puM");
        conv("\u0f54\u0fc6", "p\\u0FC6");
        conv("\u0f40\u0f72\u0f74", "ku+i");  // bottom-to-top
        conv("\u0f40\u0f72\u0f74\u0f39", "k^u+i");  // 0f39 first
        conv("\u0f40\u0f73", "kI");
        conv("\u0f40\u0f71\u0f72", "kI");
        conv("\u0f40\u0f72\u0f71", "kI");
        conv("\u0f40\u0f73\u0f74", "kU+i");
        err("\u0f48");
        err("\u0f32\u0f39");
        err("\u0f47\u0f98");
        conv("\u0fcc", "\\u0FCC");
        err("\u0fcd");
        err("\u0f90");
        err("\u0f90\u0fc6");
        conv("\u0f0b\u0fc6", " \\u0FC6");  // ugly but legal...
        err("\u0f0b\u0f90");
        err("\u0f0b\u0f74");
        err("\u0f0b\u0f7f");
        err("\u0f0b\u0f3e");
        conv("\u0f32\u0f18", "\\u0F32\\u0F18");
        conv("\u0f54\u0fa4\u0f90", "p+p+ka");
        // TODO(dchandler): warn("\u0f54\u0fa4\u0f90\u0f39"); (or do
        // CCCVs work for this?)
        if (false) {
            // 0f39 could go with any of the three, so we give an error:
            err("\u0f54\u0fa4\u0f90\u0f74\u0f39");
        } else {
            // TODO(dchandler): I want an error, not this:
            conv("\u0f54\u0fa4\u0f90\u0f74\u0f39", "p+p+k^u");
        }
        conv("\u0f54\u0fa4\u0f90\u0f39", "p+p+k^a");
        conv("\u0f55\u0f39", "fa");
        conv("\u0f55\u0f74\u0f39", "fu");
        conv("\u0f56\u0f39", "va");
        conv("\u0f56\u0f74\u0f39", "vu");
        conv("\u0f54\u0f39\u0fa4\u0f90", "p^+p+ka");
        conv("\u0f40\u0f7e", "kaM");
        conv("\u0f40\u0f83", "ka~M");
        conv("\u0f40\u0f82", "ka~M`");
        conv("\u0f40\u0f84", "ka?");
        conv("\u0f40\u0f85\u0f40", "ka&ka");
        err("\u0f7f");
        conv("\u0f40\u0f7f", "kaH");
        conv("\u0f40\u0f7f\u0f72", "kiH");
        conv("\u0f40\u0f7f\u0f7f\u0f72\u0f7f", "kiHHH");
        conv("\u0f40\u0f7f\u0f7e", "kaHM");
        conv("\u0f40\u0f7e\u0f7f", "kaMH");
        conv("\u0f40\u0f7f\u0f7e\u0f72", "kiHM");
        conv("\u0f04\u0f05", "@#");
        conv("\u0f04\u0f05\u0f05", "@##");
        conv("\u0f04", "@");  // TODO(dchandler): Is this ever seen
                              // alone?  warn/error otherwise.
        conv("\u0f05", "#");  // TODO(dchandler): warn or error
    }
 }
 // TODO(dchandler): DLC: test all these round-trip, i.e. assert that
 // Uni->EWTS->Uni produces the same Uni.
 // TODO(dchandler): test with ZWSP or joiners or whatever weird crap
 // you can throw in legally to alter boundaries
--- a/source/org/thdl/tib/text/reverter/GC.java
+++ b/source/org/thdl/tib/text/reverter/GC.java
@ -0,0 +1,200 @@
 /*
 The contents of this file are subject to the THDL Open Community License
 Version 1.0 (the "License"); you may not use this file except in compliance
 with the License. You may obtain a copy of the License on the THDL web site 
 (http://www.thdl.org/).
 Software distributed under the License is distributed on an "AS IS" basis, 
 WITHOUT WARRANTY OF ANY KIND, either express or implied. See the 
 License for the specific terms governing rights and limitations under the 
 License. 
 The Initial Developer of this software is the Tibetan and Himalayan Digital
 Library (THDL). Portions created by the THDL are Copyright 2005 THDL.
 All Rights Reserved. 
 Contributor(s): ______________________________________.
 */
 package org.thdl.tib.text.reverter;
 import java.util.regex.Pattern;
 import java.util.regex.Matcher;
 import org.thdl.util.ThdlDebug;
 import org.thdl.tib.text.THDLWylieConstants;
 import org.thdl.tib.text.tshegbar.UnicodeUtils;
 import org.thdl.tib.text.tshegbar.UnicodeCodepointToThdlWylie;
 /** Grapheme cluster backed by a String of Unicode.  For the most part
 *  these are <em>combining character sequences</em> as defined by
 *  Unicode, but (U+0F04 U+0F05+) [TODO(dchandler): not yet handled as
 *  a single GC] is an example of a grapheme cluster that is not a
 *  combining character sequence.
 *
 *  <p>Validity is computed once, when the cluster is constructed, by
 *  matching the whole string against a fixed pattern; equality and
 *  hashing are based solely on the backing string.
 *  @author David Chandler
 */
 class GC {
    /** NFTHDL-decomposed Unicode */
    private String nfthdl;

    /** True if valid.  True for digits w/ digit combiners, character
     *  stack plus optional wowels, a standalone mark.  False for
     *  anything else, e.g. "\u0f0b\u0f90". */
    private boolean valid;

    /** Constructor that takes the NFTHDL-decomposed Unicode for the
     *  grapheme cluster. */
    public GC(String nfthdl) {
        setNfthdl(nfthdl);
    }

    /** A regex that matches the NFTHDL Unicode for a consonant stack
     *  with optional wowels.  It is unanchored so that it can be
     *  embedded in larger patterns. */
    public static final String consonantStackRegexString
    = "[\u0f40-\u0f47\u0f49-\u0f6a]"  // base consonant
    +  "[\u0f90-\u0f97\u0f99-\u0fbc\u0f39]*"  // subjoined cons.
    +  "\u0f71?"  // a-chung
    +  "[\u0f72\u0f73\u0f74\u0f7a-\u0f7d\u0f80]*"  // vowel proper
    +  "[\u0f35\u0f37\u0f7e\u0f7f\u0f82-\u0f84"  // wowels
    +   "\u0f86\u0f87\u0fc6]*";

    /** Matches a whole valid grapheme cluster.  The alternatives are
     *  wrapped in one non-capturing group so that the anchors apply
     *  to every alternative, not just the first and last; the
     *  capturing groups keep their numbers. */
    private static final Pattern validGcRegex = Pattern.compile(
            "^(?:"
            // numeric:
            + "([\u0f20-\u0f33][\u0f18\u0f19]*)|"
            // consonant w/ optional wowels:
            + "(" + consonantStackRegexString + ")|"
            // other symbol with optional U+0FC6
            + "([\u0f00-\u0f17\u0f1a-\u0f1f\u0f34\u0f36\u0f38"
            +   "\u0f3a-\u0f3d\u0f85\u0f88-\u0f8b\u0fbe-\u0fc5"
            +   "\u0fc7-\u0fcc\u0fcf-\u0fd1]\u0fc6?)|"
            // other symbol that does not take U+0FC6.
            // TODO(dchandler): include 0f0b etc. in this group?
            + "([ \t\u00a0\n\r]{1,})"  // DLC handling of English... [0-9\\.:a-zA-Z] etc.  what to do?
            + ")$");

    private static final boolean debug = false;

    /** Stores the NFTHDL-decomposed Unicode representing this
     *  grapheme cluster and recomputes whether it is valid.  An empty
     *  string is never valid. */
    private void setNfthdl(String nfthdl) {
        if (debug) {
            System.out.println("debug: GC is "
                               + UnicodeUtils.unicodeStringToPrettyString(nfthdl));
        }
        this.nfthdl = nfthdl;
        assert (nfthdl.length() > 0);
        if (nfthdl.length() < 1) {
            // Only reachable with assertions disabled.
            valid = false;
        } else {
            valid = validGcRegex.matcher(nfthdl).matches();
        }
    }

    /** Returns NFTHDL-decomposed Unicode representing this grapheme
     *  cluster. */
    public String getNfthdl() { return nfthdl; }

    /** Returns true iff ch is a vowel proper, not a wowel.  U+0F80 is
     *  included because {@link #consonantStackRegexString} lists it
     *  among the vowels proper; U+0F82 is excluded because it is
     *  handled as a wowel (see {@link
     *  #isWowelRequiringPrecedingVowel(char)}), and counting it as a
     *  vowel put a stacking key in front of it whenever a real vowel
     *  came first. */
    private boolean isVowel(char ch) {
        // (We won't see \u0f76 etc. in NFTHDL, but the handling of
        // them is suspect.)
        return ((ch >= '\u0f71' && ch <= '\u0f75')
                || (ch >= '\u0f7a' && ch <= '\u0f7d')
                || (ch >= '\u0f80' && ch <= '\u0f81'));
    }

    /** Returns true iff ch is a wowel whose EWTS must be preceded by
     *  a vowel, so that an explicit a-vowel has to be emitted when
     *  the cluster has no vowel of its own (paM, not pM). */
    private boolean isWowelRequiringPrecedingVowel(char ch) {
        // not 0f39 0f18 0f19 e.g.
        return ("\u0f35\u0f37\u0f7e\u0f7f\u0f82\u0f83\u0f84\u0f86\u0f87".indexOf(ch) >= 0);
        // NOTE: 0f7f is questionable 0fc6 too... we assume [k\\u0fc6]
        // is good EWTS.
    }

    /** Returns EWTS that is valid but not beautiful.  It's better
     *  suited for consumption by computer programs than by humans,
     *  though it'll do in a pinch.  (Humans like to see [rnams] instead
     *  of [r+namasa].)
     *  @return null if this grapheme cluster has no valid EWTS
     *  representation or valid-but-ugly EWTS otherwise */
    public StringBuffer getEwtsForComputers() {
        if (!valid) {
            return null;
        }
        StringBuffer sb = new StringBuffer();
        // We use ch after the loop.  Initialization is not really
        // needed; it's just to avoid compiler errors.
        char ch = 'X';
        boolean seenVowel = false;
        boolean added_aVOWEL = false;
        for (int i = 0; i < nfthdl.length(); i++) {
            ch = nfthdl.charAt(i);
            String ewts
                = UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(ch);
            if (i + 1 < nfthdl.length()) {  // lookahead
                // Some two-codepoint sequences get a single EWTS
                // token; when one matches, the second codepoint is
                // consumed here and ch stays the first codepoint.
                char next = nfthdl.charAt(i + 1);
                if (('\u0f55' == ch || '\u0fa5' == ch)
                    && '\u0f39' == next) {
                    ++i;
                    ewts = "f";  // TODO(dchandler): hard-coded EWTS
                } else if (('\u0f56' == ch || '\u0fa6' == ch)
                           && '\u0f39' == next) {
                    ++i;
                    ewts = "v";  // TODO(dchandler): hard-coded EWTS
                } else if ('\u0f71' == ch && '\u0f72' == next) {
                    // Even computers want to see kI because the spec
                    // isn't (or at least hasn't always been) crystal
                    // clear that kA+i is equivalent to kI.
                    ++i;
                    ewts = THDLWylieConstants.I_VOWEL;
                    // NOTE: we could normalize to 0f73 and 0f75 when
                    // possible in NFTHDL.  That's closer to EWTS and
                    // would avoid these two special cases.
                } else if ('\u0f71' == ch && '\u0f74' == next) {
                    ++i;
                    ewts = THDLWylieConstants.U_VOWEL;
                }
            }
            if (null == ewts && UnicodeUtils.isInTibetanRange(ch)) {
                return null;
            }
            // A stacking key goes before every subjoined consonant
            // and before every vowel after the first.
            if (UnicodeUtils.isSubjoinedConsonant(ch)
                || (seenVowel && isVowel(ch)))
                sb.append(THDLWylieConstants.WYLIE_SANSKRIT_STACKING_KEY);
            if (isWowelRequiringPrecedingVowel(ch) && !seenVowel) {
                if (!added_aVOWEL) {
                    added_aVOWEL = true;
                    sb.append(THDLWylieConstants.WYLIE_aVOWEL);  // paM, no pM
                }
            }
            if (isVowel(ch)) {
                seenVowel = true;
            }
            sb.append(ewts);
        }
        // A cluster that ends on a consonant (or on U+0F39) has no
        // vowel or wowel at all, so it takes the a-vowel.
        if (UnicodeUtils.isNonSubjoinedConsonant(ch)
            || UnicodeUtils.isSubjoinedConsonant(ch)
            || '\u0f39' == ch) {
            ThdlDebug.verify(!added_aVOWEL);
            sb.append(THDLWylieConstants.WYLIE_aVOWEL);
        }
        return sb;
    }

    /** Hashes the backing NFTHDL string, consistent with equals. */
    public int hashCode() { return nfthdl.hashCode(); }

    /** Two grapheme clusters are equal iff their NFTHDL strings are
     *  equal. */
    public boolean equals(Object o) {
        return (o instanceof GC && ((GC)o).getNfthdl().equals(getNfthdl()));
    }

    /** Quasi-XML for humans */
    public String toString() {
        return "<GC valid=" + valid + " pretty=\""
            + UnicodeUtils.unicodeStringToPrettyString(getNfthdl())
            + "\"/>";
    }
 }
--- a/source/org/thdl/tib/text/reverter/UnicodeToTranslitForXslt.java
+++ b/source/org/thdl/tib/text/reverter/UnicodeToTranslitForXslt.java
@ -32,11 +32,12 @@ public class UnicodeToTranslitForXslt {
    }
    /** Converts Tibetan Unicode to EWTS transliteration. */
-    public static String unicodeToEwts(String unicode) {
+    public static String unicodeToEwtsForComputers(String unicode) {
-        return Converter.convertToEwts(unicode, null);
+        return Converter.convertToEwtsForComputers(unicode, null);
    }
    /** Converts Tibetan Unicode to ACIP transliteration. */
    public static String unicodeToAcip(String unicode) {
-        throw new Error("DLC: not yet");
+        throw new Error("TODO(dchandler): not yet");
    }
 }
--- a/source/org/thdl/tib/text/reverter/UnicodeToTranslitForXsltTest.java
+++ b/source/org/thdl/tib/text/reverter/UnicodeToTranslitForXsltTest.java
@ -50,12 +50,15 @@ public class UnicodeToTranslitForXsltTest extends TestCase {
    public UnicodeToTranslitForXsltTest() { }
    public void testUnicodeToEwts() {
-        assertEquals(UnicodeToTranslitForXslt.unicodeToEwts("\u0f40"), "ka");
+        assertEquals("ka", UnicodeToTranslitForXslt.unicodeToEwtsForComputers("\u0f40"));
-        assertEquals(UnicodeToTranslitForXslt.unicodeToEwts("\u0f56\u0f62\u0f4f\u0f42\u0f66\u0f0b"), "brtags ");
+        assertEquals("g+ya", UnicodeToTranslitForXslt.unicodeToEwtsForComputers("\u0f42\u0fb1"));
        // TODO(dchandler): assertEquals("brtags ", UnicodeToTranslitForXslt.unicodeToEwtsForHumans("\u0f56\u0f62\u0f9f\u0f42\u0f66\u0f0b"));
    }
    public void testUnicodeToAcip() {
-        assertEquals(UnicodeToTranslitForXslt.unicodeToEwts("\u0f40"), "KA");
+        if (false) {
-        assertEquals(UnicodeToTranslitForXslt.unicodeToEwts("\u0f56\u0f62\u0f4f\u0f42\u0f66\u0f0b"), "BRTAGS ");
+            assertEquals("KA", UnicodeToTranslitForXslt.unicodeToAcip("\u0f40"));
            assertEquals("BRTAGS ", UnicodeToTranslitForXslt.unicodeToAcip("\u0f56\u0f62\u0f9f\u0f42\u0f66\u0f0b"));
        }
    }
 }
--- a/source/org/thdl/tib/text/tshegbar/LegalTshegBar.java
+++ b/source/org/thdl/tib/text/tshegbar/LegalTshegBar.java
@ -463,6 +463,7 @@ public final class LegalTshegBar
     *  concatenation like 'u'i'o.  Returns false otherwise (including
     *  the case that suffix is the empty string). */
    public static boolean isAchungBasedSuffix(String suffix) {
        // TODO(dchandler): use java.util.regex
        int i = 0; // so that the empty string causes false to be returned.
        while (i == 0 || !suffix.equals("")) {
            boolean startsWithOneOfThem = false;
--- a/source/org/thdl/tib/text/tshegbar/UnicodeCodepointToThdlWylie.java
+++ b/source/org/thdl/tib/text/tshegbar/UnicodeCodepointToThdlWylie.java
@ -67,11 +67,16 @@ public class UnicodeCodepointToThdlWylie {
        // fail.
        switch (x) {
        case '\t': return "\t";
        case '\n': return "\n";
        case '\r': return "\r";
        case ' ': return "_";
        case '\u00a0': return "_";
        case '\u0F00': return "oM";
        case '\u0F01': return "\\u0F01";
-        case '\u0F02': return null; // DLC
+        case '\u0F02': return "\\u0F02";
-        case '\u0F03': return null; // DLC
+        case '\u0F03': return "\\u0F03";
        case '\u0F04': return "@";
        case '\u0F05': return "#";
        case '\u0F06': return "$";
@ -314,8 +319,6 @@ public class UnicodeCodepointToThdlWylie {
        case '\u0FCF': return "\\u0FCF"; // DLC i added this to the 'EWTS document misspeaks' bug report... null I think...
        default: {
            // DLC handle space (EW's "_")
            // This codepoint is in the range 0FD0-0FFF or is not in
            // the Tibetan range at all.  In either case, there is no
            // corresponding THDL Extended Wylie.
--- a/source/org/thdl/tib/text/tshegbar/UnicodeUtils.java
+++ b/source/org/thdl/tib/text/tshegbar/UnicodeUtils.java
@ -102,7 +102,10 @@ public class UnicodeUtils implements UnicodeConstants {
        nor NFKD breaks down <code>U+0F00</code> into its constituent
        codepoints.  NFTHDL uses a maximum of codepoints, and it never
        uses codepoints whose use has been {@link #isDiscouraged(char)
-        discouraged}.
+        discouraged}.  NFTHDL also does not screw things up by using
        the standard-but-wrong CCCVs.  It sorts stretches of combining
        characters wisely as per
        {@link http://iris.lib.virginia.edu/tibet/xml/showEssay.php?xml=/tools/encodingTib.xml}.
        <p>The Tibetan passages of the returned string are in the
        chosen normalized form, but codepoints outside of the {@link
@ -136,6 +139,9 @@ public class UnicodeUtils implements UnicodeConstants {
                tibetanUnicode.insert(offset, s);
            }
        }
        if (normForm == NORM_NFTHDL) {
            fixSomeOrderingErrorsInTibetanUnicode(tibetanUnicode);
        }
    }
    /** Like {@link #toMostlyDecomposedUnicode(StringBuffer, byte)},
@ -418,7 +424,39 @@ public class UnicodeUtils implements UnicodeConstants {
     *  product.)
     */
    private static char unicode_pairs[][]
-        = { { '\u0f71', '\u0f74' },
+        = {
            /* TODO(dchandler): use regex
             * "[\u0f39\u0f71-\u0f84\u0f86\u0f87]{2,}" to find patches
             * that need sorting and then sort each of those.  This
             * cross product is ugly. */
            { '\u0f39', '\u0f71' },
            { '\u0f39', '\u0f72' },
            { '\u0f39', '\u0f74' },
            { '\u0f39', '\u0f7a' },
            { '\u0f39', '\u0f7b' },
            { '\u0f39', '\u0f7c' },
            { '\u0f39', '\u0f7d' },
            { '\u0f39', '\u0f7e' },
            { '\u0f39', '\u0f7f' },
            { '\u0f39', '\u0f80' },
            { '\u0f39', '\u0f82' },
            { '\u0f39', '\u0f83' },
            { '\u0f71', '\u0f7f' },
            { '\u0f72', '\u0f7f' },
            { '\u0f74', '\u0f7f' },
            { '\u0f7a', '\u0f7f' },
            { '\u0f7b', '\u0f7f' },
            { '\u0f7c', '\u0f7f' },
            { '\u0f7d', '\u0f7f' },
            // but not { '\u0f7e', '\u0f7f' },
            { '\u0f39', '\u0f7f' },
            { '\u0f80', '\u0f7f' },
            { '\u0f82', '\u0f7f' },
            { '\u0f83', '\u0f7f' },
            { '\u0f71', '\u0f74' },
            { '\u0f71', '\u0f72' },
            { '\u0f71', '\u0f7a' },
@ -489,7 +527,9 @@ public class UnicodeUtils implements UnicodeConstants {
     *  the same file modulo Unicode booboos would be better.  </p>
     *
     *  @param sb the buffer to be mutated
-     *  @return true if sb was mutated */
+     *  @return true if sb was mutated
     *  @see <a href="http://iris.lib.virginia.edu/tibet/xml/showEssay.php?xml=/tools/encodingTib.xml">Tibetan Encoding Model</a>
     */
    public static boolean fixSomeOrderingErrorsInTibetanUnicode(StringBuffer sb) {
        boolean mutated = false;
        int len = sb.length();
@ -512,25 +552,5 @@ public class UnicodeUtils implements UnicodeConstants {
        } while (mutated_this_time_through);
        return mutated;
    }
    /** Tells whether ch is a valid Tibetan codepoint in Unicode 4.0,
     *  i.e. lies in U+0F00..U+0FCF and is not one of the holes
     *  listed below.
     *  @param ch the character to classify
     *  @return true iff ch is such a codepoint */
    public boolean isTibetanUnicodeCodepoint(char ch) {
        // NOTE: could use an array of 256 booleans for speed but I'm lazy
        if (ch < '\u0f00' || ch > '\u0fcf') {
            return false;
        }
        // Multi-codepoint holes: U+0F6B..U+0F70 and U+0F8C..U+0F8F.
        if (ch > '\u0f6a' && ch < '\u0f71') {
            return false;
        }
        if (ch > '\u0f8b' && ch < '\u0f90') {
            return false;
        }
        // Single-codepoint holes.
        switch (ch) {
        case '\u0f48':
        case '\u0f98':
        case '\u0fbd':
        case '\u0fcd':
        case '\u0fce':
            return false;
        default:
            return true;
        }
    }
    /** Tells whether ch falls inside 0F00-0FFF without being a valid
     *  Tibetan codepoint in Unicode 4.0.
     *  @param ch the character to classify
     *  @return true iff ch is in the Tibetan range but invalid */
    public boolean isInvalidTibetanUnicode(char ch) {
        if (!isInTibetanRange(ch)) {
            return false;
        }
        return !isTibetanUnicodeCodepoint(ch);
    }
 }
--- a/source/org/thdl/tib/text/ttt/EWTSTest.java
+++ b/source/org/thdl/tib/text/ttt/EWTSTest.java
@ -798,6 +798,7 @@ public class EWTSTest extends TestCase {
            just_ewts2uni_test("\\uefff", "\uefff");
        }
        ewts2uni_test("kaHH", "\u0F40\u0f7f\u0f7f");
        // Below was semiautomatically generated from the EWTS spec's
        // 'ewts.xml' representation (early August 2004 edition):
--- a/source/org/thdl/tib/text/ttt/TPairListFactory.java
+++ b/source/org/thdl/tib/text/ttt/TPairListFactory.java
@ -405,6 +405,12 @@ class TPairListFactory {
                    "\u0f74", THDLWylieConstants.u_VOWEL,
                    // TODO(dchandler): equivalence classes I'm not
                    // sure.
                    // http://iris.lib.virginia.edu/tibet/xml/showEssay.php?xml=/tools/encodingTib.xml
                    // says to go above base and then upwards.  Think
                    // it over.
                    // equivalence class:
                    "\u0f72", THDLWylieConstants.i_VOWEL,
                    "\u0f7a", THDLWylieConstants.e_VOWEL,