*/ SplitGC(String nfthdl) {
+
+ if (debug) {
+ System.out.println("debug: "
+ + UnicodeUtils.unicodeStringToPrettyString(nfthdl));
+ }
+ ArrayList al = new ArrayList();
+ breaker.setText(nfthdl);
+ int start = breaker.first();
+ boolean just_saw_0f7f = false;
+ for (int end = breaker.next();
+ end != BreakIterator.DONE;
+ start = end, end = breaker.next()) {
+ if ((just_saw_0f7f
+ && (Character.getType(nfthdl.charAt(start))
+ == Character.NON_SPACING_MARK))
+ || (end > start && '\u0f7f' == nfthdl.charAt(start)
+ && !al.isEmpty())) {
+ // U+0F7F is a COMBINING_SPACING_MARK, not a
+ // NON_SPACING_MARK, but we want to treat it like a
+ // NON_SPACING_MARK.
+ GC gc = new GC(((GC)al.get(al.size() - 1)).getNfthdl()
+ + nfthdl.substring(start,end));
+ if (debug) {
+ System.out.println("debug: setting last el, "
+ + al.get(al.size() - 1) + " to " + gc);
+ }
+ al.set(al.size() - 1, gc);
+ } else {
+ al.add(new GC(nfthdl.substring(start,end)));
+ }
+ just_saw_0f7f
+ = (end > start && '\u0f7f' == nfthdl.charAt(end - 1));
+ }
+ return al;
+ }
+
+ /** Converts Tibetan Unicode to computer-friendly EWTS
+ * transliteration. Computer-friendly is not human-friendly but
+ * hopefully even poorly written EWTS->Tibetan converters could
+ * handle the output. The input is first normalized to NFTHDL and
+ * split into grapheme clusters; each cluster is then
+ * transliterated on its own.
+ * @param unicode the Tibetan Unicode to convert
+ * @param errors if non-null, each error message is appended to
+ * it, followed by a newline
+ * @return the EWTS, with an inline "[#ERROR 301: ...]" message in
+ * place of each grapheme cluster that has no EWTS
+ * transliteration (errors are always inline) */
+ public static String convertToEwtsForComputers(String unicode,
+ StringBuffer errors) {
+
+ // First, normalize as much as we can to reduce the number of
+ // cases we must handle.
+ String decomposed
+ = UnicodeUtils.toMostlyDecomposedUnicode(unicode,
+ UnicodeUtils.NORM_NFTHDL);
+
+ // TODO(dchandler): optionally warn if we see
+ // "\u0f40\u0f74\u0f71" which is in the wrong order.
+
+ List gcs = SplitGC(decomposed);
+
+ StringBuffer sb = new StringBuffer();
+ for (Iterator it = gcs.iterator(); it.hasNext(); ) {
+ GC gc = (GC)it.next();
+ StringBuffer ewts = gc.getEwtsForComputers();
+ if (null == ewts) {
+ // This cluster cannot be transliterated: report it inline
+ // and, if the caller gave us a buffer, there too.
+ // TODO(dchandler): use ErrorsAndWarnings?
+ ewts = new StringBuffer("[#ERROR 301: The Unicode '"
+ + gc.getNfthdl()
+ + "' has no EWTS transliteration]");
+ if (null != errors) {
+ errors.append(ewts);
+ errors.append('\n');
+ }
+ }
+ sb.append(ewts);
+ }
+ return sb.toString();
}
}
+
+// TODO(dchandler): give a mode where an error is given if non-Tibetan
+// or at least non-EWTS (think U+534D, e.g.) is found
diff --git a/source/org/thdl/tib/text/reverter/ConverterTest.java b/source/org/thdl/tib/text/reverter/ConverterTest.java
index 5c97876..1f96a2a 100644
--- a/source/org/thdl/tib/text/reverter/ConverterTest.java
+++ b/source/org/thdl/tib/text/reverter/ConverterTest.java
@@ -20,8 +20,9 @@ package org.thdl.tib.text.reverter;
import junit.framework.TestCase;
-import org.thdl.util.ThdlOptions;
+import org.thdl.tib.text.tshegbar.UnicodeUtils;
import org.thdl.tib.text.ttt.ErrorsAndWarnings;
+import org.thdl.util.ThdlOptions;
/** Tests the Converter class.
*
@@ -47,9 +48,112 @@ public class ConverterTest extends TestCase {
ThdlOptions.setUserPreference("thdl.debug", true);
}
+ /** Asserts that converting s from Unicode to EWTS yields an
+ * error, i.e. that something is appended to the errors buffer.
+ * On failure, the assertion message (rather than stdout) shows
+ * the EWTS that was produced and the offending Unicode.
+ * @param s the Unicode that must not convert cleanly */
+ private void err(String s) {
+ StringBuffer sb = new StringBuffer();
+ String ewts = Converter.convertToEwtsForComputers(s, sb);
+ boolean error = (sb.length() > 0);
+ assertTrue("expected error but got EWTS '" + ewts + "' for "
+ + UnicodeUtils.unicodeStringToPrettyString(s),
+ error);
+ }
+
+ /** Placeholder for testing Converter.convertToEwtsForHumans.
+ * Currently it checks nothing; it only prints a TODO reminder.
+ * @param uni the Unicode input (unused for now)
+ * @param ewts the expected EWTS (unused for now) */
+ private void hconv(String uni, String ewts) {
+ System.out.println("TODO(dchandler): DLC: implement me");
+ }
+
+ /** Asserts that converting uni from Unicode to EWTS for computers
+ * yields exactly ewts and reports no errors. */
+ private void conv(String uni, String ewts) {
+ StringBuffer sb = new StringBuffer();
+ String actualEwts = Converter.convertToEwtsForComputers(uni, sb);
+ assertEquals("Expected " + ewts + " but got " + actualEwts + ":\n",
+ ewts, actualEwts);
+ assertTrue("unexpected error(s): " + sb, sb.length() == 0);
+ }
+
public ConverterTest() { }
public void testUnicodeToEwts() {
- assertEquals(Converter.convertToEwts("\u0f40", null), "ka");
+ conv("\u0f56\u0f62\u0f9f\u0f42\u0f66\u0f0b", "bar+tagasa ");
+ conv("\u0f40", "ka");
+ // TODO(dchandler): DLC Tibetans use Arabic numerals and English punctuation.
+ // conv("0123456789.\u0f40", "0123456789.ka");
+ conv("\u0f40\u0f7b", "kai");
+ conv("\u0f40\u0f76", "k+r-i");
+ conv("\u0f40\u0020\u0f40", "ka_ka");
+ conv("\u0f40\n\u0f40\t\u0f40\r\n", "ka\nka\tka\r\n");
+ conv("\u0f04\u0f05\u0f40\u0f0c\u00a0\u0f42", "@#ka*_ga");
+ conv("\u0f42\u0f61", "gaya");
+ hconv("\u0f42\u0f61", "g.ya");
+ conv("\u0f42\u0fb1", "g+ya");
+ hconv("\u0f42\u0fb1", "gya");
+ conv("\u0f54\u0f7e", "paM");
+ conv("\u0f54\u0f71\u0f7e", "pAM");
+ conv("\u0f54\u0f7e", "paM");
+ conv("\u0f54\u0f74\u0f7e", "puM");
+ conv("\u0f54\u0fc6", "p\\u0FC6");
+ conv("\u0f40\u0f72\u0f74", "ku+i"); // bottom-to-top
+ conv("\u0f40\u0f72\u0f74\u0f39", "k^u+i"); // 0f39 first
+ conv("\u0f40\u0f73", "kI");
+ conv("\u0f40\u0f71\u0f72", "kI");
+ conv("\u0f40\u0f72\u0f71", "kI");
+ conv("\u0f40\u0f73\u0f74", "kU+i");
+ err("\u0f48");
+ err("\u0f32\u0f39");
+ err("\u0f47\u0f98");
+ conv("\u0fcc", "\\u0FCC");
+ err("\u0fcd");
+ err("\u0f90");
+ err("\u0f90\u0fc6");
+ conv("\u0f0b\u0fc6", " \\u0FC6"); // ugly but legal...
+ err("\u0f0b\u0f90");
+ err("\u0f0b\u0f74");
+ err("\u0f0b\u0f7f");
+ err("\u0f0b\u0f3e");
+ conv("\u0f32\u0f18", "\\u0F32\\u0F18");
+ conv("\u0f54\u0fa4\u0f90", "p+p+ka");
+ // TODO(dchandler): warn("\u0f54\u0fa4\u0f90\u0f39"); (or do
+ // CCCVs work for this?)
+ if (false) {
+ // 0f39 could go with any of the three, so we give an error:
+ err("\u0f54\u0fa4\u0f90\u0f74\u0f39");
+ } else {
+ // TODO(dchandler): I want an error, not this:
+ conv("\u0f54\u0fa4\u0f90\u0f74\u0f39", "p+p+k^u");
+ }
+ conv("\u0f54\u0fa4\u0f90\u0f39", "p+p+k^a");
+ conv("\u0f55\u0f39", "fa");
+ conv("\u0f55\u0f74\u0f39", "fu");
+ conv("\u0f56\u0f39", "va");
+ conv("\u0f56\u0f74\u0f39", "vu");
+ conv("\u0f54\u0f39\u0fa4\u0f90", "p^+p+ka");
+ conv("\u0f40\u0f7e", "kaM");
+ conv("\u0f40\u0f83", "ka~M");
+ conv("\u0f40\u0f82", "ka~M`");
+ conv("\u0f40\u0f84", "ka?");
+ conv("\u0f40\u0f85\u0f40", "ka&ka");
+ err("\u0f7f");
+ conv("\u0f40\u0f7f", "kaH");
+ conv("\u0f40\u0f7f\u0f72", "kiH");
+ conv("\u0f40\u0f7f\u0f7f\u0f72\u0f7f", "kiHHH");
+ conv("\u0f40\u0f7f\u0f7e", "kaHM");
+ conv("\u0f40\u0f7e\u0f7f", "kaMH");
+ conv("\u0f40\u0f7f\u0f7e\u0f72", "kiHM");
+ conv("\u0f04\u0f05", "@#");
+ conv("\u0f04\u0f05\u0f05", "@##");
+ conv("\u0f04", "@"); // TODO(dchandler): Is this ever seen
+ // alone? warn/error otherwise.
+ conv("\u0f05", "#"); // TODO(dchandler): warn or error
}
}
+// TODO(dchandler): DLC: test all these round-trip, i.e. assert that
+// Uni->EWTS->Uni produces the same Uni.
+
+// TODO(dchandler): test with ZWSP or joiners or whatever weird crap
+// you can throw in legally to alter boundaries
diff --git a/source/org/thdl/tib/text/reverter/GC.java b/source/org/thdl/tib/text/reverter/GC.java
new file mode 100644
index 0000000..ed4939a
--- /dev/null
+++ b/source/org/thdl/tib/text/reverter/GC.java
@@ -0,0 +1,200 @@
+/*
+The contents of this file are subject to the THDL Open Community License
+Version 1.0 (the "License"); you may not use this file except in compliance
+with the License. You may obtain a copy of the License on the THDL web site
+(http://www.thdl.org/).
+
+Software distributed under the License is distributed on an "AS IS" basis,
+WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
+License for the specific terms governing rights and limitations under the
+License.
+
+The Initial Developer of this software is the Tibetan and Himalayan Digital
+Library (THDL). Portions created by the THDL are Copyright 2005 THDL.
+All Rights Reserved.
+
+Contributor(s): ______________________________________.
+*/
+
+package org.thdl.tib.text.reverter;
+
+import java.util.regex.Pattern;
+import java.util.regex.Matcher;
+
+import org.thdl.util.ThdlDebug;
+import org.thdl.tib.text.THDLWylieConstants;
+import org.thdl.tib.text.tshegbar.UnicodeUtils;
+import org.thdl.tib.text.tshegbar.UnicodeCodepointToThdlWylie;
+
+/** Grapheme cluster backed by a String of Unicode. For the most part
+ * these are combining character sequences as defined by
+ * Unicode, but (U+0F04 U+0F05+) [TODO(dchandler): not yet handled as
+ * a single GC] is an example of a grapheme cluster that is not a
+ * combining character sequence.
+ * @author David Chandler
+ */
+class GC {
+ /** NFTHDL-decomposed Unicode */
+ private String nfthdl;
+
+ /** True if valid. True for digits w/ digit combiners, character
+ * stack plus optional wowels, a standalone mark. False for
+ * anything else, e.g. "\u0f0b\u0f90". */
+ private boolean valid;
+
+ /** Constructor that takes the NFTHDL-decomposed Unicode for the
+ * grapheme cluster. */
+ public GC(String nfthdl) {
+ setNfthdl(nfthdl);
+ }
+
+ /** A regex that matches the NFTHDL Unicode for a consonant stack
+ * with optional wowels. */
+ public static String consonantStackRegexString
+ = "[\u0f40-\u0f47\u0f49-\u0f6a]" // base consonant
+ + "[\u0f90-\u0f97\u0f99-\u0fbc\u0f39]*" // subjoined cons.
+ + "\u0f71?" // a-chung
+ + "[\u0f72\u0f73\u0f74\u0f7a-\u0f7d\u0f80]*" // vowel proper
+ + "[\u0f35\u0f37\u0f7e\u0f7f\u0f82-\u0f84" // wowels
+ + "\u0f86\u0f87\u0fc6]*";
+
+ private static final Pattern validGcRegex = Pattern.compile(
+ "^"
+ // numeric:
+ + "([\u0f20-\u0f33][\u0f18\u0f19]*)|"
+
+ // consonant w/ optional wowels:
+ + "(" + consonantStackRegexString + ")|"
+
+ // other symbol with optional U+0FC6
+ + "([\u0f00-\u0f17\u0f1a-\u0f1f\u0f34\u0f36\u0f38"
+ + "\u0f3a-\u0f3d\u0f85\u0f88-\u0f8b\u0fbe-\u0fc5"
+ + "\u0fc7-\u0fcc\u0fcf-\u0fd1]\u0fc6?)|"
+
+ // other symbol that does not take U+0FC6.
+ // TODO(dchandler): include 0f0b etc. in this group?
+ + "([ \t\u00a0\n\r]{1,})" // DLC handling of English... [0-9\\.:a-zA-Z] etc. what to do?
+
+ + "$");
+
+ private static final boolean debug = false;
+
+ /** Stores the NFTHDL-decomposed Unicode representing this grapheme
+ * cluster and records whether it is valid, i.e. whether the whole
+ * (non-empty) string matches validGcRegex. */
+ private void setNfthdl(String nfthdl) {
+ if (debug) {
+ System.out.println("debug: GC is "
+ + UnicodeUtils.unicodeStringToPrettyString(nfthdl));
+ }
+ this.nfthdl = nfthdl;
+ assert (nfthdl.length() > 0);
+ valid = (nfthdl.length() > 0
+ && validGcRegex.matcher(nfthdl).matches());
+ }
+
+ /** Returns NFTHDL-decomposed Unicode representing this grapheme
+ * cluster. */
+ public String getNfthdl() { return nfthdl; }
+
+ /** Returns true iff ch is a vowel proper, not a wowel */
+ private boolean isVowel(char ch) {
+ // (We won't see \u0f76 etc. in NFTHDL, but the handling of
+ // them is suspect.)
+ // The last range is 0f80-0f81: 0f80 is a vowel proper in
+ // consonantStackRegexString, whereas 0f82 is a wowel.
+ return ((ch >= '\u0f71' && ch <= '\u0f75')
+ || (ch >= '\u0f7a' && ch <= '\u0f7d')
+ || (ch >= '\u0f80' && ch <= '\u0f81'));
+ }
+
+ /** Returns true iff ch is a wowel before which an explicit vowel
+ * must be written in EWTS (not 0f39 0f18 0f19 e.g.). NOTE: 0f7f
+ * is questionable, 0fc6 too... we assume [k\\u0fc6] is good
+ * EWTS. */
+ private boolean isWowelRequiringPrecedingVowel(char ch) {
+ return ("\u0f35\u0f37\u0f7e\u0f7f\u0f82\u0f83\u0f84\u0f86\u0f87".indexOf(ch) >= 0);
+ }
+
+ /** Returns EWTS that is valid but not beautiful. It's better
+ * suited for consumption by computer programs than by humans,
+ * though it'll do in a pinch. (Humans like to see [rnams] instead
+ * of [r+namasa].)
+ * @return null if this grapheme cluster has no valid EWTS
+ * representation or valid-but-ugly EWTS otherwise */
+ public StringBuffer getEwtsForComputers() {
+ if (!valid) {
+ return null;
+ }
+ StringBuffer sb = new StringBuffer();
+ // We use ch after the loop. Initialization is not really
+ // needed; it's just to avoid compiler errors.
+ char ch = 'X';
+ boolean seenVowel = false;
+ boolean added_aVOWEL = false;
+ for (int i = 0; i < nfthdl.length(); i++) {
+ ch = nfthdl.charAt(i);
+ String ewts
+ = UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(ch);
+ if (i + 1 < nfthdl.length()) { // lookahead
+ if (('\u0f55' == ch || '\u0fa5' == ch)
+ && '\u0f39' == nfthdl.charAt(i + 1)) {
+ ++i;
+ ewts = "f"; // TODO(dchandler): hard-coded EWTS
+ } else if (('\u0f56' == ch || '\u0fa6' == ch)
+ && '\u0f39' == nfthdl.charAt(i + 1)) {
+ ++i;
+ ewts = "v"; // TODO(dchandler): hard-coded EWTS
+ } else if ('\u0f71' == ch && '\u0f72' == nfthdl.charAt(i + 1)) {
+ // Even computers want to see kI because the spec
+ // isn't (or at least hasn't always been) crystal
+ // clear that kA+i is equivalent to kI.
+ ++i;
+ ewts = THDLWylieConstants.I_VOWEL;
+ // NOTE: we could normalize to 0f73 and 0f75 when
+ // possible in NFTHDL. That's closer to EWTS and
+ // would avoid these two special cases.
+ } else if ('\u0f71' == ch && '\u0f74' == nfthdl.charAt(i + 1)) {
+ ++i;
+ ewts = THDLWylieConstants.U_VOWEL;
+ }
+ }
+ if (null == ewts && UnicodeUtils.isInTibetanRange(ch)) {
+ return null;
+ }
+ if (UnicodeUtils.isSubjoinedConsonant(ch)
+ || (seenVowel && isVowel(ch))) {
+ sb.append(THDLWylieConstants.WYLIE_SANSKRIT_STACKING_KEY);
+ }
+ if (isWowelRequiringPrecedingVowel(ch) && !seenVowel
+ && !added_aVOWEL) {
+ added_aVOWEL = true;
+ sb.append(THDLWylieConstants.WYLIE_aVOWEL); // paM, no pM
+ }
+ seenVowel |= isVowel(ch);
+ sb.append(ewts);
+ }
+ // Nothing followed the final consonant (or 0f39), so append
+ // the a vowel: [ka], not [k].
+ if (UnicodeUtils.isNonSubjoinedConsonant(ch)
+ || UnicodeUtils.isSubjoinedConsonant(ch)
+ || '\u0f39' == ch) {
+ ThdlDebug.verify(!added_aVOWEL);
+ sb.append(THDLWylieConstants.WYLIE_aVOWEL);
+ }
+ return sb;
+ }
+
+ public int hashCode() { return nfthdl.hashCode(); }
+
+ public boolean equals(Object o) {
+ return (o instanceof GC && ((GC)o).getNfthdl().equals(getNfthdl()));
+ }
+
+ /** Quasi-XML for humans */
+ public String toString() {
+ return "<GC valid=" + valid + " nfthdl="
+ + UnicodeUtils.unicodeStringToPrettyString(nfthdl)
+ + "/>";
+ }
+}
diff --git a/source/org/thdl/tib/text/reverter/UnicodeToTranslitForXslt.java b/source/org/thdl/tib/text/reverter/UnicodeToTranslitForXslt.java
index 2fceaed..3ff10ed 100644
--- a/source/org/thdl/tib/text/reverter/UnicodeToTranslitForXslt.java
+++ b/source/org/thdl/tib/text/reverter/UnicodeToTranslitForXslt.java
@@ -32,11 +32,12 @@ public class UnicodeToTranslitForXslt {
}
/** Converts Tibetan Unicode to EWTS transliteration. */
- public static String unicodeToEwts(String unicode) {
- return Converter.convertToEwts(unicode, null);
+ public static String unicodeToEwtsForComputers(String unicode) {
+ return Converter.convertToEwtsForComputers(unicode, null);
}
+
/** Converts Tibetan Unicode to ACIP transliteration. */
public static String unicodeToAcip(String unicode) {
- throw new Error("DLC: not yet");
+ throw new Error("TODO(dchandler): not yet");
}
}
diff --git a/source/org/thdl/tib/text/reverter/UnicodeToTranslitForXsltTest.java b/source/org/thdl/tib/text/reverter/UnicodeToTranslitForXsltTest.java
index 9012b49..42a39e5 100644
--- a/source/org/thdl/tib/text/reverter/UnicodeToTranslitForXsltTest.java
+++ b/source/org/thdl/tib/text/reverter/UnicodeToTranslitForXsltTest.java
@@ -50,12 +50,15 @@ public class UnicodeToTranslitForXsltTest extends TestCase {
public UnicodeToTranslitForXsltTest() { }
public void testUnicodeToEwts() {
- assertEquals(UnicodeToTranslitForXslt.unicodeToEwts("\u0f40"), "ka");
- assertEquals(UnicodeToTranslitForXslt.unicodeToEwts("\u0f56\u0f62\u0f4f\u0f42\u0f66\u0f0b"), "brtags ");
+ assertEquals("ka", UnicodeToTranslitForXslt.unicodeToEwtsForComputers("\u0f40"));
+ assertEquals("g+ya", UnicodeToTranslitForXslt.unicodeToEwtsForComputers("\u0f42\u0fb1"));
+ // TODO(dchandler): assertEquals("brtags ", UnicodeToTranslitForXslt.unicodeToEwtsForHumans("\u0f56\u0f62\u0f9f\u0f42\u0f66\u0f0b"));
}
public void testUnicodeToAcip() {
- assertEquals(UnicodeToTranslitForXslt.unicodeToEwts("\u0f40"), "KA");
- assertEquals(UnicodeToTranslitForXslt.unicodeToEwts("\u0f56\u0f62\u0f4f\u0f42\u0f66\u0f0b"), "BRTAGS ");
+ if (false) {
+ assertEquals("KA", UnicodeToTranslitForXslt.unicodeToAcip("\u0f40"));
+ assertEquals("BRTAGS ", UnicodeToTranslitForXslt.unicodeToAcip("\u0f56\u0f62\u0f9f\u0f42\u0f66\u0f0b"));
+ }
}
}
diff --git a/source/org/thdl/tib/text/tshegbar/LegalTshegBar.java b/source/org/thdl/tib/text/tshegbar/LegalTshegBar.java
index d49dd8c..ab3c01b 100644
--- a/source/org/thdl/tib/text/tshegbar/LegalTshegBar.java
+++ b/source/org/thdl/tib/text/tshegbar/LegalTshegBar.java
@@ -463,6 +463,7 @@ public final class LegalTshegBar
* concatenation like 'u'i'o. Returns false otherwise (including
* the case that suffix is the empty string). */
public static boolean isAchungBasedSuffix(String suffix) {
+ // TODO(dchandler): use java.util.regex
int i = 0; // so that the empty string causes false to be returned.
while (i == 0 || !suffix.equals("")) {
boolean startsWithOneOfThem = false;
diff --git a/source/org/thdl/tib/text/tshegbar/UnicodeCodepointToThdlWylie.java b/source/org/thdl/tib/text/tshegbar/UnicodeCodepointToThdlWylie.java
index 928a495..c998cd8 100644
--- a/source/org/thdl/tib/text/tshegbar/UnicodeCodepointToThdlWylie.java
+++ b/source/org/thdl/tib/text/tshegbar/UnicodeCodepointToThdlWylie.java
@@ -67,11 +67,16 @@ public class UnicodeCodepointToThdlWylie {
// fail.
switch (x) {
+ case '\t': return "\t";
+ case '\n': return "\n";
+ case '\r': return "\r";
+ case ' ': return "_";
+ case '\u00a0': return "_";
case '\u0F00': return "oM";
case '\u0F01': return "\\u0F01";
- case '\u0F02': return null; // DLC
- case '\u0F03': return null; // DLC
+ case '\u0F02': return "\\u0F02";
+ case '\u0F03': return "\\u0F03";
case '\u0F04': return "@";
case '\u0F05': return "#";
case '\u0F06': return "$";
@@ -314,8 +319,6 @@ public class UnicodeCodepointToThdlWylie {
case '\u0FCF': return "\\u0FCF"; // DLC i added this to the 'EWTS document misspeaks' bug report... null I think...
default: {
- // DLC handle space (EW's "_")
-
// This codepoint is in the range 0FD0-0FFF or is not in
// the Tibetan range at all. In either case, there is no
// corresponding THDL Extended Wylie.
diff --git a/source/org/thdl/tib/text/tshegbar/UnicodeUtils.java b/source/org/thdl/tib/text/tshegbar/UnicodeUtils.java
index cbf8c27..f8070ed 100644
--- a/source/org/thdl/tib/text/tshegbar/UnicodeUtils.java
+++ b/source/org/thdl/tib/text/tshegbar/UnicodeUtils.java
@@ -102,7 +102,10 @@ public class UnicodeUtils implements UnicodeConstants {
nor NFKD breaks down U+0F00
into its constituent
codepoints. NFTHDL uses a maximum of codepoints, and it never
uses codepoints whose use has been {@link #isDiscouraged(char)
- discouraged}.
+ discouraged}. NFTHDL also does not screw things up by using
+ the standard-but-wrong CCCVs. It sorts stretches of combining
+ characters wisely as per
+ <a href="http://iris.lib.virginia.edu/tibet/xml/showEssay.php?xml=/tools/encodingTib.xml">the Tibetan Encoding Model</a>.
The Tibetan passages of the returned string are in the
chosen normalized form, but codepoints outside of the {@link
@@ -136,6 +139,9 @@ public class UnicodeUtils implements UnicodeConstants {
tibetanUnicode.insert(offset, s);
}
}
+ if (normForm == NORM_NFTHDL) {
+ fixSomeOrderingErrorsInTibetanUnicode(tibetanUnicode);
+ }
}
/** Like {@link #toMostlyDecomposedUnicode(StringBuffer, byte)},
@@ -418,7 +424,39 @@ public class UnicodeUtils implements UnicodeConstants {
* product.)
*/
private static char unicode_pairs[][]
- = { { '\u0f71', '\u0f74' },
+ = {
+ /* TODO(dchandler): use regex
+ * "[\u0f39\u0f71-\u0f84\u0f86\u0f87]{2,}" to find patches
+ * that need sorting and then sort each of those. This
+ * cross product is ugly. */
+
+ { '\u0f39', '\u0f71' },
+ { '\u0f39', '\u0f72' },
+ { '\u0f39', '\u0f74' },
+ { '\u0f39', '\u0f7a' },
+ { '\u0f39', '\u0f7b' },
+ { '\u0f39', '\u0f7c' },
+ { '\u0f39', '\u0f7d' },
+ { '\u0f39', '\u0f7e' },
+ { '\u0f39', '\u0f7f' },
+ { '\u0f39', '\u0f80' },
+ { '\u0f39', '\u0f82' },
+ { '\u0f39', '\u0f83' },
+
+ { '\u0f71', '\u0f7f' },
+ { '\u0f72', '\u0f7f' },
+ { '\u0f74', '\u0f7f' },
+ { '\u0f7a', '\u0f7f' },
+ { '\u0f7b', '\u0f7f' },
+ { '\u0f7c', '\u0f7f' },
+ { '\u0f7d', '\u0f7f' },
+ // but not { '\u0f7e', '\u0f7f' },
+ { '\u0f39', '\u0f7f' },
+ { '\u0f80', '\u0f7f' },
+ { '\u0f82', '\u0f7f' },
+ { '\u0f83', '\u0f7f' },
+
+ { '\u0f71', '\u0f74' },
{ '\u0f71', '\u0f72' },
{ '\u0f71', '\u0f7a' },
@@ -489,7 +527,9 @@ public class UnicodeUtils implements UnicodeConstants {
* the same file modulo Unicode booboos would be better.
*
* @param sb the buffer to be mutated
- * @return true if sb was mutated */
+ * @return true if sb was mutated
+ * @see <a href="http://iris.lib.virginia.edu/tibet/xml/showEssay.php?xml=/tools/encodingTib.xml">Tibetan Encoding Model</a>
+ */
public static boolean fixSomeOrderingErrorsInTibetanUnicode(StringBuffer sb) {
boolean mutated = false;
int len = sb.length();
@@ -512,25 +552,5 @@ public class UnicodeUtils implements UnicodeConstants {
} while (mutated_this_time_through);
return mutated;
}
-
- /** Returns true iff ch is a valid Tibetan codepoint in Unicode
- * 4.0: */
- public boolean isTibetanUnicodeCodepoint(char ch) {
- // NOTE: could use an array of 256 booleans for speed but I'm lazy
- return ((ch >= '\u0f00' && ch <= '\u0fcf')
- && !(ch == '\u0f48'
- || (ch > '\u0f6a' && ch < '\u0f71')
- || (ch > '\u0f8b' && ch < '\u0f90')
- || ch == '\u0f98'
- || ch == '\u0fbd'
- || ch == '\u0fcd'
- || ch == '\u0fce'));
- }
-
- /** Returns true iff ch is in 0F00-0FFF but isn't a valid Tibetan
- * codepoint in Unicode 4.0: */
- public boolean isInvalidTibetanUnicode(char ch) {
- return (isInTibetanRange(ch) && !isTibetanUnicodeCodepoint(ch));
- }
}
diff --git a/source/org/thdl/tib/text/ttt/EWTSTest.java b/source/org/thdl/tib/text/ttt/EWTSTest.java
index dca358c..e1a1f21 100644
--- a/source/org/thdl/tib/text/ttt/EWTSTest.java
+++ b/source/org/thdl/tib/text/ttt/EWTSTest.java
@@ -798,6 +798,7 @@ public class EWTSTest extends TestCase {
just_ewts2uni_test("\\uefff", "\uefff");
}
+ ewts2uni_test("kaHH", "\u0F40\u0f7f\u0f7f");
// Below was semiautomatically generated from the EWTS spec's
// 'ewts.xml' representation (early August 2004 edition):
diff --git a/source/org/thdl/tib/text/ttt/TPairListFactory.java b/source/org/thdl/tib/text/ttt/TPairListFactory.java
index 6fb9e9a..c1afcb8 100644
--- a/source/org/thdl/tib/text/ttt/TPairListFactory.java
+++ b/source/org/thdl/tib/text/ttt/TPairListFactory.java
@@ -405,6 +405,12 @@ class TPairListFactory {
"\u0f74", THDLWylieConstants.u_VOWEL,
+ // TODO(dchandler): equivalence classes I'm not
+ // sure.
+ // http://iris.lib.virginia.edu/tibet/xml/showEssay.php?xml=/tools/encodingTib.xml
+ // says to go above base and then upwards. Think
+ // it over.
+
// equivalence class:
"\u0f72", THDLWylieConstants.i_VOWEL,
"\u0f7a", THDLWylieConstants.e_VOWEL,