From 5788416629e62628e59fa70b85c17900c93b3467 Mon Sep 17 00:00:00 2001 From: dchandler Date: Mon, 1 Aug 2005 05:54:20 +0000 Subject: [PATCH] A reverter that converts Unicode to computer-friendly (but not, yet, human-friendly) EWTS is here in alpha mode. It probably doesn't deal well with non-Tibetan. --- build.xml | 4 +- junitbuild.xml | 2 - .../org/thdl/tib/input/TibetanConverter.java | 5 +- .../org/thdl/tib/text/reverter/Converter.java | 121 ++++++++++- .../thdl/tib/text/reverter/ConverterTest.java | 108 +++++++++- source/org/thdl/tib/text/reverter/GC.java | 200 ++++++++++++++++++ .../reverter/UnicodeToTranslitForXslt.java | 7 +- .../UnicodeToTranslitForXsltTest.java | 11 +- .../thdl/tib/text/tshegbar/LegalTshegBar.java | 1 + .../tshegbar/UnicodeCodepointToThdlWylie.java | 11 +- .../thdl/tib/text/tshegbar/UnicodeUtils.java | 66 ++++-- source/org/thdl/tib/text/ttt/EWTSTest.java | 1 + .../thdl/tib/text/ttt/TPairListFactory.java | 6 + 13 files changed, 496 insertions(+), 47 deletions(-) create mode 100644 source/org/thdl/tib/text/reverter/GC.java diff --git a/build.xml b/build.xml index 9d03e01..38f295d 100644 --- a/build.xml +++ b/build.xml @@ -165,8 +165,8 @@ the jvm starting tomcat: - - + + diff --git a/junitbuild.xml b/junitbuild.xml index 4c79e0f..8b58f04 100644 --- a/junitbuild.xml +++ b/junitbuild.xml @@ -73,10 +73,8 @@ - diff --git a/source/org/thdl/tib/input/TibetanConverter.java b/source/org/thdl/tib/input/TibetanConverter.java index 7def541..2af64ca 100644 --- a/source/org/thdl/tib/input/TibetanConverter.java +++ b/source/org/thdl/tib/input/TibetanConverter.java @@ -350,7 +350,10 @@ public class TibetanConverter implements FontConverterConstants { uniText = s.toString(); } StringBuffer errors = new StringBuffer(); - String ewtsText = Converter.convertToEwts(uniText, errors); + // TODO(dchandler): DLC: use human-friendly EWTS, not + // computer-friendly! 
+ String ewtsText = Converter.convertToEwtsForComputers(uniText, + errors); // TODO(dchandler): is 51 the right choice? return (errors.length() > 0) ? 51 : 0; } catch (IOException e) { diff --git a/source/org/thdl/tib/text/reverter/Converter.java b/source/org/thdl/tib/text/reverter/Converter.java index 623112a..5b693d5 100644 --- a/source/org/thdl/tib/text/reverter/Converter.java +++ b/source/org/thdl/tib/text/reverter/Converter.java @@ -18,6 +18,16 @@ Contributor(s): ______________________________________. package org.thdl.tib.text.reverter; +import java.text.BreakIterator; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.Locale; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.thdl.tib.text.tshegbar.UnicodeUtils; + /** Static methods for converting Unicode to EWTS and * (TODO(dchandler): ACIP). * @author David Chandler @@ -28,11 +38,110 @@ public class Converter { throw new Error("There's no point in instantiating this class."); } - /** Converts Tibetan Unicode to EWTS transliteration. If errors - * is non-null, error messages are appended to it. (Errors are - * always inline.) */ - public static String convertToEwts(String unicode, - StringBuffer errors /* DLC: use it */) { - throw new Error("DLC not yet"); + /** Finds combining character sequences. */ + private static BreakIterator breaker + = BreakIterator.getCharacterInstance(new Locale("bo")); + + + private static final boolean debug = false; + + // TODO(dchandler): use this to create LegalTshegBar objects, it's + // unused right now. + private static Pattern mightBeLegalTshegBarRegex = Pattern.compile( + "^" + + "([\u0f42\u0f51\u0f56\u0f58\u0f60])?" + // root stack: consonant w/ optional wowels: + + "(" + GC.consonantStackRegexString + ")" + + "(([\u0f42\u0f51\u0f56\u0f58\u0f60\u0f44\u0f53\u0f62\u0f63\u0f66][\u0f51\u0f66]?)" + + "|(\u0f60[\u0f72\u0f74\u0f7c\u0f44\u0f58])+)?" 
+ + "$"); + + /** Splits nfthdl into grapheme clusters. Let's define a grapheme + * cluster as something an end user would say cannot be + * decomposed into two separate pieces sensibly. For the most + * part this is just figuring out the combining character + * sequences as defined by Unicode, but (U+0F04 U+0F05*) is + * an example of a grapheme cluster that is not a combining + * character sequence (TODO(dchandler): (0f04 0f05*), is it + * really worth it? We don't handle it right now, might be good + * for Unicode->ACIP anyway.) + * @param nfthdl Unicode in NFTHDL decomposition form + * @return List of GC objects */ + private static List/**/ SplitGC(String nfthdl) { + + if (debug) { + System.out.println("debug: " + + UnicodeUtils.unicodeStringToPrettyString(nfthdl)); + } + ArrayList al = new ArrayList(); + breaker.setText(nfthdl); + int start = breaker.first(); + boolean just_saw_0f7f = false; + for (int end = breaker.next(); + end != BreakIterator.DONE; + start = end, end = breaker.next()) { + if ((just_saw_0f7f + && (Character.getType(nfthdl.charAt(start)) + == Character.NON_SPACING_MARK)) + || (end > start && '\u0f7f' == nfthdl.charAt(start) + && !al.isEmpty())) { + // U+0F7F is a COMBINING_SPACING_MARK, not a + // NON_SPACING_MARK, but we want to treat it like a + // NON_SPACING_MARK. + GC gc = new GC(((GC)al.get(al.size() - 1)).getNfthdl() + + nfthdl.substring(start,end)); + if (debug) { + System.out.println("debug: setting last el, " + + al.get(al.size() - 1) + " to " + gc); + } + al.set(al.size() - 1, gc); + } else { + al.add(new GC(nfthdl.substring(start,end))); + } + just_saw_0f7f + = (end > start && '\u0f7f' == nfthdl.charAt(end - 1)); + } + return al; + } + + /** Converts Tibetan Unicode to computer-friendly EWTS + * transliteration. Computer-friendly is not human-friendly but + * hopefully even poorly written EWTS->Tibetan converters could + * handle the output. If errors is non-null, error messages are + * appended to it. (Errors are always inline.) 
*/ + public static String convertToEwtsForComputers(String unicode, + StringBuffer errors) { + + // First, normalize as much as we can to reduce the number of + // cases we must handle. + String decomposed + = UnicodeUtils.toMostlyDecomposedUnicode(unicode, + UnicodeUtils.NORM_NFTHDL); + + // TODO(dchandler): optionally warn if we see + // "\u0f40\u0f74\u0f71" which is in the wrong order. + + List gcs = SplitGC(decomposed); + + StringBuffer sb = new StringBuffer(); + for (Iterator it = gcs.iterator(); it.hasNext(); ) { + GC gc = (GC)it.next(); + StringBuffer ewts = gc.getEwtsForComputers(); + if (null == ewts) { + // TODO(dchandler): use ErrorsAndWarnings? + ewts = new StringBuffer("[#ERROR 301: The Unicode '" + + gc.getNfthdl() + + "' (has no EWTS transliteration]"); + if (null != errors) { + errors.append(ewts); + errors.append('\n'); + } + } + sb.append(ewts); + } + return sb.toString(); } } + +// TODO(dchandler): give a mode where an error is given if non-Tibetan +// or at least non-EWTS (think U+534D, e.g.) is found diff --git a/source/org/thdl/tib/text/reverter/ConverterTest.java b/source/org/thdl/tib/text/reverter/ConverterTest.java index 5c97876..1f96a2a 100644 --- a/source/org/thdl/tib/text/reverter/ConverterTest.java +++ b/source/org/thdl/tib/text/reverter/ConverterTest.java @@ -20,8 +20,9 @@ package org.thdl.tib.text.reverter; import junit.framework.TestCase; -import org.thdl.util.ThdlOptions; +import org.thdl.tib.text.tshegbar.UnicodeUtils; import org.thdl.tib.text.ttt.ErrorsAndWarnings; +import org.thdl.util.ThdlOptions; /** Tests the Converter class. * @@ -47,9 +48,112 @@ public class ConverterTest extends TestCase { ThdlOptions.setUserPreference("thdl.debug", true); } + /** Asserts that converting s from Unicode to EWTS yields an + * error. 
*/ + private void err(String s) { + StringBuffer sb = new StringBuffer(); + String ewts = Converter.convertToEwtsForComputers(s, sb); + boolean error = (sb.length() > 0); + if (!error) { + System.out.println("expected error but got EWTS '" + ewts + + "' for " + + UnicodeUtils.unicodeStringToPrettyString(s)); + } + assertTrue(error); + } + + /** Tests Converter.convertToEwtsForHumans. */ + private void hconv(String uni, String ewts) { + System.out.println("TODO(dchandler): DLC: implement me"); + } + + /** Tests Converter.convertToEwtsForComputers. */ + private void conv(String uni, String ewts) { + StringBuffer sb = new StringBuffer(); + String actualEwts = Converter.convertToEwtsForComputers(uni, sb); + assertEquals("Expected " + ewts + " but got " + actualEwts + ":\n", + ewts, actualEwts); + boolean error = (sb.length() > 0); + assertTrue(!error); + } + public ConverterTest() { } public void testUnicodeToEwts() { - assertEquals(Converter.convertToEwts("\u0f40", null), "ka"); + conv("\u0f56\u0f62\u0f9f\u0f42\u0f66\u0f0b", "bar+tagasa "); + conv("\u0f40", "ka"); + // TODO(dchandler): DLC Tibetans use Arabic numerals and English punctuation. 
+ // conv("0123456789.\u0f40", "0123456789.ka"); + conv("\u0f40\u0f7b", "kai"); + conv("\u0f40\u0f76", "k+r-i"); + conv("\u0f40\u0020\u0f40", "ka_ka"); + conv("\u0f40\n\u0f40\t\u0f40\r\n", "ka\nka\tka\r\n"); + conv("\u0f04\u0f05\u0f40\u0f0c\u00a0\u0f42", "@#ka*_ga"); + conv("\u0f42\u0f61", "gaya"); + hconv("\u0f42\u0f61", "g.ya"); + conv("\u0f42\u0fb1", "g+ya"); + hconv("\u0f42\u0fb1", "gya"); + conv("\u0f54\u0f7e", "paM"); + conv("\u0f54\u0f71\u0f7e", "pAM"); + conv("\u0f54\u0f7e", "paM"); + conv("\u0f54\u0f74\u0f7e", "puM"); + conv("\u0f54\u0fc6", "p\\u0FC6"); + conv("\u0f40\u0f72\u0f74", "ku+i"); // bottom-to-top + conv("\u0f40\u0f72\u0f74\u0f39", "k^u+i"); // 0f39 first + conv("\u0f40\u0f73", "kI"); + conv("\u0f40\u0f71\u0f72", "kI"); + conv("\u0f40\u0f72\u0f71", "kI"); + conv("\u0f40\u0f73\u0f74", "kU+i"); + err("\u0f48"); + err("\u0f32\u0f39"); + err("\u0f47\u0f98"); + conv("\u0fcc", "\\u0FCC"); + err("\u0fcd"); + err("\u0f90"); + err("\u0f90\u0fc6"); + conv("\u0f0b\u0fc6", " \\u0FC6"); // ugly but legal... + err("\u0f0b\u0f90"); + err("\u0f0b\u0f74"); + err("\u0f0b\u0f7f"); + err("\u0f0b\u0f3e"); + conv("\u0f32\u0f18", "\\u0F32\\u0F18"); + conv("\u0f54\u0fa4\u0f90", "p+p+ka"); + // TODO(dchandler): warn("\u0f54\u0fa4\u0f90\u0f39"); (or do + // CCCVs work for this?) 
+ if (false) { + // 0f39 could go with any of the three, so we give an error: + err("\u0f54\u0fa4\u0f90\u0f74\u0f39"); + } else { + // TODO(dchandler): I want an error, not this: + conv("\u0f54\u0fa4\u0f90\u0f74\u0f39", "p+p+k^u"); + } + conv("\u0f54\u0fa4\u0f90\u0f39", "p+p+k^a"); + conv("\u0f55\u0f39", "fa"); + conv("\u0f55\u0f74\u0f39", "fu"); + conv("\u0f56\u0f39", "va"); + conv("\u0f56\u0f74\u0f39", "vu"); + conv("\u0f54\u0f39\u0fa4\u0f90", "p^+p+ka"); + conv("\u0f40\u0f7e", "kaM"); + conv("\u0f40\u0f83", "ka~M"); + conv("\u0f40\u0f82", "ka~M`"); + conv("\u0f40\u0f84", "ka?"); + conv("\u0f40\u0f85\u0f40", "ka&ka"); + err("\u0f7f"); + conv("\u0f40\u0f7f", "kaH"); + conv("\u0f40\u0f7f\u0f72", "kiH"); + conv("\u0f40\u0f7f\u0f7f\u0f72\u0f7f", "kiHHH"); + conv("\u0f40\u0f7f\u0f7e", "kaHM"); + conv("\u0f40\u0f7e\u0f7f", "kaMH"); + conv("\u0f40\u0f7f\u0f7e\u0f72", "kiHM"); + conv("\u0f04\u0f05", "@#"); + conv("\u0f04\u0f05\u0f05", "@##"); + conv("\u0f04", "@"); // TODO(dchandler): Is this ever seen + // alone? warn/error otherwise. + conv("\u0f05", "#"); // TODO(dchandler): warn or error } } +// TODO(dchandler): DLC: test all these round-trip, i.e. assert that +// Uni->EWTS->Uni produces the same Uni. + +// TODO(dchandler): test with ZWSP or joiners or whatever weird crap +// you can throw in legally to alter boundaries diff --git a/source/org/thdl/tib/text/reverter/GC.java b/source/org/thdl/tib/text/reverter/GC.java new file mode 100644 index 0000000..ed4939a --- /dev/null +++ b/source/org/thdl/tib/text/reverter/GC.java @@ -0,0 +1,200 @@ +/* +The contents of this file are subject to the THDL Open Community License +Version 1.0 (the "License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License on the THDL web site +(http://www.thdl.org/). + +Software distributed under the License is distributed on an "AS IS" basis, +WITHOUT WARRANTY OF ANY KIND, either express or implied. 
See the +License for the specific terms governing rights and limitations under the +License. + +The Initial Developer of this software is the Tibetan and Himalayan Digital +Library (THDL). Portions created by the THDL are Copyright 2005 THDL. +All Rights Reserved. + +Contributor(s): ______________________________________. +*/ + +package org.thdl.tib.text.reverter; + +import java.util.regex.Pattern; +import java.util.regex.Matcher; + +import org.thdl.util.ThdlDebug; +import org.thdl.tib.text.THDLWylieConstants; +import org.thdl.tib.text.tshegbar.UnicodeUtils; +import org.thdl.tib.text.tshegbar.UnicodeCodepointToThdlWylie; + +/** Grapheme cluster backed by a String of Unicode. For the most part + * these are combining character sequences as defined by + * Unicode, but (U+0F04 U+0F05+) [TODO(dchandler): not yet handled as + * a single GC] is an example of a grapheme cluster that is not a + * combining character sequence. + * @author David Chandler + */ +class GC { + /** NFTHDL-decomposed Unicode */ + private String nfthdl; + + /** True if valid. True for digits w/ digit combiners, character + * stack plus optional wowels, a standalone mark. False for + * anything else, e.g. "\u0f0b\u0f90". */ + private boolean valid; + + /** Constructor that takes the NFTHDL-decomposed Unicode for the + * grapheme cluster. */ + public GC(String nfthdl) { + setNfthdl(nfthdl); + } + + /** A regex that matches the NFTHDL Unicode for a consonant stack + * with optional wowels. */ + public static String consonantStackRegexString + = "[\u0f40-\u0f47\u0f49-\u0f6a]" // base consonant + + "[\u0f90-\u0f97\u0f99-\u0fbc\u0f39]*" // subjoined cons. + + "\u0f71?" 
// a-chung + + "[\u0f72\u0f73\u0f74\u0f7a-\u0f7d\u0f80]*" // vowel proper + + "[\u0f35\u0f37\u0f7e\u0f7f\u0f82-\u0f84" // wowels + + "\u0f86\u0f87\u0fc6]*"; + + private static Pattern validGcRegex = Pattern.compile( + "^" + // numeric: + + "([\u0f20-\u0f33][\u0f18\u0f19]*)|" + + // consonant w/ optional wowels: + + "(" + consonantStackRegexString + ")|" + + // other symbol with optional U+0FC6 + + "([\u0f00-\u0f17\u0f1a-\u0f1f\u0f34\u0f36\u0f38" + + "\u0f3a-\u0f3d\u0f85\u0f88-\u0f8b\u0fbe-\u0fc5" + + "\u0fc7-\u0fcc\u0fcf-\u0fd1]\u0fc6?)|" + + // other symbol that does not take U+0FC6. + // TODO(dchandler): include 0f0b etc. in this group? + + "([ \t\u00a0\n\r]{1,})" // DLC handling of English... [0-9\\.:a-zA-Z] etc. what to do? + + + "$"); + + private static final boolean debug = false; + + /** Returns NFTHDL-decomposed Unicode representing this grapheme + * cluster. */ + private void setNfthdl(String nfthdl) { + if (debug) { + System.out.println("debug: GC is " + + UnicodeUtils.unicodeStringToPrettyString(nfthdl)); + } + this.nfthdl = nfthdl; + assert (nfthdl.length() > 0); + if (nfthdl.length() < 1) + valid = false; + valid = validGcRegex.matcher(nfthdl).matches(); + } + + /** Returns NFTHDL-decomposed Unicode representing this grapheme + * cluster. */ + public String getNfthdl() { return nfthdl; } + + /** Returns true iff ch is a vowel proper, not a wowel */ + private boolean isVowel(char ch) { + // (We won't see \u0f76 etc. in NFTHDL, but the handling of + // them is suspect.) + return ((ch >= '\u0f71' && ch <= '\u0f75') + || (ch >= '\u0f7a' && ch <= '\u0f7d') + || (ch >= '\u0f81' && ch <= '\u0f82')); + } + + private boolean isWowelRequiringPrecedingVowel(char ch) { + // not 0f39 0f18 0f19 e.g. + return ("\u0f35\u0f37\u0f7e\u0f7f\u0f82\u0f83\u0f84\u0f86\u0f87".indexOf(ch) >= 0); + + // NOTE: 0f7f is questionable 0fc6 too... we assume [k\\u0fc6] + // is good EWTS. + } + + /** Returns EWTS that is valid but not beautiful. 
It's better + * suited for consumption by computer programs than by humans, + * though it'll do in a pinch. (Humans like to see [rnams] instead + * of [r+namasa].) + * @return null if this grapheme cluster has no valid EWTS + * representation or valid-but-ugly EWTS otherwise */ + public StringBuffer getEwtsForComputers() { + if (!valid) { + return null; + } + StringBuffer sb = new StringBuffer(); + // We use ch after the loop. Initialization is not really + // needed; it's just to avoid compiler errors. + char ch = 'X'; + boolean seenVowel = false; + String lastEwts = ""; + boolean added_aVOWEL = false; + for (int i = 0; i < nfthdl.length(); i++) { + ch = nfthdl.charAt(i); + String ewts + = UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(ch); + if (i + 1 < nfthdl.length()) { // lookahead + // Even computers want to see kI because the spec + // isn't (or at least hasn't always been) crystal + // clear that kA+i is equivalent to kI. + if (('\u0f55' == ch || '\u0fa5' == ch) + && '\u0f39' == nfthdl.charAt(i + 1)) { + ++i; + ewts = "f"; // TODO(dchandler): hard-coded EWTS + } else if (('\u0f56' == ch || '\u0fa6' == ch) + && '\u0f39' == nfthdl.charAt(i + 1)) { + ++i; + ewts = "v"; // TODO(dchandler): hard-coded EWTS + } else if ('\u0f71' == ch && '\u0f72' == nfthdl.charAt(i + 1)) { + ++i; + ewts = THDLWylieConstants.I_VOWEL; + // NOTE: we could normalize to 0f73 and 0f75 when + // possible in NFTHDL. That's closer to EWTS and + // would avoid these two special cases. 
+ } else if ('\u0f71' == ch && '\u0f74' == nfthdl.charAt(i + 1)) { + ++i; + ewts = THDLWylieConstants.U_VOWEL; + } + } + if (null == ewts && UnicodeUtils.isInTibetanRange(ch)) { + return null; + } + if (UnicodeUtils.isSubjoinedConsonant(ch) + || (seenVowel && isVowel(ch))) + sb.append(THDLWylieConstants.WYLIE_SANSKRIT_STACKING_KEY); + if (isWowelRequiringPrecedingVowel(ch) && !seenVowel) { + if (!added_aVOWEL) { + added_aVOWEL = true; + sb.append(THDLWylieConstants.WYLIE_aVOWEL); // paM, no pM + } + } + if (isVowel(ch)) { + seenVowel = true; + } + sb.append(ewts); + lastEwts = ewts; + } + if (UnicodeUtils.isNonSubjoinedConsonant(ch) + || UnicodeUtils.isSubjoinedConsonant(ch) + || '\u0f39' == ch) { + ThdlDebug.verify(!added_aVOWEL); + sb.append(THDLWylieConstants.WYLIE_aVOWEL); + } + return sb; + } + + public int hashCode() { return nfthdl.hashCode(); } + + public boolean equals(Object o) { + return (o instanceof GC && ((GC)o).getNfthdl().equals(getNfthdl())); + } + + /** Quasi-XML for humans */ + public String toString() { + return ""; + } +} diff --git a/source/org/thdl/tib/text/reverter/UnicodeToTranslitForXslt.java b/source/org/thdl/tib/text/reverter/UnicodeToTranslitForXslt.java index 2fceaed..3ff10ed 100644 --- a/source/org/thdl/tib/text/reverter/UnicodeToTranslitForXslt.java +++ b/source/org/thdl/tib/text/reverter/UnicodeToTranslitForXslt.java @@ -32,11 +32,12 @@ public class UnicodeToTranslitForXslt { } /** Converts Tibetan Unicode to EWTS transliteration. */ - public static String unicodeToEwts(String unicode) { - return Converter.convertToEwts(unicode, null); + public static String unicodeToEwtsForComputers(String unicode) { + return Converter.convertToEwtsForComputers(unicode, null); } + /** Converts Tibetan Unicode to ACIP transliteration. 
*/ public static String unicodeToAcip(String unicode) { - throw new Error("DLC: not yet"); + throw new Error("TODO(dchandler): not yet"); } } diff --git a/source/org/thdl/tib/text/reverter/UnicodeToTranslitForXsltTest.java b/source/org/thdl/tib/text/reverter/UnicodeToTranslitForXsltTest.java index 9012b49..42a39e5 100644 --- a/source/org/thdl/tib/text/reverter/UnicodeToTranslitForXsltTest.java +++ b/source/org/thdl/tib/text/reverter/UnicodeToTranslitForXsltTest.java @@ -50,12 +50,15 @@ public class UnicodeToTranslitForXsltTest extends TestCase { public UnicodeToTranslitForXsltTest() { } public void testUnicodeToEwts() { - assertEquals(UnicodeToTranslitForXslt.unicodeToEwts("\u0f40"), "ka"); - assertEquals(UnicodeToTranslitForXslt.unicodeToEwts("\u0f56\u0f62\u0f4f\u0f42\u0f66\u0f0b"), "brtags "); + assertEquals("ka", UnicodeToTranslitForXslt.unicodeToEwtsForComputers("\u0f40")); + assertEquals("g+ya", UnicodeToTranslitForXslt.unicodeToEwtsForComputers("\u0f42\u0fb1")); + // TODO(dchandler): assertEquals("brtags ", UnicodeToTranslitForXslt.unicodeToEwtsForHumans("\u0f56\u0f62\u0f9f\u0f42\u0f66\u0f0b")); } public void testUnicodeToAcip() { - assertEquals(UnicodeToTranslitForXslt.unicodeToEwts("\u0f40"), "KA"); - assertEquals(UnicodeToTranslitForXslt.unicodeToEwts("\u0f56\u0f62\u0f4f\u0f42\u0f66\u0f0b"), "BRTAGS "); + if (false) { + assertEquals("KA", UnicodeToTranslitForXslt.unicodeToAcip("\u0f40")); + assertEquals("BRTAGS ", UnicodeToTranslitForXslt.unicodeToAcip("\u0f56\u0f62\u0f9f\u0f42\u0f66\u0f0b")); + } } } diff --git a/source/org/thdl/tib/text/tshegbar/LegalTshegBar.java b/source/org/thdl/tib/text/tshegbar/LegalTshegBar.java index d49dd8c..ab3c01b 100644 --- a/source/org/thdl/tib/text/tshegbar/LegalTshegBar.java +++ b/source/org/thdl/tib/text/tshegbar/LegalTshegBar.java @@ -463,6 +463,7 @@ public final class LegalTshegBar * concatenation like 'u'i'o. Returns false otherwise (including * the case that suffix is the empty string). 
*/ public static boolean isAchungBasedSuffix(String suffix) { + // TODO(dchandler): use java.util.regex int i = 0; // so that the empty string causes false to be returned. while (i == 0 || !suffix.equals("")) { boolean startsWithOneOfThem = false; diff --git a/source/org/thdl/tib/text/tshegbar/UnicodeCodepointToThdlWylie.java b/source/org/thdl/tib/text/tshegbar/UnicodeCodepointToThdlWylie.java index 928a495..c998cd8 100644 --- a/source/org/thdl/tib/text/tshegbar/UnicodeCodepointToThdlWylie.java +++ b/source/org/thdl/tib/text/tshegbar/UnicodeCodepointToThdlWylie.java @@ -67,11 +67,16 @@ public class UnicodeCodepointToThdlWylie { // fail. switch (x) { + case '\t': return "\t"; + case '\n': return "\n"; + case '\r': return "\r"; + case ' ': return "_"; + case '\u00a0': return "_"; case '\u0F00': return "oM"; case '\u0F01': return "\\u0F01"; - case '\u0F02': return null; // DLC - case '\u0F03': return null; // DLC + case '\u0F02': return "\\u0F02"; + case '\u0F03': return "\\u0F03"; case '\u0F04': return "@"; case '\u0F05': return "#"; case '\u0F06': return "$"; @@ -314,8 +319,6 @@ public class UnicodeCodepointToThdlWylie { case '\u0FCF': return "\\u0FCF"; // DLC i added this to the 'EWTS document misspeaks' bug report... null I think... default: { - // DLC handle space (EW's "_") - // This codepoint is in the range 0FD0-0FFF or is not in // the Tibetan range at all. In either case, there is no // corresponding THDL Extended Wylie. diff --git a/source/org/thdl/tib/text/tshegbar/UnicodeUtils.java b/source/org/thdl/tib/text/tshegbar/UnicodeUtils.java index cbf8c27..f8070ed 100644 --- a/source/org/thdl/tib/text/tshegbar/UnicodeUtils.java +++ b/source/org/thdl/tib/text/tshegbar/UnicodeUtils.java @@ -102,7 +102,10 @@ public class UnicodeUtils implements UnicodeConstants { nor NFKD breaks down U+0F00 into its constituent codepoints. NFTHDL uses a maximum of codepoints, and it never uses codepoints whose use has been {@link #isDiscouraged(char) - discouraged}. + discouraged}. 
NFTHDL also avoids relying on the + standard-but-wrong CCCVs (canonical combining class values). It sorts stretches of combining + characters in the order described by the + <a href="http://iris.lib.virginia.edu/tibet/xml/showEssay.php?xml=/tools/encodingTib.xml">Tibetan Encoding Model</a> essay.

The Tibetan passages of the returned string are in the chosen normalized form, but codepoints outside of the {@link @@ -136,6 +139,9 @@ public class UnicodeUtils implements UnicodeConstants { tibetanUnicode.insert(offset, s); } } + if (normForm == NORM_NFTHDL) { + fixSomeOrderingErrorsInTibetanUnicode(tibetanUnicode); + } } /** Like {@link #toMostlyDecomposedUnicode(StringBuffer, byte)}, @@ -418,7 +424,39 @@ public class UnicodeUtils implements UnicodeConstants { * product.) */ private static char unicode_pairs[][] - = { { '\u0f71', '\u0f74' }, + = { + /* TODO(dchandler): use regex + * "[\u0f39\u0f71-\u0f84\u0f86\u0f87]{2,}" to find patches + * that need sorting and then sort each of those. This + * cross product is ugly. */ + + { '\u0f39', '\u0f71' }, + { '\u0f39', '\u0f72' }, + { '\u0f39', '\u0f74' }, + { '\u0f39', '\u0f7a' }, + { '\u0f39', '\u0f7b' }, + { '\u0f39', '\u0f7c' }, + { '\u0f39', '\u0f7d' }, + { '\u0f39', '\u0f7e' }, + { '\u0f39', '\u0f7f' }, + { '\u0f39', '\u0f80' }, + { '\u0f39', '\u0f82' }, + { '\u0f39', '\u0f83' }, + + { '\u0f71', '\u0f7f' }, + { '\u0f72', '\u0f7f' }, + { '\u0f74', '\u0f7f' }, + { '\u0f7a', '\u0f7f' }, + { '\u0f7b', '\u0f7f' }, + { '\u0f7c', '\u0f7f' }, + { '\u0f7d', '\u0f7f' }, + // but not { '\u0f7e', '\u0f7f' }, + { '\u0f39', '\u0f7f' }, + { '\u0f80', '\u0f7f' }, + { '\u0f82', '\u0f7f' }, + { '\u0f83', '\u0f7f' }, + + { '\u0f71', '\u0f74' }, { '\u0f71', '\u0f72' }, { '\u0f71', '\u0f7a' }, @@ -489,7 +527,9 @@ public class UnicodeUtils implements UnicodeConstants { * the same file modulo Unicode booboos would be better.

* * @param sb the buffer to be mutated - * @return true if sb was mutated */ + * @return true if sb was mutated + * @see Tibetan Encoding Model + */ public static boolean fixSomeOrderingErrorsInTibetanUnicode(StringBuffer sb) { boolean mutated = false; int len = sb.length(); @@ -512,25 +552,5 @@ public class UnicodeUtils implements UnicodeConstants { } while (mutated_this_time_through); return mutated; } - - /** Returns true iff ch is a valid Tibetan codepoint in Unicode - * 4.0: */ - public boolean isTibetanUnicodeCodepoint(char ch) { - // NOTE: could use an array of 256 booleans for speed but I'm lazy - return ((ch >= '\u0f00' && ch <= '\u0fcf') - && !(ch == '\u0f48' - || (ch > '\u0f6a' && ch < '\u0f71') - || (ch > '\u0f8b' && ch < '\u0f90') - || ch == '\u0f98' - || ch == '\u0fbd' - || ch == '\u0fcd' - || ch == '\u0fce')); - } - - /** Returns true iff ch is in 0F00-0FFF but isn't a valid Tibetan - * codepoint in Unicode 4.0: */ - public boolean isInvalidTibetanUnicode(char ch) { - return (isInTibetanRange(ch) && !isTibetanUnicodeCodepoint(ch)); - } } diff --git a/source/org/thdl/tib/text/ttt/EWTSTest.java b/source/org/thdl/tib/text/ttt/EWTSTest.java index dca358c..e1a1f21 100644 --- a/source/org/thdl/tib/text/ttt/EWTSTest.java +++ b/source/org/thdl/tib/text/ttt/EWTSTest.java @@ -798,6 +798,7 @@ public class EWTSTest extends TestCase { just_ewts2uni_test("\\uefff", "\uefff"); } + ewts2uni_test("kaHH", "\u0F40\u0f7f\u0f7f"); // Below was semiautomatically generated from the EWTS spec's // 'ewts.xml' representation (early August 2004 edition): diff --git a/source/org/thdl/tib/text/ttt/TPairListFactory.java b/source/org/thdl/tib/text/ttt/TPairListFactory.java index 6fb9e9a..c1afcb8 100644 --- a/source/org/thdl/tib/text/ttt/TPairListFactory.java +++ b/source/org/thdl/tib/text/ttt/TPairListFactory.java @@ -405,6 +405,12 @@ class TPairListFactory { "\u0f74", THDLWylieConstants.u_VOWEL, + // TODO(dchandler): equivalence classes I'm not + // sure. 
+ // http://iris.lib.virginia.edu/tibet/xml/showEssay.php?xml=/tools/encodingTib.xml + // says to go above base and then upwards. Think + // it over. + // equivalence class: "\u0f72", THDLWylieConstants.i_VOWEL, "\u0f7a", THDLWylieConstants.e_VOWEL,