A reverter that converts Unicode to computer-friendly (but not, yet,

human-friendly) EWTS is here in alpha mode.  It probably doesn't deal
well with non-Tibetan.
This commit is contained in:
dchandler 2005-08-01 05:54:20 +00:00
parent 00afd75362
commit 5788416629
13 changed files with 496 additions and 47 deletions

View file

@ -18,6 +18,16 @@ Contributor(s): ______________________________________.
package org.thdl.tib.text.reverter;
import java.text.BreakIterator;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.thdl.tib.text.tshegbar.UnicodeUtils;
/** Static methods for converting Unicode to EWTS and
* (TODO(dchandler): ACIP).
* @author David Chandler
@ -28,11 +38,110 @@ public class Converter {
throw new Error("There's no point in instantiating this class.");
}
/** Converts Tibetan Unicode to EWTS transliteration. If errors
* is non-null, error messages are appended to it. (Errors are
* always inline.) */
public static String convertToEwts(String unicode,
StringBuffer errors /* DLC: use it */) {
throw new Error("DLC not yet");
/** Character-boundary iterator, created for the "bo" (Tibetan) locale,
* that SplitGC uses to find combining character sequences.
* NOTE(review): this one instance is shared and SplitGC calls
* setText on it, so concurrent conversions would interfere with
* each other -- confirm that callers are single-threaded. */
private static BreakIterator breaker
= BreakIterator.getCharacterInstance(new Locale("bo"));
/** Compile-time switch; when true, SplitGC prints tracing output to
* System.out. */
private static final boolean debug = false;
// TODO(dchandler): use this to create LegalTshegBar objects, it's
// unused right now.
// Shape of the pattern, in order: an optional single character from
// the first (five-member) class; then exactly one consonant stack as
// defined by GC.consonantStackRegexString; then, optionally, either
// one character from the ten-member class followed by an optional
// U+0F51 or U+0F66, or one or more repetitions of U+0F60 followed by
// one of U+0F72, U+0F74, U+0F7C, U+0F44, U+0F58.
private static Pattern mightBeLegalTshegBarRegex = Pattern.compile(
"^"
+ "([\u0f42\u0f51\u0f56\u0f58\u0f60])?"
// root stack: consonant w/ optional wowels:
+ "(" + GC.consonantStackRegexString + ")"
+ "(([\u0f42\u0f51\u0f56\u0f58\u0f60\u0f44\u0f53\u0f62\u0f63\u0f66][\u0f51\u0f66]?)"
+ "|(\u0f60[\u0f72\u0f74\u0f7c\u0f44\u0f58])+)?"
+ "$");
/** Breaks NFTHDL-normalized text into grapheme clusters.  A grapheme
 *  cluster is taken to be a unit that an end user would say cannot
 *  sensibly be split into two separate pieces.  Mostly these are the
 *  <em>combining character sequences</em> defined by Unicode, but
 *  (U+0F04 U+0F05*) is an example of a grapheme cluster that is not
 *  a combining character sequence (TODO(dchandler): (0f04 0f05*), is
 *  it really worth it?  We don't handle it right now, might be good
 *  for Unicode->ACIP anyway.)
 *  @param nfthdl Unicode in NFTHDL decomposition form
 *  @return List of GC objects, in text order */
private static List/*<GC>*/ SplitGC(String nfthdl) {
    if (debug) {
        System.out.println("debug: "
                           + UnicodeUtils.unicodeStringToPrettyString(nfthdl));
    }
    ArrayList clusters = new ArrayList();
    breaker.setText(nfthdl);

    // Remembers whether the piece handled in the previous round
    // ended with U+0F7F.
    boolean prevEndedWith0f7f = false;
    int from = breaker.first();
    int to = breaker.next();
    while (BreakIterator.DONE != to) {
        // U+0F7F is a COMBINING_SPACING_MARK, not a NON_SPACING_MARK,
        // but we want to treat it like a NON_SPACING_MARK.  So a
        // piece is glued onto the previous cluster either when it is
        // a non-spacing mark right after a U+0F7F, or when it starts
        // with U+0F7F and there is a previous cluster to glue to.
        boolean glueToPrevious;
        if (prevEndedWith0f7f
            && Character.NON_SPACING_MARK
               == Character.getType(nfthdl.charAt(from))) {
            glueToPrevious = true;
        } else {
            glueToPrevious = (to > from
                              && nfthdl.charAt(from) == '\u0f7f'
                              && !clusters.isEmpty());
        }

        String piece = nfthdl.substring(from, to);
        if (glueToPrevious) {
            int lastIndex = clusters.size() - 1;
            GC merged
                = new GC(((GC)clusters.get(lastIndex)).getNfthdl() + piece);
            if (debug) {
                System.out.println("debug: setting last el, "
                                   + clusters.get(lastIndex) + " to " + merged);
            }
            clusters.set(lastIndex, merged);
        } else {
            clusters.add(new GC(piece));
        }

        prevEndedWith0f7f = (to > from && nfthdl.charAt(to - 1) == '\u0f7f');
        from = to;
        to = breaker.next();
    }
    return clusters;
}
/** Converts Tibetan Unicode to computer-friendly EWTS
 *  transliteration.  Computer-friendly is not human-friendly but
 *  hopefully even poorly written EWTS->Tibetan converters could
 *  handle the output.
 *  @param unicode the Unicode text to convert; it is normalized to
 *  NFTHDL before being split into grapheme clusters
 *  @param errors if non-null, each error message is appended to it,
 *  followed by a newline.  (Errors are always inline in the return
 *  value too, whether or not errors is null.)
 *  @return the EWTS for each grapheme cluster, concatenated in
 *  order, with an "[#ERROR 301: ...]" marker standing in for any
 *  cluster that has no EWTS transliteration */
public static String convertToEwtsForComputers(String unicode,
                                               StringBuffer errors) {
    // First, normalize as much as we can to reduce the number of
    // cases we must handle.
    String decomposed
        = UnicodeUtils.toMostlyDecomposedUnicode(unicode,
                                                 UnicodeUtils.NORM_NFTHDL);

    // TODO(dchandler): optionally warn if we see
    // "\u0f40\u0f74\u0f71" which is in the wrong order.

    List gcs = SplitGC(decomposed);
    StringBuffer sb = new StringBuffer();
    for (Iterator it = gcs.iterator(); it.hasNext(); ) {
        GC gc = (GC)it.next();
        StringBuffer ewts = gc.getEwtsForComputers();
        if (null == ewts) {
            // TODO(dchandler): use ErrorsAndWarnings?
            // The message used to contain a stray, never-closed "("
            // before "has"; it is removed so the marker reads cleanly.
            ewts = new StringBuffer("[#ERROR 301: The Unicode '"
                                    + gc.getNfthdl()
                                    + "' has no EWTS transliteration]");
            if (null != errors) {
                errors.append(ewts);
                errors.append('\n');
            }
        }
        sb.append(ewts);
    }
    return sb.toString();
}
}
// TODO(dchandler): give a mode where an error is given if non-Tibetan
// or at least non-EWTS (think U+534D, e.g.) is found

View file

@ -20,8 +20,9 @@ package org.thdl.tib.text.reverter;
import junit.framework.TestCase;
import org.thdl.util.ThdlOptions;
import org.thdl.tib.text.tshegbar.UnicodeUtils;
import org.thdl.tib.text.ttt.ErrorsAndWarnings;
import org.thdl.util.ThdlOptions;
/** Tests the Converter class.
*
@ -47,9 +48,112 @@ public class ConverterTest extends TestCase {
ThdlOptions.setUserPreference("thdl.debug", true);
}
/** Checks that converting s from Unicode to EWTS reports at least
 *  one error.  When no error was reported, the EWTS that came back
 *  is printed to System.out before the assertion fails, to make the
 *  failure easier to diagnose. */
private void err(String s) {
    StringBuffer errors = new StringBuffer();
    String ewts = Converter.convertToEwtsForComputers(s, errors);
    if (errors.length() <= 0) {
        System.out.println("expected error but got EWTS '" + ewts
                           + "' for "
                           + UnicodeUtils.unicodeStringToPrettyString(s));
    }
    assertTrue(errors.length() > 0);
}
/** Placeholder for a check of Converter.convertToEwtsForHumans.
 *  Neither argument is used yet; the method only prints a reminder
 *  that it is unimplemented. */
private void hconv(String uni, String ewts) {
    final String reminder = "TODO(dchandler): DLC: implement me";
    System.out.println(reminder);
}
/** Checks that Converter.convertToEwtsForComputers turns uni into
 *  exactly ewts and, after that, that no error messages were
 *  collected along the way. */
private void conv(String uni, String ewts) {
    StringBuffer errors = new StringBuffer();
    String actualEwts = Converter.convertToEwtsForComputers(uni, errors);
    String failureMessage
        = "Expected " + ewts + " but got " + actualEwts + ":\n";
    assertEquals(failureMessage, ewts, actualEwts);
    // An empty buffer means the conversion was error-free.
    assertTrue(errors.length() <= 0);
}
/** No-argument constructor with an empty body. */
public ConverterTest() { }
/** Table of Unicode-to-EWTS fixtures.  conv(...) entries must convert
* to exactly the given EWTS with no errors collected; err(...)
* entries must collect at least one error; hconv(...) entries are
* placeholders for the not-yet-written human-friendly converter. */
public void testUnicodeToEwts() {
// NOTE(review): this is the only check here that calls
// Converter.convertToEwts rather than going through conv/err
// (which use convertToEwtsForComputers); it looks like a leftover
// from the old API -- confirm.
assertEquals(Converter.convertToEwts("\u0f40", null), "ka");
conv("\u0f56\u0f62\u0f9f\u0f42\u0f66\u0f0b", "bar+tagasa ");
conv("\u0f40", "ka");
// TODO(dchandler): DLC Tibetans use Arabic numerals and English punctuation.
// conv("0123456789.\u0f40", "0123456789.ka");
conv("\u0f40\u0f7b", "kai");
conv("\u0f40\u0f76", "k+r-i");
// Whitespace: U+0020 and U+00A0 come out as '_'; newline, tab and
// carriage return come out unchanged.
conv("\u0f40\u0020\u0f40", "ka_ka");
conv("\u0f40\n\u0f40\t\u0f40\r\n", "ka\nka\tka\r\n");
conv("\u0f04\u0f05\u0f40\u0f0c\u00a0\u0f42", "@#ka*_ga");
// Same letters, full-form versus subjoined second consonant.
conv("\u0f42\u0f61", "gaya");
hconv("\u0f42\u0f61", "g.ya");
conv("\u0f42\u0fb1", "g+ya");
hconv("\u0f42\u0fb1", "gya");
// Marks after a stack with no explicit vowel get an 'a' first.
conv("\u0f54\u0f7e", "paM");
conv("\u0f54\u0f71\u0f7e", "pAM");
conv("\u0f54\u0f7e", "paM");
conv("\u0f54\u0f74\u0f7e", "puM");
conv("\u0f54\u0fc6", "p\\u0FC6");
// Multiple vowels on one stack.
conv("\u0f40\u0f72\u0f74", "ku+i"); // bottom-to-top
conv("\u0f40\u0f72\u0f74\u0f39", "k^u+i"); // 0f39 first
conv("\u0f40\u0f73", "kI");
conv("\u0f40\u0f71\u0f72", "kI");
conv("\u0f40\u0f72\u0f71", "kI");
conv("\u0f40\u0f73\u0f74", "kU+i");
// Inputs that must be reported as errors, mixed with symbols that
// are emitted as escaped code points.
err("\u0f48");
err("\u0f32\u0f39");
err("\u0f47\u0f98");
conv("\u0fcc", "\\u0FCC");
err("\u0fcd");
err("\u0f90");
err("\u0f90\u0fc6");
conv("\u0f0b\u0fc6", " \\u0FC6"); // ugly but legal...
err("\u0f0b\u0f90");
err("\u0f0b\u0f74");
err("\u0f0b\u0f7f");
err("\u0f0b\u0f3e");
conv("\u0f32\u0f18", "\\u0F32\\u0F18");
conv("\u0f54\u0fa4\u0f90", "p+p+ka");
// TODO(dchandler): warn("\u0f54\u0fa4\u0f90\u0f39"); (or do
// CCCVs work for this?)
if (false) {
// 0f39 could go with any of the three, so we give an error:
err("\u0f54\u0fa4\u0f90\u0f74\u0f39");
} else {
// TODO(dchandler): I want an error, not this:
conv("\u0f54\u0fa4\u0f90\u0f74\u0f39", "p+p+k^u");
}
// U+0F39 handling: '^' in general, but 'f' and 'v' after U+0F55
// and U+0F56 respectively.
conv("\u0f54\u0fa4\u0f90\u0f39", "p+p+k^a");
conv("\u0f55\u0f39", "fa");
conv("\u0f55\u0f74\u0f39", "fu");
conv("\u0f56\u0f39", "va");
conv("\u0f56\u0f74\u0f39", "vu");
conv("\u0f54\u0f39\u0fa4\u0f90", "p^+p+ka");
conv("\u0f40\u0f7e", "kaM");
conv("\u0f40\u0f83", "ka~M");
conv("\u0f40\u0f82", "ka~M`");
conv("\u0f40\u0f84", "ka?");
conv("\u0f40\u0f85\u0f40", "ka&ka");
// U+0F7F: an error on its own, 'H' when it follows a stack, in
// various orders relative to vowels and other marks.
err("\u0f7f");
conv("\u0f40\u0f7f", "kaH");
conv("\u0f40\u0f7f\u0f72", "kiH");
conv("\u0f40\u0f7f\u0f7f\u0f72\u0f7f", "kiHHH");
conv("\u0f40\u0f7f\u0f7e", "kaHM");
conv("\u0f40\u0f7e\u0f7f", "kaMH");
conv("\u0f40\u0f7f\u0f7e\u0f72", "kiHM");
conv("\u0f04\u0f05", "@#");
conv("\u0f04\u0f05\u0f05", "@##");
conv("\u0f04", "@"); // TODO(dchandler): Is this ever seen
// alone? warn/error otherwise.
conv("\u0f05", "#"); // TODO(dchandler): warn or error
}
}
// TODO(dchandler): DLC: test all these round-trip, i.e. assert that
// Uni->EWTS->Uni produces the same Uni.
// TODO(dchandler): test with ZWSP or joiners or whatever weird crap
// you can throw in legally to alter boundaries

View file

@ -0,0 +1,200 @@
/*
The contents of this file are subject to the THDL Open Community License
Version 1.0 (the "License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License on the THDL web site
(http://www.thdl.org/).
Software distributed under the License is distributed on an "AS IS" basis,
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
License for the specific terms governing rights and limitations under the
License.
The Initial Developer of this software is the Tibetan and Himalayan Digital
Library (THDL). Portions created by the THDL are Copyright 2005 THDL.
All Rights Reserved.
Contributor(s): ______________________________________.
*/
package org.thdl.tib.text.reverter;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
import org.thdl.util.ThdlDebug;
import org.thdl.tib.text.THDLWylieConstants;
import org.thdl.tib.text.tshegbar.UnicodeUtils;
import org.thdl.tib.text.tshegbar.UnicodeCodepointToThdlWylie;
/** Grapheme cluster backed by a String of Unicode.  For the most part
 *  these are <em>combining character sequences</em> as defined by
 *  Unicode, but (U+0F04 U+0F05+) [TODO(dchandler): not yet handled as
 *  a single GC] is an example of a grapheme cluster that is not a
 *  combining character sequence.
 *
 *  <p>Validity is decided once, when the text is stored, by matching
 *  it against a fixed regular expression; an invalid cluster has no
 *  EWTS.
 *  @author David Chandler
 */
class GC {
    /** NFTHDL-decomposed Unicode */
    private String nfthdl;

    /** True if valid.  True for digits w/ digit combiners, character
     *  stack plus optional wowels, a standalone mark.  False for
     *  anything else, e.g. U+0F0B followed by U+0F90. */
    private boolean valid;

    /** Constructor that takes the NFTHDL-decomposed Unicode for the
     *  grapheme cluster.
     *  @param nfthdl non-empty NFTHDL-decomposed Unicode */
    public GC(String nfthdl) {
        setNfthdl(nfthdl);
    }

    /** A regex that matches the NFTHDL Unicode for a consonant stack
     *  with optional wowels.  It is a constant; other classes embed
     *  it in larger patterns. */
    public static final String consonantStackRegexString
        = "[\u0f40-\u0f47\u0f49-\u0f6a]" // base consonant
        + "[\u0f90-\u0f97\u0f99-\u0fbc\u0f39]*" // subjoined cons.
        + "\u0f71?" // a-chung
        + "[\u0f72\u0f73\u0f74\u0f7a-\u0f7d\u0f80]*" // vowel proper
        + "[\u0f35\u0f37\u0f7e\u0f7f\u0f82-\u0f84" // wowels
        + "\u0f86\u0f87\u0fc6]*";

    /** Matches every cluster considered valid.  It is only ever used
     *  with Matcher.matches(), which requires the whole input to
     *  match, so it does not matter that the leading "^" and trailing
     *  "$" each bind to just one of the alternatives. */
    private static final Pattern validGcRegex = Pattern.compile(
        "^"
        // numeric:
        + "([\u0f20-\u0f33][\u0f18\u0f19]*)|"
        // consonant w/ optional wowels:
        + "(" + consonantStackRegexString + ")|"
        // other symbol with optional U+0FC6
        + "([\u0f00-\u0f17\u0f1a-\u0f1f\u0f34\u0f36\u0f38"
        + "\u0f3a-\u0f3d\u0f85\u0f88-\u0f8b\u0fbe-\u0fc5"
        + "\u0fc7-\u0fcc\u0fcf-\u0fd1]\u0fc6?)|"
        // other symbol that does not take U+0FC6.
        // TODO(dchandler): include 0f0b etc. in this group?
        + "([ \t\u00a0\n\r]{1,})" // DLC handling of English... [0-9\\.:a-zA-Z] etc. what to do?
        + "$");

    private static final boolean debug = false;

    /** Stores the NFTHDL-decomposed Unicode for this grapheme cluster
     *  and recomputes whether the cluster is valid.
     *  @param nfthdl NFTHDL-decomposed Unicode; expected to be
     *  non-empty (asserted).  An empty string is never valid. */
    private void setNfthdl(String nfthdl) {
        if (debug) {
            System.out.println("debug: GC is "
                               + UnicodeUtils.unicodeStringToPrettyString(nfthdl));
        }
        this.nfthdl = nfthdl;
        assert (nfthdl.length() > 0);
        // The emptiness test used to sit in a separate "if" whose
        // result was immediately overwritten; it is folded into the
        // one assignment here.
        valid = (nfthdl.length() > 0
                 && validGcRegex.matcher(nfthdl).matches());
    }

    /** Returns NFTHDL-decomposed Unicode representing this grapheme
     *  cluster. */
    public String getNfthdl() { return nfthdl; }

    /** Returns true iff ch is a vowel proper, not a wowel.  This
     *  agrees with consonantStackRegexString, which lists U+0F80
     *  among the vowels proper and U+0F82 among the wowels.  (The
     *  last range used to be U+0F81..U+0F82, which missed U+0F80 and
     *  wrongly took in U+0F82, so a vowel followed by U+0F82 got a
     *  spurious stacking key and a wowel after U+0F80 got a spurious
     *  inherent vowel.) */
    private boolean isVowel(char ch) {
        // (We won't see U+0F76 etc. in NFTHDL, but the handling of
        // them is suspect.)
        return ((ch >= '\u0f71' && ch <= '\u0f75')
                || (ch >= '\u0f7a' && ch <= '\u0f7d')
                || (ch >= '\u0f80' && ch <= '\u0f81'));
    }

    /** Returns true iff ch is one of the wowels that must be written
     *  after a vowel in EWTS, so that an explicit inherent vowel has
     *  to be supplied when the stack has none. */
    private boolean isWowelRequiringPrecedingVowel(char ch) {
        // not 0f39 0f18 0f19 e.g.
        return ("\u0f35\u0f37\u0f7e\u0f7f\u0f82\u0f83\u0f84\u0f86\u0f87".indexOf(ch) >= 0);
        // NOTE: 0f7f is questionable 0fc6 too... we assume that k
        // followed by the escaped code point for U+0FC6 is good EWTS.
    }

    /** Returns EWTS that is valid but not beautiful.  It's better
     *  suited for consumption by computer programs than by humans,
     *  though it'll do in a pinch.  (Humans like to see [rnams] instead
     *  of [r+namasa].)
     *  @return null if this grapheme cluster has no valid EWTS
     *  representation or valid-but-ugly EWTS otherwise */
    public StringBuffer getEwtsForComputers() {
        if (!valid) {
            return null;
        }
        StringBuffer sb = new StringBuffer();
        // We use ch after the loop.  Initialization is not really
        // needed; it's just to avoid compiler errors.
        char ch = 'X';
        boolean seenVowel = false;
        boolean added_aVOWEL = false;
        for (int i = 0; i < nfthdl.length(); i++) {
            ch = nfthdl.charAt(i);
            String ewts
                = UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(ch);
            if (i + 1 < nfthdl.length()) { // lookahead
                // Two-character sequences that get a single EWTS
                // token; the second character is consumed here.
                // Even computers want to see kI because the spec
                // isn't (or at least hasn't always been) crystal
                // clear that kA+i is equivalent to kI.
                if (('\u0f55' == ch || '\u0fa5' == ch)
                    && '\u0f39' == nfthdl.charAt(i + 1)) {
                    ++i;
                    ewts = "f"; // TODO(dchandler): hard-coded EWTS
                } else if (('\u0f56' == ch || '\u0fa6' == ch)
                           && '\u0f39' == nfthdl.charAt(i + 1)) {
                    ++i;
                    ewts = "v"; // TODO(dchandler): hard-coded EWTS
                } else if ('\u0f71' == ch && '\u0f72' == nfthdl.charAt(i + 1)) {
                    ++i;
                    ewts = THDLWylieConstants.I_VOWEL;
                    // NOTE: we could normalize to 0f73 and 0f75 when
                    // possible in NFTHDL.  That's closer to EWTS and
                    // would avoid these two special cases.
                } else if ('\u0f71' == ch && '\u0f74' == nfthdl.charAt(i + 1)) {
                    ++i;
                    ewts = THDLWylieConstants.U_VOWEL;
                }
            }
            if (null == ewts && UnicodeUtils.isInTibetanRange(ch)) {
                return null;
            }
            // NOTE(review): a null ewts for a character outside the
            // Tibetan range falls through and is appended below as
            // the text "null" -- confirm the lookup never returns
            // null for the whitespace that validGcRegex admits.

            // A subjoined consonant, or any vowel after the first, is
            // joined to what precedes it with the stacking key.
            if (UnicodeUtils.isSubjoinedConsonant(ch)
                || (seenVowel && isVowel(ch)))
                sb.append(THDLWylieConstants.WYLIE_SANSKRIT_STACKING_KEY);
            if (isWowelRequiringPrecedingVowel(ch) && !seenVowel) {
                if (!added_aVOWEL) {
                    added_aVOWEL = true;
                    sb.append(THDLWylieConstants.WYLIE_aVOWEL); // paM, no pM
                }
            }
            if (isVowel(ch)) {
                seenVowel = true;
            }
            sb.append(ewts);
        }
        // A cluster that ends on a consonant (or on U+0F39) carries
        // the inherent vowel, which must be written out.
        if (UnicodeUtils.isNonSubjoinedConsonant(ch)
            || UnicodeUtils.isSubjoinedConsonant(ch)
            || '\u0f39' == ch) {
            ThdlDebug.verify(!added_aVOWEL);
            sb.append(THDLWylieConstants.WYLIE_aVOWEL);
        }
        return sb;
    }

    /** Hash of the backing Unicode, consistent with equals. */
    public int hashCode() { return nfthdl.hashCode(); }

    /** Two GCs are equal iff their backing Unicode is equal. */
    public boolean equals(Object o) {
        return (o instanceof GC && ((GC)o).getNfthdl().equals(getNfthdl()));
    }

    /** Quasi-XML for humans */
    public String toString() {
        return "<GC valid=" + valid + " pretty=\""
            + UnicodeUtils.unicodeStringToPrettyString(getNfthdl())
            + "\"/>";
    }
}

View file

@ -32,11 +32,12 @@ public class UnicodeToTranslitForXslt {
}
/** Converts Tibetan Unicode to EWTS transliteration by delegating to
* Converter, passing null as the errors buffer.
* @param unicode the Unicode text to convert
* @return whatever the Converter call returns */
public static String unicodeToEwts(String unicode) {
return Converter.convertToEwts(unicode, null);
public static String unicodeToEwtsForComputers(String unicode) {
return Converter.convertToEwtsForComputers(unicode, null);
}
/** Converts Tibetan Unicode to ACIP transliteration.  Not implemented
* yet: every call throws an Error.
* @param unicode the Unicode text to convert (currently unused) */
public static String unicodeToAcip(String unicode) {
throw new Error("DLC: not yet");
throw new Error("TODO(dchandler): not yet");
}
}

View file

@ -50,12 +50,15 @@ public class UnicodeToTranslitForXsltTest extends TestCase {
/** No-argument constructor with an empty body. */
public UnicodeToTranslitForXsltTest() { }
/** Checks the Unicode-to-EWTS entry points of
* UnicodeToTranslitForXslt against fixed expected strings. */
public void testUnicodeToEwts() {
// NOTE(review): the next two checks call unicodeToEwts and pass the
// computed value first and the literal second, unlike the two
// checks after them -- they look like leftovers; confirm.
assertEquals(UnicodeToTranslitForXslt.unicodeToEwts("\u0f40"), "ka");
assertEquals(UnicodeToTranslitForXslt.unicodeToEwts("\u0f56\u0f62\u0f4f\u0f42\u0f66\u0f0b"), "brtags ");
assertEquals("ka", UnicodeToTranslitForXslt.unicodeToEwtsForComputers("\u0f40"));
assertEquals("g+ya", UnicodeToTranslitForXslt.unicodeToEwtsForComputers("\u0f42\u0fb1"));
// TODO(dchandler): assertEquals("brtags ", UnicodeToTranslitForXslt.unicodeToEwtsForHumans("\u0f56\u0f62\u0f9f\u0f42\u0f66\u0f0b"));
}
/** Checks for the Unicode-to-ACIP entry point.  The unicodeToAcip
* assertions sit inside an if (false) block, so they are compiled
* but never executed. */
public void testUnicodeToAcip() {
// NOTE(review): the next two checks call unicodeToEwts yet expect
// upper-case ACIP-style text -- they look like leftovers; confirm.
assertEquals(UnicodeToTranslitForXslt.unicodeToEwts("\u0f40"), "KA");
assertEquals(UnicodeToTranslitForXslt.unicodeToEwts("\u0f56\u0f62\u0f4f\u0f42\u0f66\u0f0b"), "BRTAGS ");
if (false) {
assertEquals("KA", UnicodeToTranslitForXslt.unicodeToAcip("\u0f40"));
assertEquals("BRTAGS ", UnicodeToTranslitForXslt.unicodeToAcip("\u0f56\u0f62\u0f9f\u0f42\u0f66\u0f0b"));
}
}
}