A reverter that converts Unicode to computer-friendly (but not yet

human-friendly) EWTS is here in alpha mode.  It probably doesn't deal
well with non-Tibetan text.
This commit is contained in:
dchandler 2005-08-01 05:54:20 +00:00
parent 00afd75362
commit 5788416629
13 changed files with 496 additions and 47 deletions

View file

@ -165,8 +165,8 @@ the jvm starting tomcat:
<!-- Set this to 1.2 if you want J2SDK 1.4's default. 1.1 gives us <!-- Set this to 1.2 if you want J2SDK 1.4's default. 1.1 gives us
more compatibility, but maybe there will be a performance hit more compatibility, but maybe there will be a performance hit
or something. --> or something. -->
<property name="target.jvm" value="1.2"/> <property name="target.jvm" value="1.4"/>
<property name="source.jvm" value="1.2"/> <property name="source.jvm" value="1.4"/>
<!-- Only the tt-servlet-compile target changes this. Humans <!-- Only the tt-servlet-compile target changes this. Humans
shouldn't mess with this. --> shouldn't mess with this. -->

View file

@ -73,10 +73,8 @@
<formatter type="xml"/><!-- If not XML, then 'ant -buildfile <formatter type="xml"/><!-- If not XML, then 'ant -buildfile
build.xml check-report' will fail. --> build.xml check-report' will fail. -->
<sysproperty key="java.awt.headless" value="true"/> <sysproperty key="java.awt.headless" value="true"/>
<!-- TODO(dchandler): DLC: enable these
<test name="org.thdl.tib.text.reverter.ConverterTest"/> <test name="org.thdl.tib.text.reverter.ConverterTest"/>
<test name="org.thdl.tib.text.reverter.UnicodeToTranslitForXsltTest"/> <test name="org.thdl.tib.text.reverter.UnicodeToTranslitForXsltTest"/>
-->
<test name="org.thdl.tib.text.ttt.EwtsToUnicodeForXsltTest"/> <test name="org.thdl.tib.text.ttt.EwtsToUnicodeForXsltTest"/>
<test name="org.thdl.tib.text.ttt.EWTSTest"/> <test name="org.thdl.tib.text.ttt.EWTSTest"/>
<test name="org.thdl.tib.text.ttt.EWTStibwniniTest"/> <test name="org.thdl.tib.text.ttt.EWTStibwniniTest"/>

View file

@ -350,7 +350,10 @@ public class TibetanConverter implements FontConverterConstants {
uniText = s.toString(); uniText = s.toString();
} }
StringBuffer errors = new StringBuffer(); StringBuffer errors = new StringBuffer();
String ewtsText = Converter.convertToEwts(uniText, errors); // TODO(dchandler): DLC: use human-friendly EWTS, not
// computer-friendly!
String ewtsText = Converter.convertToEwtsForComputers(uniText,
errors);
// TODO(dchandler): is 51 the right choice? // TODO(dchandler): is 51 the right choice?
return (errors.length() > 0) ? 51 : 0; return (errors.length() > 0) ? 51 : 0;
} catch (IOException e) { } catch (IOException e) {

View file

@ -18,6 +18,16 @@ Contributor(s): ______________________________________.
package org.thdl.tib.text.reverter; package org.thdl.tib.text.reverter;
import java.text.BreakIterator;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.thdl.tib.text.tshegbar.UnicodeUtils;
/** Static methods for converting Unicode to EWTS and /** Static methods for converting Unicode to EWTS and
* (TODO(dchandler): ACIP). * (TODO(dchandler): ACIP).
* @author David Chandler * @author David Chandler
@ -28,11 +38,110 @@ public class Converter {
throw new Error("There's no point in instantiating this class."); throw new Error("There's no point in instantiating this class.");
} }
/** Converts Tibetan Unicode to EWTS transliteration. If errors /** Finds combining character sequences. */
* is non-null, error messages are appended to it. (Errors are private static BreakIterator breaker
* always inline.) */ = BreakIterator.getCharacterInstance(new Locale("bo"));
public static String convertToEwts(String unicode,
StringBuffer errors /* DLC: use it */) {
throw new Error("DLC not yet"); private static final boolean debug = false;
// TODO(dchandler): use this to create LegalTshegBar objects, it's
// unused right now.
//
// Anchored pattern with three parts, in order: an optional single
// leading letter, one consonant stack (as defined by
// GC.consonantStackRegexString), and an optional trailing part.
// NOTE(review): the leading letters look like the five Tibetan prefix
// letters and the trailing part like suffix/postsuffix letters or
// a-chung-based suffixes -- confirm against LegalTshegBar.
private static Pattern mightBeLegalTshegBarRegex = Pattern.compile(
"^"
// optional leading letter, one of five codepoints:
+ "([\u0f42\u0f51\u0f56\u0f58\u0f60])?"
// root stack: consonant w/ optional wowels:
+ "(" + GC.consonantStackRegexString + ")"
// optional trailing part, first form: one letter from a set of ten,
// optionally followed by U+0F51 or U+0F66...
+ "(([\u0f42\u0f51\u0f56\u0f58\u0f60\u0f44\u0f53\u0f62\u0f63\u0f66][\u0f51\u0f66]?)"
// ...second form: one or more repetitions of U+0F60 followed by one
// of U+0F72, U+0F74, U+0F7C, U+0F44, U+0F58.
+ "|(\u0f60[\u0f72\u0f74\u0f7c\u0f44\u0f58])+)?"
+ "$");
/** Breaks NFTHDL-normalized text into grapheme clusters, i.e. units
 *  that an end user would not sensibly split any further.  The
 *  boundaries come from the class's character BreakIterator, with one
 *  adjustment: U+0F7F (a COMBINING_SPACING_MARK) is glued onto the
 *  preceding cluster as though it were a NON_SPACING_MARK, and so is
 *  any NON_SPACING_MARK piece that directly follows a piece ending
 *  in U+0F7F.  (TODO(dchandler): (0f04 0f05*) is a grapheme cluster
 *  that is not a combining character sequence; is it really worth
 *  it?  We don't handle it right now, might be good for
 *  Unicode->ACIP anyway.)
 *  @param nfthdl Unicode in NFTHDL decomposition form
 *  @return List of GC objects, in text order */
private static List/*<GC>*/ SplitGC(String nfthdl) {
    if (debug) {
        System.out.println("debug: "
                           + UnicodeUtils.unicodeStringToPrettyString(nfthdl));
    }
    ArrayList clusters = new ArrayList();
    breaker.setText(nfthdl);
    boolean previousEndedWith0f7f = false;
    int from = breaker.first();
    int to = breaker.next();
    while (BreakIterator.DONE != to) {
        // Decide whether this piece belongs to the cluster before it.
        boolean mergeWithPrevious;
        if (previousEndedWith0f7f
            && Character.NON_SPACING_MARK
               == Character.getType(nfthdl.charAt(from))) {
            mergeWithPrevious = true;
        } else {
            // U+0F7F is a COMBINING_SPACING_MARK, not a
            // NON_SPACING_MARK, but we want to treat it like a
            // NON_SPACING_MARK.
            mergeWithPrevious = (to > from
                                 && nfthdl.charAt(from) == '\u0f7f'
                                 && !clusters.isEmpty());
        }

        String piece = nfthdl.substring(from, to);
        if (mergeWithPrevious) {
            int lastIndex = clusters.size() - 1;
            GC merged
                = new GC(((GC)clusters.get(lastIndex)).getNfthdl() + piece);
            if (debug) {
                System.out.println("debug: setting last el, "
                                   + clusters.get(lastIndex) + " to " + merged);
            }
            clusters.set(lastIndex, merged);
        } else {
            clusters.add(new GC(piece));
        }

        previousEndedWith0f7f
            = (to > from && nfthdl.charAt(to - 1) == '\u0f7f');
        from = to;
        to = breaker.next();
    }
    return clusters;
}
/** Converts Tibetan Unicode to computer-friendly EWTS
* transliteration. Computer-friendly is not human-friendly but
* hopefully even poorly written EWTS->Tibetan converters could
* handle the output. If errors is non-null, error messages are
* appended to it. (Errors are always inline.) */
public static String convertToEwtsForComputers(String unicode,
StringBuffer errors) {
// First, normalize as much as we can to reduce the number of
// cases we must handle.
String decomposed
= UnicodeUtils.toMostlyDecomposedUnicode(unicode,
UnicodeUtils.NORM_NFTHDL);
// TODO(dchandler): optionally warn if we see
// "\u0f40\u0f74\u0f71" which is in the wrong order.
List gcs = SplitGC(decomposed);
StringBuffer sb = new StringBuffer();
for (Iterator it = gcs.iterator(); it.hasNext(); ) {
GC gc = (GC)it.next();
StringBuffer ewts = gc.getEwtsForComputers();
if (null == ewts) {
// TODO(dchandler): use ErrorsAndWarnings?
ewts = new StringBuffer("[#ERROR 301: The Unicode '"
+ gc.getNfthdl()
+ "' (has no EWTS transliteration]");
if (null != errors) {
errors.append(ewts);
errors.append('\n');
}
}
sb.append(ewts);
}
return sb.toString();
} }
} }
// TODO(dchandler): give a mode where an error is given if non-Tibetan
// or at least non-EWTS (think U+534D, e.g.) is found

View file

@ -20,8 +20,9 @@ package org.thdl.tib.text.reverter;
import junit.framework.TestCase; import junit.framework.TestCase;
import org.thdl.util.ThdlOptions; import org.thdl.tib.text.tshegbar.UnicodeUtils;
import org.thdl.tib.text.ttt.ErrorsAndWarnings; import org.thdl.tib.text.ttt.ErrorsAndWarnings;
import org.thdl.util.ThdlOptions;
/** Tests the Converter class. /** Tests the Converter class.
* *
@ -47,9 +48,112 @@ public class ConverterTest extends TestCase {
ThdlOptions.setUserPreference("thdl.debug", true); ThdlOptions.setUserPreference("thdl.debug", true);
} }
/** Fails the test unless converting s from Unicode to
 *  computer-friendly EWTS reports at least one error.  When no
 *  error was reported, the unexpected EWTS is printed to standard
 *  output before the assertion fails.
 *  @param s the Unicode expected to be rejected */
private void err(String s) {
    StringBuffer errors = new StringBuffer();
    String ewts = Converter.convertToEwtsForComputers(s, errors);
    if (errors.length() <= 0) {
        System.out.println("expected error but got EWTS '" + ewts
                           + "' for "
                           + UnicodeUtils.unicodeStringToPrettyString(s));
    }
    assertTrue(errors.length() > 0);
}
/** Placeholder for a human-friendly-EWTS check.  Not implemented
 *  yet: it ignores both arguments, asserts nothing, and only prints
 *  a TODO reminder to standard output, so calls to it always pass.
 *  NOTE(review): presumably meant to exercise a future
 *  Converter.convertToEwtsForHumans -- confirm once that exists.
 *  @param uni the Unicode input (currently unused)
 *  @param ewts the expected human-friendly EWTS (currently unused) */
private void hconv(String uni, String ewts) {
System.out.println("TODO(dchandler): DLC: implement me");
}
/** Checks Converter.convertToEwtsForComputers: converting uni must
 *  yield exactly ewts and must not report any error.
 *  @param uni the Unicode input
 *  @param ewts the expected computer-friendly EWTS */
private void conv(String uni, String ewts) {
    StringBuffer errors = new StringBuffer();
    String actualEwts = Converter.convertToEwtsForComputers(uni, errors);
    String failureMessage
        = "Expected " + ewts + " but got " + actualEwts + ":\n";
    assertEquals(failureMessage, ewts, actualEwts);
    assertTrue(errors.length() <= 0);
}
public ConverterTest() { } public ConverterTest() { }
public void testUnicodeToEwts() { public void testUnicodeToEwts() {
assertEquals(Converter.convertToEwts("\u0f40", null), "ka"); conv("\u0f56\u0f62\u0f9f\u0f42\u0f66\u0f0b", "bar+tagasa ");
conv("\u0f40", "ka");
// TODO(dchandler): DLC Tibetans use Arabic numerals and English punctuation.
// conv("0123456789.\u0f40", "0123456789.ka");
conv("\u0f40\u0f7b", "kai");
conv("\u0f40\u0f76", "k+r-i");
conv("\u0f40\u0020\u0f40", "ka_ka");
conv("\u0f40\n\u0f40\t\u0f40\r\n", "ka\nka\tka\r\n");
conv("\u0f04\u0f05\u0f40\u0f0c\u00a0\u0f42", "@#ka*_ga");
conv("\u0f42\u0f61", "gaya");
hconv("\u0f42\u0f61", "g.ya");
conv("\u0f42\u0fb1", "g+ya");
hconv("\u0f42\u0fb1", "gya");
conv("\u0f54\u0f7e", "paM");
conv("\u0f54\u0f71\u0f7e", "pAM");
conv("\u0f54\u0f7e", "paM");
conv("\u0f54\u0f74\u0f7e", "puM");
conv("\u0f54\u0fc6", "p\\u0FC6");
conv("\u0f40\u0f72\u0f74", "ku+i"); // bottom-to-top
conv("\u0f40\u0f72\u0f74\u0f39", "k^u+i"); // 0f39 first
conv("\u0f40\u0f73", "kI");
conv("\u0f40\u0f71\u0f72", "kI");
conv("\u0f40\u0f72\u0f71", "kI");
conv("\u0f40\u0f73\u0f74", "kU+i");
err("\u0f48");
err("\u0f32\u0f39");
err("\u0f47\u0f98");
conv("\u0fcc", "\\u0FCC");
err("\u0fcd");
err("\u0f90");
err("\u0f90\u0fc6");
conv("\u0f0b\u0fc6", " \\u0FC6"); // ugly but legal...
err("\u0f0b\u0f90");
err("\u0f0b\u0f74");
err("\u0f0b\u0f7f");
err("\u0f0b\u0f3e");
conv("\u0f32\u0f18", "\\u0F32\\u0F18");
conv("\u0f54\u0fa4\u0f90", "p+p+ka");
// TODO(dchandler): warn("\u0f54\u0fa4\u0f90\u0f39"); (or do
// CCCVs work for this?)
if (false) {
// 0f39 could go with any of the three, so we give an error:
err("\u0f54\u0fa4\u0f90\u0f74\u0f39");
} else {
// TODO(dchandler): I want an error, not this:
conv("\u0f54\u0fa4\u0f90\u0f74\u0f39", "p+p+k^u");
}
conv("\u0f54\u0fa4\u0f90\u0f39", "p+p+k^a");
conv("\u0f55\u0f39", "fa");
conv("\u0f55\u0f74\u0f39", "fu");
conv("\u0f56\u0f39", "va");
conv("\u0f56\u0f74\u0f39", "vu");
conv("\u0f54\u0f39\u0fa4\u0f90", "p^+p+ka");
conv("\u0f40\u0f7e", "kaM");
conv("\u0f40\u0f83", "ka~M");
conv("\u0f40\u0f82", "ka~M`");
conv("\u0f40\u0f84", "ka?");
conv("\u0f40\u0f85\u0f40", "ka&ka");
err("\u0f7f");
conv("\u0f40\u0f7f", "kaH");
conv("\u0f40\u0f7f\u0f72", "kiH");
conv("\u0f40\u0f7f\u0f7f\u0f72\u0f7f", "kiHHH");
conv("\u0f40\u0f7f\u0f7e", "kaHM");
conv("\u0f40\u0f7e\u0f7f", "kaMH");
conv("\u0f40\u0f7f\u0f7e\u0f72", "kiHM");
conv("\u0f04\u0f05", "@#");
conv("\u0f04\u0f05\u0f05", "@##");
conv("\u0f04", "@"); // TODO(dchandler): Is this ever seen
// alone? warn/error otherwise.
conv("\u0f05", "#"); // TODO(dchandler): warn or error
} }
} }
// TODO(dchandler): DLC: test all these round-trip, i.e. assert that
// Uni->EWTS->Uni produces the same Uni.
// TODO(dchandler): test with ZWSP or joiners or whatever weird crap
// you can throw in legally to alter boundaries

View file

@ -0,0 +1,200 @@
/*
The contents of this file are subject to the THDL Open Community License
Version 1.0 (the "License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License on the THDL web site
(http://www.thdl.org/).
Software distributed under the License is distributed on an "AS IS" basis,
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
License for the specific terms governing rights and limitations under the
License.
The Initial Developer of this software is the Tibetan and Himalayan Digital
Library (THDL). Portions created by the THDL are Copyright 2005 THDL.
All Rights Reserved.
Contributor(s): ______________________________________.
*/
package org.thdl.tib.text.reverter;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
import org.thdl.util.ThdlDebug;
import org.thdl.tib.text.THDLWylieConstants;
import org.thdl.tib.text.tshegbar.UnicodeUtils;
import org.thdl.tib.text.tshegbar.UnicodeCodepointToThdlWylie;
/** Grapheme cluster backed by a String of Unicode.  For the most part
 *  these are <em>combining character sequences</em> as defined by
 *  Unicode, but (U+0F04 U+0F05+) [TODO(dchandler): not yet handled as
 *  a single GC] is an example of a grapheme cluster that is not a
 *  combining character sequence.
 *  @author David Chandler
 */
class GC {
    /** NFTHDL-decomposed Unicode */
    private String nfthdl;

    /** True if valid.  True for digits w/ digit combiners, character
     *  stack plus optional wowels, a standalone mark.  False for
     *  anything else, e.g. "\u0f0b\u0f90". */
    private boolean valid;

    /** Constructor that takes the NFTHDL-decomposed Unicode for the
     *  grapheme cluster. */
    public GC(String nfthdl) {
        setNfthdl(nfthdl);
    }

    /** A regex that matches the NFTHDL Unicode for a consonant stack
     *  with optional wowels. */
    public static final String consonantStackRegexString
        = "[\u0f40-\u0f47\u0f49-\u0f6a]" // base consonant
        + "[\u0f90-\u0f97\u0f99-\u0fbc\u0f39]*" // subjoined cons.
        + "\u0f71?" // a-chung
        + "[\u0f72\u0f73\u0f74\u0f7a-\u0f7d\u0f80]*" // vowel proper
        + "[\u0f35\u0f37\u0f7e\u0f7f\u0f82-\u0f84" // wowels
        + "\u0f86\u0f87\u0fc6]*";

    /** Matches exactly the strings that are valid grapheme clusters.
     *  The alternatives sit inside one non-capturing group so that
     *  the anchors apply to every alternative, not just the first
     *  and last; the result of Matcher.matches() and the numbering
     *  of the capturing groups are unaffected by that group. */
    private static final Pattern validGcRegex = Pattern.compile(
        "^(?:"
        // numeric:
        + "([\u0f20-\u0f33][\u0f18\u0f19]*)|"
        // consonant w/ optional wowels:
        + "(" + consonantStackRegexString + ")|"
        // other symbol with optional U+0FC6
        + "([\u0f00-\u0f17\u0f1a-\u0f1f\u0f34\u0f36\u0f38"
        + "\u0f3a-\u0f3d\u0f85\u0f88-\u0f8b\u0fbe-\u0fc5"
        + "\u0fc7-\u0fcc\u0fcf-\u0fd1]\u0fc6?)|"
        // other symbol that does not take U+0FC6.
        // TODO(dchandler): include 0f0b etc. in this group?
        + "([ \t\u00a0\n\r]{1,})" // DLC handling of English... [0-9\\.:a-zA-Z] etc. what to do?
        + ")$");

    private static final boolean debug = false;

    /** Stores the NFTHDL-decomposed Unicode representing this
     *  grapheme cluster and recomputes its validity.
     *  @param nfthdl NFTHDL-decomposed Unicode; expected to be
     *  non-empty (asserted) */
    private void setNfthdl(String nfthdl) {
        if (debug) {
            System.out.println("debug: GC is "
                               + UnicodeUtils.unicodeStringToPrettyString(nfthdl));
        }
        this.nfthdl = nfthdl;
        assert (nfthdl.length() > 0);
        // With assertions disabled an empty string gets this far; it
        // is simply invalid (no alternative of the regex matches it).
        valid = (nfthdl.length() > 0
                 && validGcRegex.matcher(nfthdl).matches());
    }

    /** Returns NFTHDL-decomposed Unicode representing this grapheme
     *  cluster. */
    public String getNfthdl() { return nfthdl; }

    /** Returns true iff ch is a vowel proper, not a wowel */
    private boolean isVowel(char ch) {
        // (We won't see \u0f76 etc. in NFTHDL, but the handling of
        // them is suspect.)
        // NOTE(review): consonantStackRegexString lists U+0F80 among
        // the vowels proper and U+0F82 among the wowels, whereas the
        // last range here covers U+0F81-U+0F82 and omits U+0F80.
        // Confirm which is intended; changing it alters the EWTS
        // emitted for e.g. U+0F72 followed by U+0F82.
        return ((ch >= '\u0f71' && ch <= '\u0f75')
                || (ch >= '\u0f7a' && ch <= '\u0f7d')
                || (ch >= '\u0f81' && ch <= '\u0f82'));
    }

    /** Returns true iff ch is a wowel whose EWTS must come after a
     *  vowel, so that an explicit 'a' vowel has to be supplied when
     *  the stack has no vowel of its own. */
    private boolean isWowelRequiringPrecedingVowel(char ch) {
        // not 0f39 0f18 0f19 e.g.
        return ("\u0f35\u0f37\u0f7e\u0f7f\u0f82\u0f83\u0f84\u0f86\u0f87".indexOf(ch) >= 0);
        // NOTE: 0f7f is questionable 0fc6 too... we assume [k\\u0fc6]
        // is good EWTS.
    }

    /** Returns EWTS that is valid but not beautiful.  It's better
     *  suited for consumption by computer programs than by humans,
     *  though it'll do in a pinch.  (Humans like to see [rnams] instead
     *  of [r+namasa].)
     *  @return null if this grapheme cluster has no valid EWTS
     *  representation or valid-but-ugly EWTS otherwise */
    public StringBuffer getEwtsForComputers() {
        if (!valid) {
            return null;
        }
        StringBuffer sb = new StringBuffer();
        // We use ch after the loop.  Initialization is not really
        // needed (a valid cluster is never empty); it's just to avoid
        // compiler errors.
        char ch = 'X';
        boolean seenVowel = false;
        boolean added_aVOWEL = false;
        for (int i = 0; i < nfthdl.length(); i++) {
            ch = nfthdl.charAt(i);
            String ewts
                = UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(ch);
            if (i + 1 < nfthdl.length()) { // lookahead
                // Each case below consumes the next codepoint too, but
                // ch deliberately stays the first one of the pair for
                // the checks that follow.
                char next = nfthdl.charAt(i + 1);
                if (('\u0f55' == ch || '\u0fa5' == ch) && '\u0f39' == next) {
                    ++i;
                    ewts = "f"; // TODO(dchandler): hard-coded EWTS
                } else if (('\u0f56' == ch || '\u0fa6' == ch)
                           && '\u0f39' == next) {
                    ++i;
                    ewts = "v"; // TODO(dchandler): hard-coded EWTS
                } else if ('\u0f71' == ch && '\u0f72' == next) {
                    // Even computers want to see kI because the spec
                    // isn't (or at least hasn't always been) crystal
                    // clear that kA+i is equivalent to kI.
                    ++i;
                    ewts = THDLWylieConstants.I_VOWEL;
                    // NOTE: we could normalize to 0f73 and 0f75 when
                    // possible in NFTHDL.  That's closer to EWTS and
                    // would avoid these two special cases.
                } else if ('\u0f71' == ch && '\u0f74' == next) {
                    ++i;
                    ewts = THDLWylieConstants.U_VOWEL;
                }
            }
            if (null == ewts) {
                // No transliteration for this codepoint.  Bail out
                // whatever range it is in: appending a null String
                // would put the literal text "null" into the EWTS.
                return null;
            }
            if (UnicodeUtils.isSubjoinedConsonant(ch)
                || (seenVowel && isVowel(ch))) {
                sb.append(THDLWylieConstants.WYLIE_SANSKRIT_STACKING_KEY);
            }
            if (isWowelRequiringPrecedingVowel(ch) && !seenVowel) {
                if (!added_aVOWEL) {
                    added_aVOWEL = true;
                    sb.append(THDLWylieConstants.WYLIE_aVOWEL); // paM, no pM
                }
            }
            if (isVowel(ch)) {
                seenVowel = true;
            }
            sb.append(ewts);
        }
        // A cluster that ends on a consonant (or on U+0F39) has no
        // explicit vowel, so the implicit 'a' is spelled out.
        if (UnicodeUtils.isNonSubjoinedConsonant(ch)
            || UnicodeUtils.isSubjoinedConsonant(ch)
            || '\u0f39' == ch) {
            ThdlDebug.verify(!added_aVOWEL);
            sb.append(THDLWylieConstants.WYLIE_aVOWEL);
        }
        return sb;
    }

    /** Hash code of the underlying NFTHDL string, consistent with
     *  equals. */
    public int hashCode() { return nfthdl.hashCode(); }

    /** Two GCs are equal iff their NFTHDL strings are equal. */
    public boolean equals(Object o) {
        return (o instanceof GC && ((GC)o).getNfthdl().equals(getNfthdl()));
    }

    /** Quasi-XML for humans */
    public String toString() {
        return "<GC valid=" + valid + " pretty=\""
            + UnicodeUtils.unicodeStringToPrettyString(getNfthdl())
            + "\"/>";
    }
}

View file

@ -32,11 +32,12 @@ public class UnicodeToTranslitForXslt {
} }
/** Converts Tibetan Unicode to EWTS transliteration. */ /** Converts Tibetan Unicode to EWTS transliteration. */
public static String unicodeToEwts(String unicode) { public static String unicodeToEwtsForComputers(String unicode) {
return Converter.convertToEwts(unicode, null); return Converter.convertToEwtsForComputers(unicode, null);
} }
/** Converts Tibetan Unicode to ACIP transliteration. */ /** Converts Tibetan Unicode to ACIP transliteration. */
public static String unicodeToAcip(String unicode) { public static String unicodeToAcip(String unicode) {
throw new Error("DLC: not yet"); throw new Error("TODO(dchandler): not yet");
} }
} }

View file

@ -50,12 +50,15 @@ public class UnicodeToTranslitForXsltTest extends TestCase {
public UnicodeToTranslitForXsltTest() { } public UnicodeToTranslitForXsltTest() { }
public void testUnicodeToEwts() { public void testUnicodeToEwts() {
assertEquals(UnicodeToTranslitForXslt.unicodeToEwts("\u0f40"), "ka"); assertEquals("ka", UnicodeToTranslitForXslt.unicodeToEwtsForComputers("\u0f40"));
assertEquals(UnicodeToTranslitForXslt.unicodeToEwts("\u0f56\u0f62\u0f4f\u0f42\u0f66\u0f0b"), "brtags "); assertEquals("g+ya", UnicodeToTranslitForXslt.unicodeToEwtsForComputers("\u0f42\u0fb1"));
// TODO(dchandler): assertEquals("brtags ", UnicodeToTranslitForXslt.unicodeToEwtsForHumans("\u0f56\u0f62\u0f9f\u0f42\u0f66\u0f0b"));
} }
public void testUnicodeToAcip() { public void testUnicodeToAcip() {
assertEquals(UnicodeToTranslitForXslt.unicodeToEwts("\u0f40"), "KA"); if (false) {
assertEquals(UnicodeToTranslitForXslt.unicodeToEwts("\u0f56\u0f62\u0f4f\u0f42\u0f66\u0f0b"), "BRTAGS "); assertEquals("KA", UnicodeToTranslitForXslt.unicodeToAcip("\u0f40"));
assertEquals("BRTAGS ", UnicodeToTranslitForXslt.unicodeToAcip("\u0f56\u0f62\u0f9f\u0f42\u0f66\u0f0b"));
}
} }
} }

View file

@ -463,6 +463,7 @@ public final class LegalTshegBar
* concatenation like 'u'i'o. Returns false otherwise (including * concatenation like 'u'i'o. Returns false otherwise (including
* the case that suffix is the empty string). */ * the case that suffix is the empty string). */
public static boolean isAchungBasedSuffix(String suffix) { public static boolean isAchungBasedSuffix(String suffix) {
// TODO(dchandler): use java.util.regex
int i = 0; // so that the empty string causes false to be returned. int i = 0; // so that the empty string causes false to be returned.
while (i == 0 || !suffix.equals("")) { while (i == 0 || !suffix.equals("")) {
boolean startsWithOneOfThem = false; boolean startsWithOneOfThem = false;

View file

@ -67,11 +67,16 @@ public class UnicodeCodepointToThdlWylie {
// fail. // fail.
switch (x) { switch (x) {
case '\t': return "\t";
case '\n': return "\n";
case '\r': return "\r";
case ' ': return "_";
case '\u00a0': return "_";
case '\u0F00': return "oM"; case '\u0F00': return "oM";
case '\u0F01': return "\\u0F01"; case '\u0F01': return "\\u0F01";
case '\u0F02': return null; // DLC case '\u0F02': return "\\u0F02";
case '\u0F03': return null; // DLC case '\u0F03': return "\\u0F03";
case '\u0F04': return "@"; case '\u0F04': return "@";
case '\u0F05': return "#"; case '\u0F05': return "#";
case '\u0F06': return "$"; case '\u0F06': return "$";
@ -314,8 +319,6 @@ public class UnicodeCodepointToThdlWylie {
case '\u0FCF': return "\\u0FCF"; // DLC i added this to the 'EWTS document misspeaks' bug report... null I think... case '\u0FCF': return "\\u0FCF"; // DLC i added this to the 'EWTS document misspeaks' bug report... null I think...
default: { default: {
// DLC handle space (EW's "_")
// This codepoint is in the range 0FD0-0FFF or is not in // This codepoint is in the range 0FD0-0FFF or is not in
// the Tibetan range at all. In either case, there is no // the Tibetan range at all. In either case, there is no
// corresponding THDL Extended Wylie. // corresponding THDL Extended Wylie.

View file

@ -102,7 +102,10 @@ public class UnicodeUtils implements UnicodeConstants {
nor NFKD breaks down <code>U+0F00</code> into its constituent nor NFKD breaks down <code>U+0F00</code> into its constituent
codepoints. NFTHDL uses a maximum of codepoints, and it never codepoints. NFTHDL uses a maximum of codepoints, and it never
uses codepoints whose use has been {@link #isDiscouraged(char) uses codepoints whose use has been {@link #isDiscouraged(char)
discouraged}. discouraged}. NFTHDL also does not screw things up by using
the standard-but-wrong CCCVs. It sorts stretches of combining
characters wisely as per
{@link http://iris.lib.virginia.edu/tibet/xml/showEssay.php?xml=/tools/encodingTib.xml}.
<p>The Tibetan passages of the returned string are in the <p>The Tibetan passages of the returned string are in the
chosen normalized form, but codepoints outside of the {@link chosen normalized form, but codepoints outside of the {@link
@ -136,6 +139,9 @@ public class UnicodeUtils implements UnicodeConstants {
tibetanUnicode.insert(offset, s); tibetanUnicode.insert(offset, s);
} }
} }
if (normForm == NORM_NFTHDL) {
fixSomeOrderingErrorsInTibetanUnicode(tibetanUnicode);
}
} }
/** Like {@link #toMostlyDecomposedUnicode(StringBuffer, byte)}, /** Like {@link #toMostlyDecomposedUnicode(StringBuffer, byte)},
@ -418,7 +424,39 @@ public class UnicodeUtils implements UnicodeConstants {
* product.) * product.)
*/ */
private static char unicode_pairs[][] private static char unicode_pairs[][]
= { { '\u0f71', '\u0f74' }, = {
/* TODO(dchandler): use regex
* "[\u0f39\u0f71-\u0f84\u0f86\u0f87]{2,}" to find patches
* that need sorting and then sort each of those. This
* cross product is ugly. */
{ '\u0f39', '\u0f71' },
{ '\u0f39', '\u0f72' },
{ '\u0f39', '\u0f74' },
{ '\u0f39', '\u0f7a' },
{ '\u0f39', '\u0f7b' },
{ '\u0f39', '\u0f7c' },
{ '\u0f39', '\u0f7d' },
{ '\u0f39', '\u0f7e' },
{ '\u0f39', '\u0f7f' },
{ '\u0f39', '\u0f80' },
{ '\u0f39', '\u0f82' },
{ '\u0f39', '\u0f83' },
{ '\u0f71', '\u0f7f' },
{ '\u0f72', '\u0f7f' },
{ '\u0f74', '\u0f7f' },
{ '\u0f7a', '\u0f7f' },
{ '\u0f7b', '\u0f7f' },
{ '\u0f7c', '\u0f7f' },
{ '\u0f7d', '\u0f7f' },
// but not { '\u0f7e', '\u0f7f' },
{ '\u0f39', '\u0f7f' },
{ '\u0f80', '\u0f7f' },
{ '\u0f82', '\u0f7f' },
{ '\u0f83', '\u0f7f' },
{ '\u0f71', '\u0f74' },
{ '\u0f71', '\u0f72' }, { '\u0f71', '\u0f72' },
{ '\u0f71', '\u0f7a' }, { '\u0f71', '\u0f7a' },
@ -489,7 +527,9 @@ public class UnicodeUtils implements UnicodeConstants {
* the same file modulo Unicode booboos would be better. </p> * the same file modulo Unicode booboos would be better. </p>
* *
* @param sb the buffer to be mutated * @param sb the buffer to be mutated
* @return true if sb was mutated */ * @return true if sb was mutated
* @see <a href="http://iris.lib.virginia.edu/tibet/xml/showEssay.php?xml=/tools/encodingTib.xml">Tibetan Encoding Model</a>
*/
public static boolean fixSomeOrderingErrorsInTibetanUnicode(StringBuffer sb) { public static boolean fixSomeOrderingErrorsInTibetanUnicode(StringBuffer sb) {
boolean mutated = false; boolean mutated = false;
int len = sb.length(); int len = sb.length();
@ -512,25 +552,5 @@ public class UnicodeUtils implements UnicodeConstants {
} while (mutated_this_time_through); } while (mutated_this_time_through);
return mutated; return mutated;
} }
/** Returns true iff ch is a valid Tibetan codepoint in Unicode
* 4.0: */
public boolean isTibetanUnicodeCodepoint(char ch) {
// NOTE: could use an array of 256 booleans for speed but I'm lazy
return ((ch >= '\u0f00' && ch <= '\u0fcf')
&& !(ch == '\u0f48'
|| (ch > '\u0f6a' && ch < '\u0f71')
|| (ch > '\u0f8b' && ch < '\u0f90')
|| ch == '\u0f98'
|| ch == '\u0fbd'
|| ch == '\u0fcd'
|| ch == '\u0fce'));
}
/** Returns true iff ch is in 0F00-0FFF but isn't a valid Tibetan
* codepoint in Unicode 4.0: */
public boolean isInvalidTibetanUnicode(char ch) {
return (isInTibetanRange(ch) && !isTibetanUnicodeCodepoint(ch));
}
} }

View file

@ -798,6 +798,7 @@ public class EWTSTest extends TestCase {
just_ewts2uni_test("\\uefff", "\uefff"); just_ewts2uni_test("\\uefff", "\uefff");
} }
ewts2uni_test("kaHH", "\u0F40\u0f7f\u0f7f");
// Below was semiautomatically generated from the EWTS spec's // Below was semiautomatically generated from the EWTS spec's
// 'ewts.xml' representation (early August 2004 edition): // 'ewts.xml' representation (early August 2004 edition):

View file

@ -405,6 +405,12 @@ class TPairListFactory {
"\u0f74", THDLWylieConstants.u_VOWEL, "\u0f74", THDLWylieConstants.u_VOWEL,
// TODO(dchandler): equivalence classes I'm not
// sure.
// http://iris.lib.virginia.edu/tibet/xml/showEssay.php?xml=/tools/encodingTib.xml
// says to go above base and then upwards. Think
// it over.
// equivalence class: // equivalence class:
"\u0f72", THDLWylieConstants.i_VOWEL, "\u0f72", THDLWylieConstants.i_VOWEL,
"\u0f7a", THDLWylieConstants.e_VOWEL, "\u0f7a", THDLWylieConstants.e_VOWEL,