A reverter that converts Unicode to computer-friendly (but not, yet,
human-friendly) EWTS is here in alpha mode. It probably doesn't deal well with non-Tibetan.
parent 00afd75362
commit 5788416629
13 changed files with 496 additions and 47 deletions
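For orientation, here is a minimal usage sketch of the entry point this commit introduces (see the Converter.java hunk below). The method name and its (String, StringBuffer) signature come from this diff; the sample input and the wrapper class are illustrative only and not part of the commit.

    import org.thdl.tib.text.reverter.Converter;

    public class ReverterUsageSketch {
        public static void main(String[] args) {
            // U+0F40 U+0F0B: "ka" followed by a tsheg.
            StringBuffer errors = new StringBuffer();
            String ewts = Converter.convertToEwtsForComputers("\u0f40\u0f0b", errors);
            System.out.println("computer-friendly EWTS: " + ewts);
            if (errors.length() > 0) {
                // Per the javadoc below, errors are also embedded inline in the output.
                System.err.println(errors);
            }
        }
    }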

@@ -165,8 +165,8 @@ the jvm starting tomcat:
     <!-- Set this to 1.2 if you want J2SDK 1.4's default. 1.1 gives us
          more compatibility, but maybe there will be a performance hit
          or something. -->
-    <property name="target.jvm" value="1.2"/>
-    <property name="source.jvm" value="1.2"/>
+    <property name="target.jvm" value="1.4"/>
+    <property name="source.jvm" value="1.4"/>

     <!-- Only the tt-servlet-compile target changes this. Humans
          shouldn't mess with this. -->

@@ -73,10 +73,8 @@
     <formatter type="xml"/><!-- If not XML, then 'ant -buildfile
                                  build.xml check-report' will fail. -->
     <sysproperty key="java.awt.headless" value="true"/>
-    <!-- TODO(dchandler): DLC: enable these
     <test name="org.thdl.tib.text.reverter.ConverterTest"/>
     <test name="org.thdl.tib.text.reverter.UnicodeToTranslitForXsltTest"/>
-    -->
     <test name="org.thdl.tib.text.ttt.EwtsToUnicodeForXsltTest"/>
     <test name="org.thdl.tib.text.ttt.EWTSTest"/>
     <test name="org.thdl.tib.text.ttt.EWTStibwniniTest"/>

@@ -350,7 +350,10 @@ public class TibetanConverter implements FontConverterConstants {
                 uniText = s.toString();
             }
             StringBuffer errors = new StringBuffer();
-            String ewtsText = Converter.convertToEwts(uniText, errors);
+            // TODO(dchandler): DLC: use human-friendly EWTS, not
+            // computer-friendly!
+            String ewtsText = Converter.convertToEwtsForComputers(uniText,
+                                                                  errors);
             // TODO(dchandler): is 51 the right choice?
             return (errors.length() > 0) ? 51 : 0;
         } catch (IOException e) {

@@ -18,6 +18,16 @@ Contributor(s): ______________________________________.

 package org.thdl.tib.text.reverter;

+import java.text.BreakIterator;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Locale;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.thdl.tib.text.tshegbar.UnicodeUtils;
+
 /** Static methods for converting Unicode to EWTS and
  * (TODO(dchandler): ACIP).
  * @author David Chandler

@@ -28,11 +38,110 @@ public class Converter {
         throw new Error("There's no point in instantiating this class.");
     }

-    /** Converts Tibetan Unicode to EWTS transliteration.  If errors
-     * is non-null, error messages are appended to it.  (Errors are
-     * always inline.) */
-    public static String convertToEwts(String unicode,
-                                       StringBuffer errors /* DLC: use it */) {
-        throw new Error("DLC not yet");
+    /** Finds combining character sequences. */
+    private static BreakIterator breaker
+        = BreakIterator.getCharacterInstance(new Locale("bo"));
+
+
+    private static final boolean debug = false;
+
+    // TODO(dchandler): use this to create LegalTshegBar objects, it's
+    // unused right now.
+    private static Pattern mightBeLegalTshegBarRegex = Pattern.compile(
+        "^"
+        + "([\u0f42\u0f51\u0f56\u0f58\u0f60])?"
+        // root stack: consonant w/ optional wowels:
+        + "(" + GC.consonantStackRegexString + ")"
+        + "(([\u0f42\u0f51\u0f56\u0f58\u0f60\u0f44\u0f53\u0f62\u0f63\u0f66][\u0f51\u0f66]?)"
+        + "|(\u0f60[\u0f72\u0f74\u0f7c\u0f44\u0f58])+)?"
+        + "$");
+
+    /** Splits nfthdl into grapheme clusters.  Let's define a grapheme
+     * cluster as something an end user would say cannot be
+     * decomposed into two separate pieces sensibly.  For the most
+     * part this is just figuring out the <em>combining character
+     * sequences</em> as defined by Unicode, but (U+0F04 U+0F05*) is
+     * an example of a grapheme cluster that is not a combining
+     * character sequence (TODO(dchandler): (0f04 0f05*), is it
+     * really worth it?  We don't handle it right now, might be good
+     * for Unicode->ACIP anyway.)
+     * @param nfthdl Unicode in NFTHDL decomposition form
+     * @return List of GC objects */
+    private static List/*<GC>*/ SplitGC(String nfthdl) {
+
+        if (debug) {
+            System.out.println("debug: "
+                               + UnicodeUtils.unicodeStringToPrettyString(nfthdl));
+        }
+        ArrayList al = new ArrayList();
+        breaker.setText(nfthdl);
+        int start = breaker.first();
+        boolean just_saw_0f7f = false;
+        for (int end = breaker.next();
+             end != BreakIterator.DONE;
+             start = end, end = breaker.next()) {
+            if ((just_saw_0f7f
+                 && (Character.getType(nfthdl.charAt(start))
+                     == Character.NON_SPACING_MARK))
+                || (end > start && '\u0f7f' == nfthdl.charAt(start)
+                    && !al.isEmpty())) {
+                // U+0F7F is a COMBINING_SPACING_MARK, not a
+                // NON_SPACING_MARK, but we want to treat it like a
+                // NON_SPACING_MARK.
+                GC gc = new GC(((GC)al.get(al.size() - 1)).getNfthdl()
+                               + nfthdl.substring(start,end));
+                if (debug) {
+                    System.out.println("debug: setting last el, "
+                                       + al.get(al.size() - 1) + " to " + gc);
+                }
+                al.set(al.size() - 1, gc);
+            } else {
+                al.add(new GC(nfthdl.substring(start,end)));
+            }
+            just_saw_0f7f
+                = (end > start && '\u0f7f' == nfthdl.charAt(end - 1));
+        }
+        return al;
+    }
+
+    /** Converts Tibetan Unicode to computer-friendly EWTS
+     * transliteration.  Computer-friendly is not human-friendly but
+     * hopefully even poorly written EWTS->Tibetan converters could
+     * handle the output.  If errors is non-null, error messages are
+     * appended to it.  (Errors are always inline.) */
+    public static String convertToEwtsForComputers(String unicode,
+                                                   StringBuffer errors) {
+
+        // First, normalize as much as we can to reduce the number of
+        // cases we must handle.
+        String decomposed
+            = UnicodeUtils.toMostlyDecomposedUnicode(unicode,
+                                                     UnicodeUtils.NORM_NFTHDL);
+
+        // TODO(dchandler): optionally warn if we see
+        // "\u0f40\u0f74\u0f71" which is in the wrong order.
+
+        List gcs = SplitGC(decomposed);
+
+        StringBuffer sb = new StringBuffer();
+        for (Iterator it = gcs.iterator(); it.hasNext(); ) {
+            GC gc = (GC)it.next();
+            StringBuffer ewts = gc.getEwtsForComputers();
+            if (null == ewts) {
+                // TODO(dchandler): use ErrorsAndWarnings?
+                ewts = new StringBuffer("[#ERROR 301: The Unicode '"
+                                        + gc.getNfthdl()
+                                        + "' (has no EWTS transliteration]");
+                if (null != errors) {
+                    errors.append(ewts);
+                    errors.append('\n');
+                }
+            }
+            sb.append(ewts);
+        }
+        return sb.toString();
     }
 }
+
+// TODO(dchandler): give a mode where an error is given if non-Tibetan
+// or at least non-EWTS (think U+534D, e.g.) is found

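SplitGC above delegates the heavy lifting to java.text.BreakIterator. A minimal, standalone sketch of just that splitting step (the Locale("bo") choice mirrors the code above; the U+0F7F merging and the GC wrapper are omitted, and the class and method names here are made up for illustration):

    import java.text.BreakIterator;
    import java.util.ArrayList;
    import java.util.List;
    import java.util.Locale;

    public class GraphemeSplitSketch {
        /** Returns the raw character-break chunks that SplitGC starts from. */
        public static List splitIntoClusters(String s) {
            BreakIterator bi = BreakIterator.getCharacterInstance(new Locale("bo"));
            bi.setText(s);
            List chunks = new ArrayList();
            int start = bi.first();
            for (int end = bi.next(); end != BreakIterator.DONE;
                 start = end, end = bi.next()) {
                chunks.add(s.substring(start, end));
            }
            return chunks;
        }

        public static void main(String[] args) {
            // ka + vowel sign i + tsheg: the vowel sign should stay with its base.
            System.out.println(splitIntoClusters("\u0f40\u0f72\u0f0b"));
        }
    }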
@@ -20,8 +20,9 @@ package org.thdl.tib.text.reverter;

 import junit.framework.TestCase;

-import org.thdl.util.ThdlOptions;
+import org.thdl.tib.text.tshegbar.UnicodeUtils;
 import org.thdl.tib.text.ttt.ErrorsAndWarnings;
+import org.thdl.util.ThdlOptions;

 /** Tests the Converter class.
  *

@@ -47,9 +48,112 @@ public class ConverterTest extends TestCase {
         ThdlOptions.setUserPreference("thdl.debug", true);
     }

+    /** Asserts that converting s from Unicode to EWTS yields an
+     * error. */
+    private void err(String s) {
+        StringBuffer sb = new StringBuffer();
+        String ewts = Converter.convertToEwtsForComputers(s, sb);
+        boolean error = (sb.length() > 0);
+        if (!error) {
+            System.out.println("expected error but got EWTS '" + ewts
+                               + "' for "
+                               + UnicodeUtils.unicodeStringToPrettyString(s));
+        }
+        assertTrue(error);
+    }
+
+    /** Tests Converter.convertToEwtsForHumans. */
+    private void hconv(String uni, String ewts) {
+        System.out.println("TODO(dchandler): DLC: implement me");
+    }
+
+    /** Tests Converter.convertToEwtsForComputers. */
+    private void conv(String uni, String ewts) {
+        StringBuffer sb = new StringBuffer();
+        String actualEwts = Converter.convertToEwtsForComputers(uni, sb);
+        assertEquals("Expected " + ewts + " but got " + actualEwts + ":\n",
+                     ewts, actualEwts);
+        boolean error = (sb.length() > 0);
+        assertTrue(!error);
+    }
+
     public ConverterTest() { }

     public void testUnicodeToEwts() {
-        assertEquals(Converter.convertToEwts("\u0f40", null), "ka");
+        conv("\u0f56\u0f62\u0f9f\u0f42\u0f66\u0f0b", "bar+tagasa ");
+        conv("\u0f40", "ka");
+        // TODO(dchandler): DLC Tibetans use Arabic numerals and English punctuation.
+        // conv("0123456789.\u0f40", "0123456789.ka");
+        conv("\u0f40\u0f7b", "kai");
+        conv("\u0f40\u0f76", "k+r-i");
+        conv("\u0f40\u0020\u0f40", "ka_ka");
+        conv("\u0f40\n\u0f40\t\u0f40\r\n", "ka\nka\tka\r\n");
+        conv("\u0f04\u0f05\u0f40\u0f0c\u00a0\u0f42", "@#ka*_ga");
+        conv("\u0f42\u0f61", "gaya");
+        hconv("\u0f42\u0f61", "g.ya");
+        conv("\u0f42\u0fb1", "g+ya");
+        hconv("\u0f42\u0fb1", "gya");
+        conv("\u0f54\u0f7e", "paM");
+        conv("\u0f54\u0f71\u0f7e", "pAM");
+        conv("\u0f54\u0f7e", "paM");
+        conv("\u0f54\u0f74\u0f7e", "puM");
+        conv("\u0f54\u0fc6", "p\\u0FC6");
+        conv("\u0f40\u0f72\u0f74", "ku+i"); // bottom-to-top
+        conv("\u0f40\u0f72\u0f74\u0f39", "k^u+i"); // 0f39 first
+        conv("\u0f40\u0f73", "kI");
+        conv("\u0f40\u0f71\u0f72", "kI");
+        conv("\u0f40\u0f72\u0f71", "kI");
+        conv("\u0f40\u0f73\u0f74", "kU+i");
+        err("\u0f48");
+        err("\u0f32\u0f39");
+        err("\u0f47\u0f98");
+        conv("\u0fcc", "\\u0FCC");
+        err("\u0fcd");
+        err("\u0f90");
+        err("\u0f90\u0fc6");
+        conv("\u0f0b\u0fc6", " \\u0FC6"); // ugly but legal...
+        err("\u0f0b\u0f90");
+        err("\u0f0b\u0f74");
+        err("\u0f0b\u0f7f");
+        err("\u0f0b\u0f3e");
+        conv("\u0f32\u0f18", "\\u0F32\\u0F18");
+        conv("\u0f54\u0fa4\u0f90", "p+p+ka");
+        // TODO(dchandler): warn("\u0f54\u0fa4\u0f90\u0f39"); (or do
+        // CCCVs work for this?)
+        if (false) {
+            // 0f39 could go with any of the three, so we give an error:
+            err("\u0f54\u0fa4\u0f90\u0f74\u0f39");
+        } else {
+            // TODO(dchandler): I want an error, not this:
+            conv("\u0f54\u0fa4\u0f90\u0f74\u0f39", "p+p+k^u");
+        }
+        conv("\u0f54\u0fa4\u0f90\u0f39", "p+p+k^a");
+        conv("\u0f55\u0f39", "fa");
+        conv("\u0f55\u0f74\u0f39", "fu");
+        conv("\u0f56\u0f39", "va");
+        conv("\u0f56\u0f74\u0f39", "vu");
+        conv("\u0f54\u0f39\u0fa4\u0f90", "p^+p+ka");
+        conv("\u0f40\u0f7e", "kaM");
+        conv("\u0f40\u0f83", "ka~M");
+        conv("\u0f40\u0f82", "ka~M`");
+        conv("\u0f40\u0f84", "ka?");
+        conv("\u0f40\u0f85\u0f40", "ka&ka");
+        err("\u0f7f");
+        conv("\u0f40\u0f7f", "kaH");
+        conv("\u0f40\u0f7f\u0f72", "kiH");
+        conv("\u0f40\u0f7f\u0f7f\u0f72\u0f7f", "kiHHH");
+        conv("\u0f40\u0f7f\u0f7e", "kaHM");
+        conv("\u0f40\u0f7e\u0f7f", "kaMH");
+        conv("\u0f40\u0f7f\u0f7e\u0f72", "kiHM");
+        conv("\u0f04\u0f05", "@#");
+        conv("\u0f04\u0f05\u0f05", "@##");
+        conv("\u0f04", "@"); // TODO(dchandler): Is this ever seen
+                             // alone? warn/error otherwise.
+        conv("\u0f05", "#"); // TODO(dchandler): warn or error
     }
 }
+
+// TODO(dchandler): DLC: test all these round-trip, i.e. assert that
+// Uni->EWTS->Uni produces the same Uni.
+
+// TODO(dchandler): test with ZWSP or joiners or whatever weird crap
+// you can throw in legally to alter boundaries

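The trailing TODO asks for round-trip coverage. One way such a helper could look, sketched under the assumption that an EWTS-to-Unicode entry point is available somewhere in org.thdl.tib.text.ttt (the test list above mentions EwtsToUnicodeForXslt, but its exact API is not shown in this diff, so ewtsToUnicode() below is a hypothetical stand-in):

    /** Hypothetical round-trip check: Uni -> EWTS -> Uni should be a no-op. */
    private void roundTrip(String uni) {
        StringBuffer errors = new StringBuffer();
        String ewts = Converter.convertToEwtsForComputers(uni, errors);
        assertTrue(errors.length() == 0);
        // ewtsToUnicode() is a placeholder for the project's EWTS->Unicode converter.
        assertEquals(uni, ewtsToUnicode(ewts));
    }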
source/org/thdl/tib/text/reverter/GC.java  (new file, 200 lines added)

@@ -0,0 +1,200 @@
+/*
+The contents of this file are subject to the THDL Open Community License
+Version 1.0 (the "License"); you may not use this file except in compliance
+with the License. You may obtain a copy of the License on the THDL web site
+(http://www.thdl.org/).
+
+Software distributed under the License is distributed on an "AS IS" basis,
+WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
+License for the specific terms governing rights and limitations under the
+License.
+
+The Initial Developer of this software is the Tibetan and Himalayan Digital
+Library (THDL). Portions created by the THDL are Copyright 2005 THDL.
+All Rights Reserved.
+
+Contributor(s): ______________________________________.
+*/
+
+package org.thdl.tib.text.reverter;
+
+import java.util.regex.Pattern;
+import java.util.regex.Matcher;
+
+import org.thdl.util.ThdlDebug;
+import org.thdl.tib.text.THDLWylieConstants;
+import org.thdl.tib.text.tshegbar.UnicodeUtils;
+import org.thdl.tib.text.tshegbar.UnicodeCodepointToThdlWylie;
+
+/** Grapheme cluster backed by a String of Unicode.  For the most part
+ * these are <em>combining character sequences</em> as defined by
+ * Unicode, but (U+0F04 U+0F05+) [TODO(dchandler): not yet handled as
+ * a single GC] is an example of a grapheme cluster that is not a
+ * combining character sequence.
+ * @author David Chandler
+ */
+class GC {
+    /** NFTHDL-decomposed Unicode */
+    private String nfthdl;
+
+    /** True if valid.  True for digits w/ digit combiners, character
+     * stack plus optional wowels, a standalone mark.  False for
+     * anything else, e.g. "\u0f0b\u0f90". */
+    private boolean valid;
+
+    /** Constructor that takes the NFTHDL-decomposed Unicode for the
+     * grapheme cluster. */
+    public GC(String nfthdl) {
+        setNfthdl(nfthdl);
+    }
+
+    /** A regex that matches the NFTHDL Unicode for a consonant stack
+     * with optional wowels. */
+    public static String consonantStackRegexString
+        = "[\u0f40-\u0f47\u0f49-\u0f6a]"              // base consonant
+        + "[\u0f90-\u0f97\u0f99-\u0fbc\u0f39]*"       // subjoined cons.
+        + "\u0f71?"                                   // a-chung
+        + "[\u0f72\u0f73\u0f74\u0f7a-\u0f7d\u0f80]*"  // vowel proper
+        + "[\u0f35\u0f37\u0f7e\u0f7f\u0f82-\u0f84"    // wowels
+        + "\u0f86\u0f87\u0fc6]*";
+
+    private static Pattern validGcRegex = Pattern.compile(
+        "^"
+        // numeric:
+        + "([\u0f20-\u0f33][\u0f18\u0f19]*)|"
+
+        // consonant w/ optional wowels:
+        + "(" + consonantStackRegexString + ")|"
+
+        // other symbol with optional U+0FC6
+        + "([\u0f00-\u0f17\u0f1a-\u0f1f\u0f34\u0f36\u0f38"
+        + "\u0f3a-\u0f3d\u0f85\u0f88-\u0f8b\u0fbe-\u0fc5"
+        + "\u0fc7-\u0fcc\u0fcf-\u0fd1]\u0fc6?)|"
+
+        // other symbol that does not take U+0FC6.
+        // TODO(dchandler): include 0f0b etc. in this group?
+        + "([ \t\u00a0\n\r]{1,})" // DLC handling of English... [0-9\\.:a-zA-Z] etc. what to do?
+
+        + "$");
+
+    private static final boolean debug = false;
+
+    /** Returns NFTHDL-decomposed Unicode representing this grapheme
+     * cluster. */
+    private void setNfthdl(String nfthdl) {
+        if (debug) {
+            System.out.println("debug: GC is "
+                               + UnicodeUtils.unicodeStringToPrettyString(nfthdl));
+        }
+        this.nfthdl = nfthdl;
+        assert (nfthdl.length() > 0);
+        if (nfthdl.length() < 1)
+            valid = false;
+        valid = validGcRegex.matcher(nfthdl).matches();
+    }
+
+    /** Returns NFTHDL-decomposed Unicode representing this grapheme
+     * cluster. */
+    public String getNfthdl() { return nfthdl; }
+
+    /** Returns true iff ch is a vowel proper, not a wowel */
+    private boolean isVowel(char ch) {
+        // (We won't see \u0f76 etc. in NFTHDL, but the handling of
+        // them is suspect.)
+        return ((ch >= '\u0f71' && ch <= '\u0f75')
+                || (ch >= '\u0f7a' && ch <= '\u0f7d')
+                || (ch >= '\u0f81' && ch <= '\u0f82'));
+    }
+
+    private boolean isWowelRequiringPrecedingVowel(char ch) {
+        // not 0f39 0f18 0f19 e.g.
+        return ("\u0f35\u0f37\u0f7e\u0f7f\u0f82\u0f83\u0f84\u0f86\u0f87".indexOf(ch) >= 0);
+
+        // NOTE: 0f7f is questionable 0fc6 too... we assume [k\\u0fc6]
+        // is good EWTS.
+    }
+
+    /** Returns EWTS that is valid but not beautiful.  It's better
+     * suited for consumption by computer programs than by humans,
+     * though it'll do in a pinch.  (Humans like to see [rnams] instead
+     * of [r+namasa].)
+     * @return null if this grapheme cluster has no valid EWTS
+     * representation or valid-but-ugly EWTS otherwise */
+    public StringBuffer getEwtsForComputers() {
+        if (!valid) {
+            return null;
+        }
+        StringBuffer sb = new StringBuffer();
+        // We use ch after the loop.  Initialization is not really
+        // needed; it's just to avoid compiler errors.
+        char ch = 'X';
+        boolean seenVowel = false;
+        String lastEwts = "";
+        boolean added_aVOWEL = false;
+        for (int i = 0; i < nfthdl.length(); i++) {
+            ch = nfthdl.charAt(i);
+            String ewts
+                = UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(ch);
+            if (i + 1 < nfthdl.length()) { // lookahead
+                // Even computers want to see kI because the spec
+                // isn't (or at least hasn't always been) crystal
+                // clear that kA+i is equivalent to kI.
+                if (('\u0f55' == ch || '\u0fa5' == ch)
+                    && '\u0f39' == nfthdl.charAt(i + 1)) {
+                    ++i;
+                    ewts = "f"; // TODO(dchandler): hard-coded EWTS
+                } else if (('\u0f56' == ch || '\u0fa6' == ch)
+                           && '\u0f39' == nfthdl.charAt(i + 1)) {
+                    ++i;
+                    ewts = "v"; // TODO(dchandler): hard-coded EWTS
+                } else if ('\u0f71' == ch && '\u0f72' == nfthdl.charAt(i + 1)) {
+                    ++i;
+                    ewts = THDLWylieConstants.I_VOWEL;
+                    // NOTE: we could normalize to 0f73 and 0f75 when
+                    // possible in NFTHDL.  That's closer to EWTS and
+                    // would avoid these two special cases.
+                } else if ('\u0f71' == ch && '\u0f74' == nfthdl.charAt(i + 1)) {
+                    ++i;
+                    ewts = THDLWylieConstants.U_VOWEL;
+                }
+            }
+            if (null == ewts && UnicodeUtils.isInTibetanRange(ch)) {
+                return null;
+            }
+            if (UnicodeUtils.isSubjoinedConsonant(ch)
+                || (seenVowel && isVowel(ch)))
+                sb.append(THDLWylieConstants.WYLIE_SANSKRIT_STACKING_KEY);
+            if (isWowelRequiringPrecedingVowel(ch) && !seenVowel) {
+                if (!added_aVOWEL) {
+                    added_aVOWEL = true;
+                    sb.append(THDLWylieConstants.WYLIE_aVOWEL); // paM, no pM
+                }
+            }
+            if (isVowel(ch)) {
+                seenVowel = true;
+            }
+            sb.append(ewts);
+            lastEwts = ewts;
+        }
+        if (UnicodeUtils.isNonSubjoinedConsonant(ch)
+            || UnicodeUtils.isSubjoinedConsonant(ch)
+            || '\u0f39' == ch) {
+            ThdlDebug.verify(!added_aVOWEL);
+            sb.append(THDLWylieConstants.WYLIE_aVOWEL);
+        }
+        return sb;
+    }
+
+    public int hashCode() { return nfthdl.hashCode(); }
+
+    public boolean equals(Object o) {
+        return (o instanceof GC && ((GC)o).getNfthdl().equals(getNfthdl()));
+    }
+
+    /** Quasi-XML for humans */
+    public String toString() {
+        return "<GC valid=" + valid + " pretty=\""
+            + UnicodeUtils.unicodeStringToPrettyString(getNfthdl())
+            + "\"/>";
+    }
+}

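To make the validGcRegex classification in GC.java concrete, here is a small illustrative sketch that applies the consonant-stack character classes copied from consonantStackRegexString above; the wrapper class and the example inputs are not part of the commit.

    import java.util.regex.Pattern;

    public class GcRegexSketch {
        // Character classes copied from GC.consonantStackRegexString above.
        private static final String CONSONANT_STACK =
            "[\u0f40-\u0f47\u0f49-\u0f6a]"                 // base consonant
            + "[\u0f90-\u0f97\u0f99-\u0fbc\u0f39]*"        // subjoined consonants
            + "\u0f71?"                                    // a-chung
            + "[\u0f72\u0f73\u0f74\u0f7a-\u0f7d\u0f80]*"   // vowels proper
            + "[\u0f35\u0f37\u0f7e\u0f7f\u0f82-\u0f84\u0f86\u0f87\u0fc6]*"; // wowels

        public static void main(String[] args) {
            Pattern p = Pattern.compile("^(" + CONSONANT_STACK + ")$");
            System.out.println(p.matcher("\u0f40\u0f72").matches()); // ka + vowel i -> true
            System.out.println(p.matcher("\u0f0b\u0f90").matches()); // invalid example from the GC javadoc -> false
        }
    }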
@@ -32,11 +32,12 @@ public class UnicodeToTranslitForXslt {
     }

     /** Converts Tibetan Unicode to EWTS transliteration. */
-    public static String unicodeToEwts(String unicode) {
-        return Converter.convertToEwts(unicode, null);
+    public static String unicodeToEwtsForComputers(String unicode) {
+        return Converter.convertToEwtsForComputers(unicode, null);
     }

     /** Converts Tibetan Unicode to ACIP transliteration. */
     public static String unicodeToAcip(String unicode) {
-        throw new Error("DLC: not yet");
+        throw new Error("TODO(dchandler): not yet");
     }
 }

@@ -50,12 +50,15 @@ public class UnicodeToTranslitForXsltTest extends TestCase {
     public UnicodeToTranslitForXsltTest() { }

     public void testUnicodeToEwts() {
-        assertEquals(UnicodeToTranslitForXslt.unicodeToEwts("\u0f40"), "ka");
-        assertEquals(UnicodeToTranslitForXslt.unicodeToEwts("\u0f56\u0f62\u0f4f\u0f42\u0f66\u0f0b"), "brtags ");
+        assertEquals("ka", UnicodeToTranslitForXslt.unicodeToEwtsForComputers("\u0f40"));
+        assertEquals("g+ya", UnicodeToTranslitForXslt.unicodeToEwtsForComputers("\u0f42\u0fb1"));
+        // TODO(dchandler): assertEquals("brtags ", UnicodeToTranslitForXslt.unicodeToEwtsForHumans("\u0f56\u0f62\u0f9f\u0f42\u0f66\u0f0b"));
     }

     public void testUnicodeToAcip() {
-        assertEquals(UnicodeToTranslitForXslt.unicodeToEwts("\u0f40"), "KA");
-        assertEquals(UnicodeToTranslitForXslt.unicodeToEwts("\u0f56\u0f62\u0f4f\u0f42\u0f66\u0f0b"), "BRTAGS ");
+        if (false) {
+            assertEquals("KA", UnicodeToTranslitForXslt.unicodeToAcip("\u0f40"));
+            assertEquals("BRTAGS ", UnicodeToTranslitForXslt.unicodeToAcip("\u0f56\u0f62\u0f9f\u0f42\u0f66\u0f0b"));
+        }
     }
 }

@@ -463,6 +463,7 @@ public final class LegalTshegBar
      * concatenation like 'u'i'o.  Returns false otherwise (including
      * the case that suffix is the empty string). */
     public static boolean isAchungBasedSuffix(String suffix) {
+        // TODO(dchandler): use java.util.regex
         int i = 0; // so that the empty string causes false to be returned.
         while (i == 0 || !suffix.equals("")) {
             boolean startsWithOneOfThem = false;

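A sketch of the regex alternative the TODO above suggests. The alternation (achung U+0F60 followed by a vowel sign, or by NGA/MA for 'ang and 'am) is taken from the mightBeLegalTshegBarRegex added to Converter.java in this same commit; whether isAchungBasedSuffix actually receives raw Unicode in that form is an assumption to verify before adopting this.

    // Assumes the suffix argument is Unicode codepoints, e.g. 'u'i'o as
    // U+0F60 U+0F74 U+0F60 U+0F72 U+0F60 U+0F7C.
    private static final java.util.regex.Pattern ACHUNG_SUFFIX =
        java.util.regex.Pattern.compile("(\u0f60[\u0f72\u0f74\u0f7c\u0f44\u0f58])+");

    public static boolean isAchungBasedSuffixViaRegex(String suffix) {
        // matches() on the empty string is false, preserving the documented contract.
        return ACHUNG_SUFFIX.matcher(suffix).matches();
    }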
@@ -67,11 +67,16 @@ public class UnicodeCodepointToThdlWylie {
         // fail.

         switch (x) {
+        case '\t': return "\t";
+        case '\n': return "\n";
+        case '\r': return "\r";
+        case ' ': return "_";
+        case '\u00a0': return "_";
+
         case '\u0F00': return "oM";
         case '\u0F01': return "\\u0F01";
-        case '\u0F02': return null; // DLC
-        case '\u0F03': return null; // DLC
+        case '\u0F02': return "\\u0F02";
+        case '\u0F03': return "\\u0F03";
         case '\u0F04': return "@";
         case '\u0F05': return "#";
         case '\u0F06': return "$";

@@ -314,8 +319,6 @@ public class UnicodeCodepointToThdlWylie {
         case '\u0FCF': return "\\u0FCF"; // DLC i added this to the 'EWTS document misspeaks' bug report... null I think...

         default: {
-            // DLC handle space (EW's "_")
-
             // This codepoint is in the range 0FD0-0FFF or is not in
             // the Tibetan range at all.  In either case, there is no
             // corresponding THDL Extended Wylie.

@@ -102,7 +102,10 @@ public class UnicodeUtils implements UnicodeConstants {
        nor NFKD breaks down <code>U+0F00</code> into its constituent
        codepoints.  NFTHDL uses a maximum of codepoints, and it never
        uses codepoints whose use has been {@link #isDiscouraged(char)
-       discouraged}.
+       discouraged}.  NFTHDL also does not screw things up by using
+       the standard-but-wrong CCCVs.  It sorts stretches of combining
+       characters wisely as per
+       {@link http://iris.lib.virginia.edu/tibet/xml/showEssay.php?xml=/tools/encodingTib.xml}.

        <p>The Tibetan passages of the returned string are in the
        chosen normalized form, but codepoints outside of the {@link

@@ -136,6 +139,9 @@ public class UnicodeUtils implements UnicodeConstants {
                 tibetanUnicode.insert(offset, s);
             }
         }
+        if (normForm == NORM_NFTHDL) {
+            fixSomeOrderingErrorsInTibetanUnicode(tibetanUnicode);
+        }
     }

     /** Like {@link #toMostlyDecomposedUnicode(StringBuffer, byte)},

@@ -418,7 +424,39 @@
     * product.)
     */
    private static char unicode_pairs[][]
-        = { { '\u0f71', '\u0f74' },
+        = {
+            /* TODO(dchandler): use regex
+             * "[\u0f39\u0f71-\u0f84\u0f86\u0f87]{2,}" to find patches
+             * that need sorting and then sort each of those.  This
+             * cross product is ugly. */
+
+            { '\u0f39', '\u0f71' },
+            { '\u0f39', '\u0f72' },
+            { '\u0f39', '\u0f74' },
+            { '\u0f39', '\u0f7a' },
+            { '\u0f39', '\u0f7b' },
+            { '\u0f39', '\u0f7c' },
+            { '\u0f39', '\u0f7d' },
+            { '\u0f39', '\u0f7e' },
+            { '\u0f39', '\u0f7f' },
+            { '\u0f39', '\u0f80' },
+            { '\u0f39', '\u0f82' },
+            { '\u0f39', '\u0f83' },
+
+            { '\u0f71', '\u0f7f' },
+            { '\u0f72', '\u0f7f' },
+            { '\u0f74', '\u0f7f' },
+            { '\u0f7a', '\u0f7f' },
+            { '\u0f7b', '\u0f7f' },
+            { '\u0f7c', '\u0f7f' },
+            { '\u0f7d', '\u0f7f' },
+            // but not { '\u0f7e', '\u0f7f' },
+            { '\u0f39', '\u0f7f' },
+            { '\u0f80', '\u0f7f' },
+            { '\u0f82', '\u0f7f' },
+            { '\u0f83', '\u0f7f' },
+
+            { '\u0f71', '\u0f74' },
+
             { '\u0f71', '\u0f72' },
             { '\u0f71', '\u0f7a' },

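The TODO at the head of the table sketches a different strategy: locate each run of combining marks with a regex and sort the run, instead of enumerating swap pairs. A rough illustration of that idea follows; the codepoint-order sort is only a placeholder for whatever canonical ordering the pair table and the encodingTib.xml essay actually prescribe.

    import java.util.Arrays;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    public class CombiningRunSortSketch {
        private static final Pattern RUN =
            Pattern.compile("[\u0f39\u0f71-\u0f84\u0f86\u0f87]{2,}");

        /** Sorts every run of two or more combining marks; not a drop-in
         *  replacement for fixSomeOrderingErrorsInTibetanUnicode. */
        public static String sortCombiningRuns(String s) {
            Matcher m = RUN.matcher(s);
            StringBuffer out = new StringBuffer();
            while (m.find()) {
                char[] run = m.group().toCharArray();
                Arrays.sort(run); // placeholder ordering: plain codepoint order
                m.appendReplacement(out, Matcher.quoteReplacement(new String(run)));
            }
            m.appendTail(out);
            return out.toString();
        }
    }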
@@ -489,7 +527,9 @@
     * the same file modulo Unicode booboos would be better. </p>
     *
     * @param sb the buffer to be mutated
-    * @return true if sb was mutated */
+    * @return true if sb was mutated
+    * @see <a href="http://iris.lib.virginia.edu/tibet/xml/showEssay.php?xml=/tools/encodingTib.xml">Tibetan Encoding Model</a>
+    */
    public static boolean fixSomeOrderingErrorsInTibetanUnicode(StringBuffer sb) {
        boolean mutated = false;
        int len = sb.length();

@@ -512,25 +552,5 @@
        } while (mutated_this_time_through);
        return mutated;
    }
-
-    /** Returns true iff ch is a valid Tibetan codepoint in Unicode
-     * 4.0: */
-    public boolean isTibetanUnicodeCodepoint(char ch) {
-        // NOTE: could use an array of 256 booleans for speed but I'm lazy
-        return ((ch >= '\u0f00' && ch <= '\u0fcf')
-                && !(ch == '\u0f48'
-                     || (ch > '\u0f6a' && ch < '\u0f71')
-                     || (ch > '\u0f8b' && ch < '\u0f90')
-                     || ch == '\u0f98'
-                     || ch == '\u0fbd'
-                     || ch == '\u0fcd'
-                     || ch == '\u0fce'));
-    }
-
-    /** Returns true iff ch is in 0F00-0FFF but isn't a valid Tibetan
-     * codepoint in Unicode 4.0: */
-    public boolean isInvalidTibetanUnicode(char ch) {
-        return (isInTibetanRange(ch) && !isTibetanUnicodeCodepoint(ch));
-    }
 }

@@ -798,6 +798,7 @@ public class EWTSTest extends TestCase {
         just_ewts2uni_test("\\uefff", "\uefff");
     }

+        ewts2uni_test("kaHH", "\u0F40\u0f7f\u0f7f");

     // Below was semiautomatically generated from the EWTS spec's
     // 'ewts.xml' representation (early August 2004 edition):

@@ -405,6 +405,12 @@ class TPairListFactory {

                    "\u0f74", THDLWylieConstants.u_VOWEL,

+                   // TODO(dchandler): equivalence classes I'm not
+                   // sure.
+                   // http://iris.lib.virginia.edu/tibet/xml/showEssay.php?xml=/tools/encodingTib.xml
+                   // says to go above base and then upwards.  Think
+                   // it over.
+
                    // equivalence class:
                    "\u0f72", THDLWylieConstants.i_VOWEL,
                    "\u0f7a", THDLWylieConstants.e_VOWEL,