Added a class for performing EWTS->Unicode conversions during XSLT
transformations. I haven't actually used it with Xalan XSLT yet, but it ought to work with Xalan if TibetanHTML did (which it must have at one point). There is a unit test, but what we really need is an end-to-end test with Xalan.
This commit is contained in:
parent
6260c0889d
commit
dc18165992
5 changed files with 153 additions and 12 deletions
12
build.xml
12
build.xml
|
@ -364,11 +364,18 @@ the jvm starting tomcat:
|
||||||
<param name="my.included.source.file"
|
<param name="my.included.source.file"
|
||||||
value="org/thdl/tib/input/Jskad.java"/>
|
value="org/thdl/tib/input/Jskad.java"/>
|
||||||
</antcall>
|
</antcall>
|
||||||
|
<!-- For XSLT, we want this in jskad.jar: -->
|
||||||
<antcall target="our-internal-javac-task">
|
<antcall target="our-internal-javac-task">
|
||||||
<param name="mybin" value="${jskadbin}"/>
|
<param name="mybin" value="${jskadbin}"/>
|
||||||
<param name="my.included.source.file"
|
<param name="my.included.source.file"
|
||||||
value="org/thdl/tib/text/TibetanHTML.java"/>
|
value="org/thdl/tib/text/TibetanHTML.java"/>
|
||||||
</antcall>
|
</antcall>
|
||||||
|
<!-- For XSLT, we want this in jskad.jar: -->
|
||||||
|
<antcall target="our-internal-javac-task">
|
||||||
|
<param name="mybin" value="${jskadbin}"/>
|
||||||
|
<param name="my.included.source.file"
|
||||||
|
value="org/thdl/tib/text/ttt/EwtsToUnicodeForXslt.java"/>
|
||||||
|
</antcall>
|
||||||
<!-- Put org.thdl.VerboseUnicodeDump in Jskad's jar for those who
|
<!-- Put org.thdl.VerboseUnicodeDump in Jskad's jar for those who
|
||||||
want to use it. -->
|
want to use it. -->
|
||||||
<antcall target="our-internal-javac-task">
|
<antcall target="our-internal-javac-task">
|
||||||
|
@ -472,6 +479,11 @@ the jvm starting tomcat:
|
||||||
description="compiles all JUnit test cases that can be compiled in the present CLASSPATH (NB that this distinction is just wishful thinking for now because we have such weak test coverage at this point)" >
|
description="compiles all JUnit test cases that can be compiled in the present CLASSPATH (NB that this distinction is just wishful thinking for now because we have such weak test coverage at this point)" >
|
||||||
<mkdir dir="${junitbin}"/>
|
<mkdir dir="${junitbin}"/>
|
||||||
<antcall target="create-timestamp-source-code"/> <!-- DLC NOW! The -run targets are mucking with this! It isn't fatal, but it should be fixed. -->
|
<antcall target="create-timestamp-source-code"/> <!-- DLC NOW! The -run targets are mucking with this! It isn't fatal, but it should be fixed. -->
|
||||||
|
<antcall target="our-internal-javac-task">
|
||||||
|
<param name="mybin" value="${junitbin}"/>
|
||||||
|
<param name="my.included.source.file"
|
||||||
|
value="org/thdl/tib/text/ttt/EwtsToUnicodeForXsltTest.java"/>
|
||||||
|
</antcall>
|
||||||
<antcall target="our-internal-javac-task">
|
<antcall target="our-internal-javac-task">
|
||||||
<param name="mybin" value="${junitbin}"/>
|
<param name="mybin" value="${junitbin}"/>
|
||||||
<param name="my.included.source.file"
|
<param name="my.included.source.file"
|
||||||
|
|
|
@ -73,6 +73,7 @@
|
||||||
<formatter type="xml"/><!-- If not XML, then 'ant -buildfile
|
<formatter type="xml"/><!-- If not XML, then 'ant -buildfile
|
||||||
build.xml check-report' will fail. -->
|
build.xml check-report' will fail. -->
|
||||||
<sysproperty key="java.awt.headless" value="true"/>
|
<sysproperty key="java.awt.headless" value="true"/>
|
||||||
|
<test name="org.thdl.tib.text.ttt.EwtsToUnicodeForXsltTest"/>
|
||||||
<test name="org.thdl.tib.text.ttt.EWTSTest"/>
|
<test name="org.thdl.tib.text.ttt.EWTSTest"/>
|
||||||
<test name="org.thdl.tib.text.ttt.EWTStibwniniTest"/>
|
<test name="org.thdl.tib.text.ttt.EWTStibwniniTest"/>
|
||||||
<test name="org.thdl.tib.input.TMW_RTF_TO_THDL_WYLIETest"/>
|
<test name="org.thdl.tib.input.TMW_RTF_TO_THDL_WYLIETest"/>
|
||||||
|
|
46
source/org/thdl/tib/text/ttt/EwtsToUnicodeForXslt.java
Normal file
46
source/org/thdl/tib/text/ttt/EwtsToUnicodeForXslt.java
Normal file
|
@ -0,0 +1,46 @@
|
||||||
|
/*
|
||||||
|
The contents of this file are subject to the THDL Open Community License
|
||||||
|
Version 1.0 (the "License"); you may not use this file except in compliance
|
||||||
|
with the License. You may obtain a copy of the License on the THDL web site
|
||||||
|
(http://www.thdl.org/).
|
||||||
|
|
||||||
|
Software distributed under the License is distributed on an "AS IS" basis,
|
||||||
|
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
|
||||||
|
License for the specific terms governing rights and limitations under the
|
||||||
|
License.
|
||||||
|
|
||||||
|
The Initial Developer of this software is the Tibetan and Himalayan Digital
|
||||||
|
Library (THDL). Portions created by the THDL are Copyright 2005 THDL.
|
||||||
|
All Rights Reserved.
|
||||||
|
|
||||||
|
Contributor(s): ______________________________________.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.thdl.tib.text.ttt;
|
||||||
|
|
||||||
|
/** A utility class for use in XSL transformations that converts EWTS
|
||||||
|
* transliteration to Unicode. This is intended to be used by Xalan
|
||||||
|
* XSLT to convert an XML document that uses Wylie into
|
||||||
|
* HTML/text/whatever that uses Unicode (probably TibetanMachineUni
|
||||||
|
* font). It exposes only a static method and cannot be instantiated.
|
||||||
|
* @author David Chandler
|
||||||
|
*/
|
||||||
|
public class EwtsToUnicodeForXslt {
|
||||||
|
/** Not instantiable: static methods provide all the fun! Always throws an Error. */
|
||||||
|
private EwtsToUnicodeForXslt() {
|
||||||
|
throw new Error("There's no point in instantiating this class.");
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Converts EWTS transliteration into Tibetan Unicode by delegating
|
||||||
|
* to TConverter.convertToUnicodeText with EWTSTraits.instance().
|
||||||
|
* TODO(dchandler): must we worry about the encoding, UTF-8
|
||||||
|
* vs. UTF-16LE e.g.?
|
||||||
|
* @param ewts the EWTS transliteration to convert
|
||||||
|
* @return the String produced by TConverter.convertToUnicodeText
|
||||||
|
* (NOTE(review): may be null or carry inline error text when the
|
||||||
|
* input has errors -- confirm against TConverter) */
|
||||||
|
public static String convertEwtsTo(String ewts) {
|
||||||
|
return TConverter.convertToUnicodeText(EWTSTraits.instance(),
|
||||||
|
ewts,
|
||||||
|
||||||
|
new StringBuffer(), // throwaway buffer; it is never read after the call
|
||||||
|
null, // presumably the warnings buffer (none collected) -- TODO confirm parameter order
|
||||||
|
false,
|
||||||
|
||||||
|
"None",
|
||||||
|
false); // NOTE(review): the meaning of the trailing arguments depends on TConverter's signature -- confirm
|
||||||
|
}
|
||||||
|
}
|
79
source/org/thdl/tib/text/ttt/EwtsToUnicodeForXsltTest.java
Normal file
79
source/org/thdl/tib/text/ttt/EwtsToUnicodeForXsltTest.java
Normal file
|
@ -0,0 +1,79 @@
|
||||||
|
/*
|
||||||
|
The contents of this file are subject to the THDL Open Community License
|
||||||
|
Version 1.0 (the "License"); you may not use this file except in compliance
|
||||||
|
with the License. You may obtain a copy of the License on the THDL web site
|
||||||
|
(http://www.thdl.org/).
|
||||||
|
|
||||||
|
Software distributed under the License is distributed on an "AS IS" basis,
|
||||||
|
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
|
||||||
|
License for the specific terms governing rights and limitations under the
|
||||||
|
License.
|
||||||
|
|
||||||
|
The Initial Developer of this software is the Tibetan and Himalayan Digital
|
||||||
|
Library (THDL). Portions created by the THDL are Copyright 2005 THDL.
|
||||||
|
All Rights Reserved.
|
||||||
|
|
||||||
|
Contributor(s): ______________________________________.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.thdl.tib.text.ttt;
|
||||||
|
|
||||||
|
import junit.framework.TestCase;
|
||||||
|
|
||||||
|
import org.thdl.util.ThdlOptions;
|
||||||
|
|
||||||
|
/** Tests EwtsToUnicodeForXslt at the unit level. For such a class, a
|
||||||
|
* much more important test is one that actually uses XSLT.
|
||||||
|
* TODO(dchandler): write such a test. You may even be able to use
|
||||||
|
* JUnit for it.
|
||||||
|
*
|
||||||
|
* @author David Chandler */
|
||||||
|
public class EwtsToUnicodeForXsltTest extends TestCase {
|
||||||
|
|
||||||
|
/** Invokes a text UI and runs all this class's tests. */
|
||||||
|
public static void main(String[] args) {
|
||||||
|
junit.textui.TestRunner.run(EwtsToUnicodeForXsltTest.class);
|
||||||
|
}
|
||||||
|
|
||||||
|
protected void setUp() { // Puts ThdlOptions into a known state before the test runs.
|
||||||
|
// We don't want to use options.txt, so initialize without the default options file:
|
||||||
|
ThdlOptions.forTestingOnlyInitializeWithoutDefaultOptionsFile();
|
||||||
|
|
||||||
|
ThdlOptions.setUserPreference("thdl.acip.to.tibetan.warning.and.error.severities.are.built.in.defaults", "true");
|
||||||
|
ThdlOptions.setUserPreference("thdl.acip.to.tibetan.warning.severity.507", "Most");
|
||||||
|
ErrorsAndWarnings.setupSeverityMap(); // presumably rebuilds the severity map from the two preferences just set -- TODO confirm
|
||||||
|
|
||||||
|
// We don't want to load the TM or TMW font files ourselves, so rely on system fonts:
|
||||||
|
ThdlOptions.setUserPreference("thdl.rely.on.system.tmw.fonts", true);
|
||||||
|
ThdlOptions.setUserPreference("thdl.rely.on.system.tm.fonts", true);
|
||||||
|
ThdlOptions.setUserPreference("thdl.debug", true);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public EwtsToUnicodeForXsltTest() { }
|
||||||
|
|
||||||
|
private static void help(String ewts, String expected) { // Converts ewts and asserts that the result equals expected.
|
||||||
|
String actual = EwtsToUnicodeForXslt.convertEwtsTo(ewts);
|
||||||
|
assertEquals(expected, actual);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testIt() throws java.io.IOException { // NOTE(review): nothing in this body visibly throws IOException
|
||||||
|
help("ga", // a single EWTS syllable maps to U+0F42
|
||||||
|
"\u0f42");
|
||||||
|
help("\u0f00\u0f01\u0f02 \u0f03 \u0fcf", // Tibetan code points pass through; each space becomes U+0F0B
|
||||||
|
"\u0f00\u0f01\u0f02\u0f0b\u0f03\u0f0b"
|
||||||
|
+ "\u0fcf");
|
||||||
|
// TODO(dchandler): I think EWTS->Tibetan ought not to give errors
|
||||||
|
// about the disambiguators here (case disabled until it does not):
|
||||||
|
// help("\u0f00.\u0f01.\u0f02 \u0f03 \u0fcf",
|
||||||
|
// "\u0f00\u0f01\u0f02\u0f0b\u0f03\u0f0b"
|
||||||
|
// + "\u0fcf");
|
||||||
|
help("k+Shu+A+i+o+eHM", // one stack carrying several vowel signs plus M and H
|
||||||
|
"\u0f40\u0fb5\u0f71\u0f74\u0f72\u0f7a\u0f7c\u0f7e"
|
||||||
|
+ "\u0f7f");
|
||||||
|
help(" . ", // a lone '.' yields ERROR 130 embedded in the output between two U+0F0B
|
||||||
|
"\u0f0b[#ERROR 130: The tsheg bar (\"syllable\") {.} is"
|
||||||
|
+ " essentially nothing.]\u0f0b");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
@ -196,18 +196,18 @@ public class TConverter {
|
||||||
loc[0] == tdoc.getLength());
|
loc[0] == tdoc.getLength());
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Returns UTF-8 encoded Unicode. A bit indirect, so use this
|
/** Returns the Unicode that the given translit corresponds to. A
|
||||||
* for testing only if performance is a concern. If errors occur
|
* bit indirect, so use this for testing only if performance is a
|
||||||
* in scanning the transliteration or in converting a tsheg bar,
|
* concern. If errors occur in scanning the transliteration or
|
||||||
* then they are appended to errors if errors is non-null, as
|
* in converting a tsheg bar, then they are appended to errors if
|
||||||
* well as written to the result. If warnings occur in scanning
|
* errors is non-null, as well as written to the result. If
|
||||||
* the transliteration or in converting a tsheg bar, then they
|
* warnings occur in scanning the transliteration or in
|
||||||
* are appended to warnings if warnings is non-null, and they are
|
* converting a tsheg bar, then they are appended to warnings if
|
||||||
* written to the result if writeWarningsToResult is true. Error
|
* warnings is non-null, and they are written to the result if
|
||||||
* and warning messages are long and self-contained unless
|
* writeWarningsToResult is true. Error and warning messages are
|
||||||
* shortMessages is true. Returns the conversion upon perfect
|
* long and self-contained unless shortMessages is true. Returns
|
||||||
* success or if there were merely warnings, null if errors
|
* the conversion upon perfect success or if there were merely
|
||||||
* occurred. */
|
* warnings, null if errors occurred. */
|
||||||
public static String convertToUnicodeText(TTraits ttraits,
|
public static String convertToUnicodeText(TTraits ttraits,
|
||||||
String translit,
|
String translit,
|
||||||
StringBuffer errors,
|
StringBuffer errors,
|
||||||
|
@ -229,6 +229,9 @@ public class TConverter {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
|
// Won't happen. UTF-8 is guaranteed to be a supported
|
||||||
|
// encoding, and ByteArrayOutputStreams don't have such
|
||||||
|
// problems, as far as I know.
|
||||||
throw new Error(e.toString());
|
throw new Error(e.toString());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue