The scaffolding for a Unicode->EWTS reverter. No guts yet.

This commit is contained in:
dchandler 2005-07-17 03:32:57 +00:00
parent ebc11a3425
commit 00afd75362
9 changed files with 282 additions and 16 deletions

View File

@ -479,6 +479,16 @@ the jvm starting tomcat:
description="compiles all JUnit test cases that can be compiled in the present CLASSPATH (NB that this distinction is just wishful thinking for now because we have such weak test coverage at this point)" >
<mkdir dir="${junitbin}"/>
<antcall target="create-timestamp-source-code"/> <!-- DLC NOW! The -run targets are mucking with this! It isn't fatal, but it should be fixed. -->
<antcall target="our-internal-javac-task">
<param name="mybin" value="${junitbin}"/>
<param name="my.included.source.file"
value="org/thdl/tib/text/reverter/UnicodeToTranslitForXsltTest.java"/>
</antcall>
<antcall target="our-internal-javac-task">
<param name="mybin" value="${junitbin}"/>
<param name="my.included.source.file"
value="org/thdl/tib/text/reverter/ConverterTest.java"/>
</antcall>
<antcall target="our-internal-javac-task">
<param name="mybin" value="${junitbin}"/>
<param name="my.included.source.file"

View File

@ -73,6 +73,10 @@
<formatter type="xml"/><!-- If not XML, then 'ant -buildfile
build.xml check-report' will fail. -->
<sysproperty key="java.awt.headless" value="true"/>
<!-- TODO(dchandler): DLC: enable these
<test name="org.thdl.tib.text.reverter.ConverterTest"/>
<test name="org.thdl.tib.text.reverter.UnicodeToTranslitForXsltTest"/>
-->
<test name="org.thdl.tib.text.ttt.EwtsToUnicodeForXsltTest"/>
<test name="org.thdl.tib.text.ttt.EWTSTest"/>
<test name="org.thdl.tib.text.ttt.EWTStibwniniTest"/>

View File

@ -97,14 +97,14 @@ class ConvertDialog extends JDialog
|| choices.getSelectedItem() == WYLIE_TO_UNI_TEXT);
}
private javax.swing.filechooser.FileFilter acipff, rtfff;
private javax.swing.filechooser.FileFilter textFileFilter, rtfFileFilter;
private void init()
{
jfc = new JFileChooser(controller.getDefaultDirectory());
jfc.setDialogTitle(LOCATE_FILE);
jfc.addChoosableFileFilter(acipff = new ACIPFileFilter());
jfc.addChoosableFileFilter(rtfff = new RTFFileFilter());
jfc.addChoosableFileFilter(textFileFilter = new TextFileFilter());
jfc.addChoosableFileFilter(rtfFileFilter = new RTFFileFilter());
content = new JPanel(new GridLayout(0,1));
JPanel temp = new JPanel(new FlowLayout(FlowLayout.CENTER,5,5));
@ -186,7 +186,7 @@ class ConvertDialog extends JDialog
content.add(buttonBox);
setContentPane(content);
pack();
setSize(new Dimension(600,240));
setSize(new Dimension(760,340));
}
private void setChoices(String[] choices)
@ -241,15 +241,17 @@ class ConvertDialog extends JDialog
if (src == browseOld) {
jfc.setFileFilter((ACIP_TO_UNI_TEXT.equals((String)choices.getSelectedItem())
|| WYLIE_TO_UNI_TEXT.equals((String)choices.getSelectedItem())
|| UNI_TO_WYLIE_TEXT.equals((String)choices.getSelectedItem())
|| ACIP_TO_TMW.equals((String)choices.getSelectedItem())
|| WYLIE_TO_TMW.equals((String)choices.getSelectedItem()))
? acipff : rtfff);
? textFileFilter : rtfFileFilter);
} else {
jfc.setFileFilter((ACIP_TO_UNI_TEXT.equals((String)choices.getSelectedItem())
|| WYLIE_TO_UNI_TEXT.equals((String)choices.getSelectedItem())
|| UNI_TO_WYLIE_TEXT.equals((String)choices.getSelectedItem())
|| TMW_TO_ACIP_TEXT.equals((String)choices.getSelectedItem())
|| TMW_TO_WYLIE_TEXT.equals((String)choices.getSelectedItem()))
? acipff : rtfff);
? textFileFilter : rtfFileFilter);
}
if (jfc.showOpenDialog(this) != jfc.APPROVE_OPTION)
return;
@ -445,13 +447,25 @@ class ConvertDialog extends JDialog
else
oldFileDirName = oldFileDirName + File.separator;
String oldFileNameSansThingy = of.getName();
if (oldFileNameSansThingy.startsWith("TMW_")) {
if (oldFileNameSansThingy.startsWith(suggested_TO_TMW_prefix)) {
oldFileNameSansThingy
= oldFileNameSansThingy.substring("TMW_".length(),
= oldFileNameSansThingy.substring(suggested_TO_TMW_prefix.length(),
oldFileNameSansThingy.length());
} else if (oldFileNameSansThingy.startsWith("TM_")) {
} else if (oldFileNameSansThingy.startsWith(suggested_TO_TM_prefix)) {
oldFileNameSansThingy
= oldFileNameSansThingy.substring("TM_".length(),
= oldFileNameSansThingy.substring(suggested_TO_TM_prefix.length(),
oldFileNameSansThingy.length());
} else if (oldFileNameSansThingy.startsWith(suggested_TO_UNI_prefix)) {
oldFileNameSansThingy
= oldFileNameSansThingy.substring(suggested_TO_UNI_prefix.length(),
oldFileNameSansThingy.length());
} else if (oldFileNameSansThingy.startsWith(suggested_ACIP_prefix)) {
oldFileNameSansThingy
= oldFileNameSansThingy.substring(suggested_ACIP_prefix.length(),
oldFileNameSansThingy.length());
} else if (oldFileNameSansThingy.startsWith(suggested_WYLIE_prefix)) {
oldFileNameSansThingy
= oldFileNameSansThingy.substring(suggested_WYLIE_prefix.length(),
oldFileNameSansThingy.length());
} else if (oldFileNameSansThingy.startsWith("TMW")) {
oldFileNameSansThingy
@ -481,8 +495,11 @@ class ConvertDialog extends JDialog
newFileNamePrefix = "TMW_to_same_TMW__";
newFileNameExtension = ".RTF";
} else { // conversion mode
if (TMW_TO_WYLIE == ct) {
if (TMW_TO_WYLIE == ct
|| UNI_TO_WYLIE_TEXT == ct) {
newFileNamePrefix = suggested_WYLIE_prefix;
if (UNI_TO_WYLIE_TEXT == ct)
newFileNameExtension = ".TXT";
} else if (TMW_TO_WYLIE_TEXT == ct) {
newFileNamePrefix = suggested_WYLIE_prefix;
newFileNameExtension = ".TXT";
@ -531,8 +548,7 @@ class ConvertDialog extends JDialog
}
}
// TODO(DLC)[EWTS->Tibetan]: we use for wylie (ewts) too...
public class ACIPFileFilter extends javax.swing.filechooser.FileFilter
public class TextFileFilter extends javax.swing.filechooser.FileFilter
{
public boolean accept(File f)
{

View File

@ -24,6 +24,7 @@ package org.thdl.tib.input;
@author Nathaniel Garson, Tibetan and Himalayan Digital Library */
interface FontConverterConstants
{
final String UNI_TO_WYLIE_TEXT = "Unicode to Wylie (UTF-8 Text->Text)";
final String WYLIE_TO_UNI_TEXT = "Wylie to Unicode (Text->Text)";
final String WYLIE_TO_TMW = "Wylie to TMW (Text->RTF)";
final String TMW_TO_SAME_TMW = "TMW to the same TMW (for testing only) (RTF->RTF)";
@ -60,6 +61,7 @@ interface FontConverterConstants
};
final String[] DEBUG_CHOICES = new String[] {
UNI_TO_WYLIE_TEXT,
TMW_TO_SAME_TMW,
WYLIE_TO_UNI_TEXT,
WYLIE_TO_TMW,

View File

@ -18,10 +18,12 @@ Contributor(s): ______________________________________.
package org.thdl.tib.input;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintStream;
import java.util.ArrayList;
@ -31,6 +33,7 @@ import javax.swing.text.StyleConstants;
import javax.swing.text.rtf.RTFEditorKit;
import org.thdl.tib.text.TibetanDocument;
import org.thdl.tib.text.reverter.Converter;
import org.thdl.tib.text.ttt.ACIPTraits;
import org.thdl.tib.text.ttt.EWTSTraits;
import org.thdl.tib.text.ttt.TConverter;
@ -89,6 +92,7 @@ public class TibetanConverter implements FontConverterConstants {
boolean convertToWylieTextMode = false;
boolean convertToACIPRTFMode = false;
boolean convertToACIPTextMode = false;
boolean convertUniToWylieTextMode = false;
boolean findSomeNonTMWMode = false;
boolean findAllNonTMWMode = false;
boolean findSomeNonTMMode = false;
@ -123,6 +127,8 @@ public class TibetanConverter implements FontConverterConstants {
= args[numArgs - 2].equals("--tmw-to-tmw-for-testing"))
|| (convertToTMMode
= args[numArgs - 2].equals("--to-tibetan-machine"))
|| (convertUniToWylieTextMode
= args[numArgs - 2].equals("--utf8-text-to-ewts-text"))
|| (convertToTMWMode
= args[numArgs - 2].equals("--to-tibetan-machine-web"))
|| (convertACIPToUniMode
@ -224,7 +230,8 @@ public class TibetanConverter implements FontConverterConstants {
out.println("are in your document waiting for your personal attention,");
out.println("43 if not even one glyph found was eligible for this conversion, which means");
out.println("that you probably selected the wrong conversion or the wrong document, or ");
out.println("nonzero otherwise.");
out.println("nonzero on some other error.");
// TODO(dchandler): describe 47 48 50 etc.
out.println("");
out.println("You may find it helpful to use `--find-some-non-tmw' mode (or");
out.println("`--find-some-non-tm' mode for Tibetan Machine input) before doing a");
@ -266,6 +273,8 @@ public class TibetanConverter implements FontConverterConstants {
conversionTag = TMW_TO_WYLIE;
} else if (convertToWylieTextMode) {
conversionTag = TMW_TO_WYLIE_TEXT;
} else if (convertUniToWylieTextMode) {
conversionTag = UNI_TO_WYLIE_TEXT;
} else if (convertToACIPRTFMode) {
conversionTag = TMW_TO_ACIP;
} else if (convertToACIPTextMode) {
@ -320,8 +329,36 @@ public class TibetanConverter implements FontConverterConstants {
static int reallyConvert(InputStream in, PrintStream out, String ct,
String warningLevel, boolean shortMessages,
boolean colors) {
if (ACIP_TO_UNI_TEXT == ct || ACIP_TO_TMW == ct
|| WYLIE_TO_UNI_TEXT == ct || WYLIE_TO_TMW == ct) {
if (UNI_TO_WYLIE_TEXT == ct) {
try {
String uniText;
{
// TODO(dchandler): use, here and elsewhere in the
// codebase,
// org.apache.commons.io.IOUtils.toString(InputStream,
// encoding)
StringBuffer s = new StringBuffer();
char ch[] = new char[8192];
BufferedReader bin
= new BufferedReader(new InputStreamReader(in,
"UTF-8"));
int amt;
while (-1 != (amt = bin.read(ch))) {
s.append(ch, 0, amt);
}
bin.close();
uniText = s.toString();
}
StringBuffer errors = new StringBuffer();
String ewtsText = Converter.convertToEwts(uniText, errors);
// TODO(dchandler): is 51 the right choice?
return (errors.length() > 0) ? 51 : 0;
} catch (IOException e) {
// TODO(dchandler): print it? where to?
return 48;
}
} else if (ACIP_TO_UNI_TEXT == ct || ACIP_TO_TMW == ct
|| WYLIE_TO_UNI_TEXT == ct || WYLIE_TO_TMW == ct) {
try {
ArrayList al
= ((ACIP_TO_UNI_TEXT == ct || ACIP_TO_TMW == ct)
@ -364,6 +401,7 @@ public class TibetanConverter implements FontConverterConstants {
else
return 0;
} catch (IOException e) {
// TODO(dchandler): print it? where to?
return 48;
}
} else {

View File

@ -0,0 +1,38 @@
/*
The contents of this file are subject to the THDL Open Community License
Version 1.0 (the "License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License on the THDL web site
(http://www.thdl.org/).
Software distributed under the License is distributed on an "AS IS" basis,
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
License for the specific terms governing rights and limitations under the
License.
The Initial Developer of this software is the Tibetan and Himalayan Digital
Library (THDL). Portions created by the THDL are Copyright 2005 THDL.
All Rights Reserved.
Contributor(s): ______________________________________.
*/
package org.thdl.tib.text.reverter;
/**
 * Holder for static routines that turn Tibetan Unicode into
 * transliteration: EWTS for now, and (TODO(dchandler)) ACIP later.
 *
 * @author David Chandler
 */
public class Converter {

    /**
     * Never meant to be called: everything this class offers is
     * static, so any attempt to instantiate it fails immediately.
     */
    private Converter() {
        throw new Error("There's no point in instantiating this class.");
    }

    /**
     * Converts Tibetan Unicode to EWTS transliteration.
     *
     * <p>Intended contract: when <code>errors</code> is non-null, error
     * messages are appended to it; errors are always reported inline
     * in the returned text as well.
     *
     * <p>This is scaffolding only. The method currently throws
     * unconditionally and neither argument is examined.
     *
     * @param unicode the Tibetan Unicode text to convert
     * @param errors a buffer to receive error messages, or null
     * @return the EWTS transliteration (once implemented)
     * @throws Error always, until the conversion is implemented
     */
    public static String convertToEwts(String unicode,
                                       StringBuffer errors /* DLC: use it */) {
        final String notImplemented = "DLC not yet";
        throw new Error(notImplemented);
    }
}

View File

@ -0,0 +1,55 @@
/*
The contents of this file are subject to the THDL Open Community License
Version 1.0 (the "License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License on the THDL web site
(http://www.thdl.org/).
Software distributed under the License is distributed on an "AS IS" basis,
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
License for the specific terms governing rights and limitations under the
License.
The Initial Developer of this software is the Tibetan and Himalayan Digital
Library (THDL). Portions created by the THDL are Copyright 2005 THDL.
All Rights Reserved.
Contributor(s): ______________________________________.
*/
package org.thdl.tib.text.reverter;
import junit.framework.TestCase;
import org.thdl.util.ThdlOptions;
import org.thdl.tib.text.ttt.ErrorsAndWarnings;
/** Tests the Converter class.
 *
 * @author David Chandler */
public class ConverterTest extends TestCase {
    /** Invokes a text UI and runs all this class's tests. */
    public static void main(String[] args) {
        junit.textui.TestRunner.run(ConverterTest.class);
    }

    /** Puts ThdlOptions and the error/warning severity map into a
     *  known state before each test so that results do not depend
     *  on the user's options.txt or on locally bundled font
     *  files. */
    protected void setUp() {
        // We don't want to use options.txt:
        ThdlOptions.forTestingOnlyInitializeWithoutDefaultOptionsFile();

        ThdlOptions.setUserPreference("thdl.acip.to.tibetan.warning.and.error.severities.are.built.in.defaults", "true");
        ThdlOptions.setUserPreference("thdl.acip.to.tibetan.warning.severity.507", "Most");
        ErrorsAndWarnings.setupSeverityMap();

        // We don't want to load the TM or TMW font files ourselves:
        ThdlOptions.setUserPreference("thdl.rely.on.system.tmw.fonts", true);
        ThdlOptions.setUserPreference("thdl.rely.on.system.tm.fonts", true);
        ThdlOptions.setUserPreference("thdl.debug", true);
    }

    /** JUnit instantiates test cases via this no-argument
     *  constructor. */
    public ConverterTest() { }

    /** Checks that the single letter U+0F40 converts to the EWTS
     *  string "ka". */
    public void testUnicodeToEwts() {
        // JUnit's signature is assertEquals(expected, actual); keeping
        // that order makes failure messages read correctly.
        assertEquals("ka", Converter.convertToEwts("\u0f40", null));
    }
}

View File

@ -0,0 +1,42 @@
/*
The contents of this file are subject to the THDL Open Community License
Version 1.0 (the "License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License on the THDL web site
(http://www.thdl.org/).
Software distributed under the License is distributed on an "AS IS" basis,
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
License for the specific terms governing rights and limitations under the
License.
The Initial Developer of this software is the Tibetan and Himalayan Digital
Library (THDL). Portions created by the THDL are Copyright 2005 THDL.
All Rights Reserved.
Contributor(s): ______________________________________.
*/
package org.thdl.tib.text.reverter;
/**
 * Entry points intended to be called from XSL transformations to
 * convert Unicode to EWTS or ACIP transliteration.
 *
 * <p>How a stylesheet invokes a Java extension is vendor-specific;
 * see the documentation of the XSLT processor in use (for example
 * Saxon or Xalan-Java).
 *
 * @author David Chandler
 */
public class UnicodeToTranslitForXslt {

    /**
     * Not instantiable: the class exposes static methods only, and
     * this constructor throws if it is ever reached.
     */
    private UnicodeToTranslitForXslt() {
        throw new Error("There's no point in instantiating this class.");
    }

    /**
     * Converts Tibetan Unicode to EWTS transliteration by delegating
     * to {@link Converter#convertToEwts(String, StringBuffer)} with
     * no error buffer.
     *
     * @param unicode the Tibetan Unicode text to convert
     * @return whatever the underlying converter returns
     */
    public static String unicodeToEwts(String unicode) {
        final StringBuffer noErrorBuffer = null;
        return Converter.convertToEwts(unicode, noErrorBuffer);
    }

    /**
     * Converts Tibetan Unicode to ACIP transliteration. Not yet
     * implemented: the method throws unconditionally.
     *
     * @param unicode the Tibetan Unicode text to convert
     * @return the ACIP transliteration (once implemented)
     * @throws Error always, until the conversion is implemented
     */
    public static String unicodeToAcip(String unicode) {
        final String notImplemented = "DLC: not yet";
        throw new Error(notImplemented);
    }
}

View File

@ -0,0 +1,61 @@
/*
The contents of this file are subject to the THDL Open Community License
Version 1.0 (the "License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License on the THDL web site
(http://www.thdl.org/).
Software distributed under the License is distributed on an "AS IS" basis,
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
License for the specific terms governing rights and limitations under the
License.
The Initial Developer of this software is the Tibetan and Himalayan Digital
Library (THDL). Portions created by the THDL are Copyright 2005 THDL.
All Rights Reserved.
Contributor(s): ______________________________________.
*/
package org.thdl.tib.text.reverter;
import junit.framework.TestCase;
import org.thdl.util.ThdlOptions;
import org.thdl.tib.text.ttt.ErrorsAndWarnings;
/** Tests the UnicodeToTranslitForXslt class.
 *
 * @author David Chandler */
public class UnicodeToTranslitForXsltTest extends TestCase {
    /** Invokes a text UI and runs all this class's tests. */
    public static void main(String[] args) {
        junit.textui.TestRunner.run(UnicodeToTranslitForXsltTest.class);
    }

    /** Puts ThdlOptions and the error/warning severity map into a
     *  known state before each test so that results do not depend
     *  on the user's options.txt or on locally bundled font
     *  files. */
    protected void setUp() {
        // We don't want to use options.txt:
        ThdlOptions.forTestingOnlyInitializeWithoutDefaultOptionsFile();

        ThdlOptions.setUserPreference("thdl.acip.to.tibetan.warning.and.error.severities.are.built.in.defaults", "true");
        ThdlOptions.setUserPreference("thdl.acip.to.tibetan.warning.severity.507", "Most");
        ErrorsAndWarnings.setupSeverityMap();

        // We don't want to load the TM or TMW font files ourselves:
        ThdlOptions.setUserPreference("thdl.rely.on.system.tmw.fonts", true);
        ThdlOptions.setUserPreference("thdl.rely.on.system.tm.fonts", true);
        ThdlOptions.setUserPreference("thdl.debug", true);
    }

    /** JUnit instantiates test cases via this no-argument
     *  constructor. */
    public UnicodeToTranslitForXsltTest() { }

    /** Checks Unicode-to-EWTS conversion of a single letter and of a
     *  syllable followed by a tsheg (U+0F0B). */
    public void testUnicodeToEwts() {
        // assertEquals takes (expected, actual).
        assertEquals("ka", UnicodeToTranslitForXslt.unicodeToEwts("\u0f40"));
        assertEquals("brtags ", UnicodeToTranslitForXslt.unicodeToEwts("\u0f56\u0f62\u0f4f\u0f42\u0f66\u0f0b"));
    }

    /** Checks Unicode-to-ACIP conversion of the same two inputs. */
    public void testUnicodeToAcip() {
        // This test must go through unicodeToAcip; calling
        // unicodeToEwts here would leave the ACIP entry point
        // untested and could never yield upper-case ACIP output.
        assertEquals("KA", UnicodeToTranslitForXslt.unicodeToAcip("\u0f40"));
        assertEquals("BRTAGS ", UnicodeToTranslitForXslt.unicodeToAcip("\u0f56\u0f62\u0f4f\u0f42\u0f66\u0f0b"));
    }
}