I've added a command-line converter,

org.thdl.tib.input.TMW_RTF_TO_THDL_WYLIE.  It converts RTF files
consisting of TMW characters to the corresponding THDL Extended Wylie.

It supports --find-some-non-tmw mode, which allows you to ensure that no
unusual characters will spoil the conversion.  The converter has built-in
intelligence that allows it to handle Tahoma '{', '}', and '\\' characters
properly.

The converter also works on mixed Roman/TMW documents, but the
--find-some-non-tmw and --find-all-non-tmw modes are not as useful there.

Invoke org.thdl.tib.input.TMW_RTF_TO_THDL_WYLIE, which resides in
Jskad's jar, with no command-line options to see usage information.
This commit is contained in:
dchandler 2003-05-18 14:14:47 +00:00
parent 17ea8fdf2a
commit e2a9720d9b
7 changed files with 345 additions and 17 deletions

View File

@ -296,6 +296,13 @@
<param name="my.included.source.file"
value="org/thdl/tib/text/TibetanHTML.java"/>
</antcall>
<!-- Put TMW_RTF_TO_THDL_WYLIE in Jskad's jar for those who want
to use it. -->
<antcall target="our-internal-javac-task">
<param name="mybin" value="${jskadbin}"/>
<param name="my.included.source.file"
value="org/thdl/tib/input/TMW_RTF_TO_THDL_WYLIE.java"/>
</antcall>
<antcall target="copy-ini-files-to-bin-dir-for-jarring">
<param name="mybin" value="${jskadbin}"/>
</antcall>

View File

@ -23,6 +23,10 @@
# system-specific.
thdl.user.options.directory =
# Set this to true if you want more messages (probably on the console)
# about what's going on.
thdl.verbose = false
# Set this to the full path of Jskad's working directory. When you go
# to open a file or to save a file, this is the path you'll see first.
thdl.Jskad.working.directory =

View File

@ -62,9 +62,6 @@ import org.thdl.util.ThdlLazyException;
* @version 1.0
*/
public class Jskad extends JPanel implements DocumentListener {
private static final String rtfErrorMessage = "The Rich Text Format (RTF) file selected contains constructs that\nJskad cannot handle. If you got the RTF file from saving a Word\ndocument as RTF, try saving that same document as RTF in\nWord 2000 instead of Word XP or in Word 97 instead of\nWord 2000. Older versions of Word produce RTF that Jskad\ncan more easily deal with. OpenOffice and StarOffice also\nproduce better-behaved RTF.";
/** the name of the property a developer should set to see
low-level info on how keypresses in "Tibetan" input mode are
being interpreted */
@ -339,6 +336,28 @@ public class Jskad extends JPanel implements DocumentListener {
toolsMenu.add(DevelItem);
}
if (ThdlOptions.getBooleanOption("thdl.add.developer.options.to.menu")) {
toolsMenu.addSeparator();
JMenuItem DevelItem = new JMenuItem("Check for non-TMW characters"); // DLC NOW: do it just in the selection
DevelItem.addActionListener(new ThdlActionListener() {
public void theRealActionPerformed(ActionEvent e) {
((TibetanDocument)dp.getDocument()).findSomeNonTMWCharacters(0, -1); // entire document.
}
});
toolsMenu.add(DevelItem);
}
if (ThdlOptions.getBooleanOption("thdl.add.developer.options.to.menu")) {
toolsMenu.addSeparator();
JMenuItem DevelItem = new JMenuItem("Fix curly braces RTF problem"); // DLC NOW: do it just in the selection
DevelItem.addActionListener(new ThdlActionListener() {
public void theRealActionPerformed(ActionEvent e) {
((TibetanDocument)dp.getDocument()).replaceTahomaCurlyBracesAndBackslashes(0, -1); // entire document
}
});
toolsMenu.add(DevelItem);
}
menuBar.add(toolsMenu);
JMenu infoMenu = new JMenu("Info");
@ -668,7 +687,7 @@ public class Jskad extends JPanel implements DocumentListener {
newRTF.dp.rtfEd.read(in, newRTF.dp.getDocument(), 0);
} catch (Exception e) {
JOptionPane.showMessageDialog(newFrame,
rtfErrorMessage);
TMW_RTF_TO_THDL_WYLIE.rtfErrorMessage);
error = true;
}
in.close();
@ -693,7 +712,7 @@ public class Jskad extends JPanel implements DocumentListener {
dp.rtfEd.read(in, dp.getDocument(), 0);
} catch (Exception e) {
JOptionPane.showMessageDialog(this,
rtfErrorMessage);
TMW_RTF_TO_THDL_WYLIE.rtfErrorMessage);
error = true;
}

View File

@ -0,0 +1,125 @@
/*
The contents of this file are subject to the THDL Open Community License
Version 1.0 (the "License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License on the THDL web site
(http://www.thdl.org/).
Software distributed under the License is distributed on an "AS IS" basis,
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
License for the specific terms governing rights and limitations under the
License.
The Initial Developer of this software is the Tibetan and Himalayan Digital
Library (THDL). Portions created by the THDL are Copyright 2003 THDL.
All Rights Reserved.
Contributor(s): ______________________________________.
*/
package org.thdl.tib.input;
import java.io.*;
import org.thdl.util.*;
import org.thdl.tib.text.*;
/**
 * Command-line converter from TibetanMachineWeb (TMW) text stored in a
 * Rich Text Format (RTF) file to THDL Extended Wylie.  Run it without
 * arguments to see the usage message.
 *
 * <p>Exit codes used by {@link #main}: 77 after printing the usage or
 * version text, 3 when the RTF file cannot be read into the document,
 * 4 on an I/O error, 1 when a ThdlLazyException surfaces, 0 after a
 * successful conversion.  In the two "find" modes the exit code is the
 * value returned by the corresponding TibetanDocument method.
 * @author David Chandler */
public class TMW_RTF_TO_THDL_WYLIE {
    /** Message shown (here and by Jskad) when an RTF file cannot be
        read. */
    static final String rtfErrorMessage
        = "The Rich Text Format (RTF) file selected contains constructs that\n"
        + "Jskad cannot handle. If you got the RTF file from saving a Word\n"
        + "document as RTF, try saving that same document as RTF in\n"
        + "Word 2000 instead of Word XP or in Word 97 instead of\n"
        + "Word 2000. Older versions of Word produce RTF that Jskad\n"
        + "can more easily deal with. OpenOffice and StarOffice may also\n"
        + "produce better-behaved RTF.";

    static {
        // This tool does not need the TMW fonts themselves, so ask
        // for the system's fonts to be relied upon.
        System.setProperty("thdl.rely.on.system.tmw.fonts", "true");
    }

    /** Prints the usage text to standard output. */
    private static void printUsage() {
        String[] usageLines = {
            "TMW_RTF_TO_THDL_WYLIE [--find-all-non-tmw | --find-some-non-tmw] RTF_file |",
            "TMW_RTF_TO_THDL_WYLIE [--version | -v | --help | -h]",
            "",
            "Distributed under the terms of the THDL Open Community License Version 1.0.",
            "",
            "Usage:",
            " -v | --version for version info",
            " -h | --help for this message",
            " --find-all-non-tmw to locate all characters in the input document that are",
            " not in Tibetan Machine Web fonts, exit zero iff none found",
            " --find-some-non-tmw to locate all distinct characters in the input document",
            " not in Tibetan Machine Web fonts, exit zero iff none found",
            " Otherwise, needs one argument, the name of the TibetanMachineWeb RTF file.",
            " Writes the Wylie transliteration of that file to standard output after",
            " dealing with the curly brace problem. Exit code is zero on success,",
            " nonzero otherwise.",
            "",
            " You may find it helpful to use `--find-some-non-tmw' mode before doing a",
            " conversion so that you have confidence in the conversion's correctness."
        };
        for (int k = 0; k < usageLines.length; k++) {
            System.out.println(usageLines[k]);
        }
    }

    /**
     * Entry point of the converter.
     * @param args either a single RTF file name, a single
     * help/version flag, or one of the two "find" flags followed by
     * an RTF file name */
    public static void main(String[] args) {
        try {
            boolean findSomeNonTMWMode = false;
            boolean findAllNonTMWMode = false;

            // Work out whether the arguments are unusable (or help
            // was requested), recording the chosen mode on the way.
            boolean showUsage;
            if (args.length == 1) {
                showUsage = args[0].equals("-h") || args[0].equals("--help");
            } else if (args.length == 2) {
                findAllNonTMWMode = args[0].equals("--find-all-non-tmw");
                if (!findAllNonTMWMode) {
                    findSomeNonTMWMode
                        = args[0].equals("--find-some-non-tmw");
                }
                showUsage = !(findAllNonTMWMode || findSomeNonTMWMode);
            } else {
                showUsage = true;
            }
            if (showUsage) {
                printUsage();
                System.exit(77);
            }

            if (args[0].equals("--version") || args[0].equals("-v")) {
                System.out.println("TMW_RTF_TO_THDL_WYLIE version 0.8");
                System.exit(77);
            }

            // The file name is always the last argument.
            String tmwRtfPath = args[args.length - 1];
            DuffPane dp = new DuffPane();

            // Load the RTF file into the pane's document.
            InputStream in = new FileInputStream(tmwRtfPath);
            try {
                dp.rtfEd.read(in, dp.getDocument(), 0);
            } catch (Exception e) {
                System.out.println("TMW_RTF_TO_THDL_WYLIE:\n"
                                   + rtfErrorMessage);
                System.exit(3);
            }
            in.close();

            TibetanDocument doc = (TibetanDocument)dp.getDocument();
            final int exitCode;
            if (findAllNonTMWMode) {
                // (0, -1) covers the entire document.
                exitCode = doc.findAllNonTMWCharacters(0, -1);
            } else if (findSomeNonTMWMode) {
                // (0, -1) covers the entire document.
                exitCode = doc.findSomeNonTMWCharacters(0, -1);
            } else {
                // Conversion mode: repair the curly braces and
                // backslashes everywhere, convert to THDL Wylie,
                // then emit the result on standard output.
                doc.replaceTahomaCurlyBracesAndBackslashes(0, -1);
                dp.toWylie(0, dp.getDocument().getLength());
                ((TibetanDocument)dp.getDocument()).writeRTFOutputStream(System.out);
                exitCode = 0;
            }
            System.exit(exitCode);
        } catch (ThdlLazyException e) {
            System.out.println("TMW_RTF_TO_THDL_WYLIE has a BUG:");
            e.getRealException().printStackTrace(System.out);
            System.exit(1);
        } catch (IOException e) {
            e.printStackTrace();
            System.exit(4);
        }
    }
}

View File

@ -17,17 +17,20 @@
<body bgcolor="white">
Provides classes and methods for inputting Tibetan text.
<p>
Designed for use with the Tibetan Computer
Company's free cross-platform TibetanMachineWeb fonts, this package
contains methods for inputting Tibetan using various keyboard
input methods, including true Wylie-based input, as well as
user-defined keyboards.
Designed for use with the Tibetan Computer Company's free
cross-platform Tibetan Machine Web fonts, this package contains
methods for inputting Tibetan using various keyboard input methods,
including true Wylie-based input, as well as user-defined keyboards.
<p>
The package includes a simple Tibetan text editor, Jskad,
which can be run as an local application or embedded in a
web page. Jskad supports a wide range of functions, including
conversion back and forth between TibetanMachineWeb and
Extended Wylie.
The package includes a simple Tibetan text editor, Jskad, which can be
run as a local application or embedded in a web page.&nbsp; Jskad
supports a wide range of functions, including conversion back and
forth between Tibetan Machine Web and Extended Wylie.
<p>
Also included is TMW_RTF_TO_THDL_WYLIE, a command-line utility for
converting Rich Text Format (RTF) documents that use the Tibetan
Machine Web fonts into Extended Wylie.&nbsp; This utility is aware of
quirks in Java Swing's RTF support and works around them.
<p>
<h2>Related Documentation</h2>
@see <a href="../text/package-summary.html">org.thdl.tib.text</a>

View File

@ -26,6 +26,45 @@ import java.io.*;
import org.thdl.util.ThdlDebug;
/** Represents a character meant to be rendered in a certain font.
* @author David Chandler
*/
class CharacterInAGivenFont {
private char character;
private String fontName;
public CharacterInAGivenFont(char ch, String font) {
character = ch;
fontName = font;
}
public CharacterInAGivenFont(String s, String font) {
if (s.length() != 1)
throw new Error("character in a given font was given a string "
+ s + " in a given font");
character = s.charAt(0);
fontName = font;
}
public boolean equals(Object x) {
return ((x instanceof CharacterInAGivenFont)
&& ((CharacterInAGivenFont)x).character == character
&& ((CharacterInAGivenFont)x).fontName.equals(fontName));
}
public int hashCode() {
return (int)character + fontName.hashCode();
}
public String toString() {
String characterRepresentation
= "'" + new Character(character).toString() + "'";
if ('\n' == character)
characterRepresentation = "newline";
if ('\r' == character)
characterRepresentation = "carriage return";
return characterRepresentation + " in the font "
+ ((null == fontName)
? "_ERROR_FINDING_FONT_"
: fontName);
}
}
/**
* A TibetanDocument is a styled document that knows about Tibetan and
* will respect line breaks and the like. It allows you to insert
@ -202,4 +241,132 @@ public class TibetanDocument extends DefaultStyledDocument {
return "";
}
/** Reports on standard output every character in the range
    [begin, end) whose font family is one for which
    TibetanMachineWeb.getTMWFontNumber yields zero, i.e. every
    character that is not in a TMW font.  One line is printed per
    offending character, giving the character, its font, and its
    index.  SPEED_FIXME: might be faster to run over the elements,
    if they are one per font.
    @param begin index of the first character to examine
    @param end index just past the last character to examine; a
    negative value means "through the end of the document"
    @return 1 if at least one non-TMW character was found in the
    specified range, zero if none were, -1 if a
    BadLocationException interrupted the scan */
public int findAllNonTMWCharacters(int begin, int end) {
    int limit = (end < 0) ? getLength() : end;
    if (begin >= limit)
        return 0;

    int status = 0;
    try {
        for (int pos = begin; pos < limit; pos++) {
            String fontName
                = StyleConstants.getFontFamily(getCharacterElement(pos).getAttributes());
            if (0 != TibetanMachineWeb.getTMWFontNumber(fontName))
                continue; // a TMW font; nothing to report

            status = 1;
            CharacterInAGivenFont offender
                = new CharacterInAGivenFont(getText(pos, 1), fontName);
            System.out.println("non-TMW character "
                               + offender + " at location " + pos);
        }
    } catch (BadLocationException ble) {
        ble.printStackTrace();
        ThdlDebug.noteIffyCode();
        status = -1;
    }
    return status;
}
/** Reports on standard output the first occurrence of each distinct
    (character, font) pair that is not in a TMW font.  If the
    document holds both a Tahoma newline and an Arial newline, the
    first occurrence of each is reported; later repeats of an
    already-reported pair are silent.

    <p>Works within the range [begin, end).  SPEED_FIXME: might be
    faster to run over the elements, if they are one per font.
    @param begin index of the first character to examine
    @param end index just past the last character to examine; a
    negative value means "through the end of the document"
    @return 1 if at least one non-TMW character was found in the
    specified range, zero if none were, -1 if a
    BadLocationException interrupted the scan */
public int findSomeNonTMWCharacters(int begin, int end) {
    int limit = (end < 0) ? getLength() : end;
    if (begin >= limit)
        return 0;

    int status = 0;
    try {
        // Keys are the (character, font) pairs reported so far; the
        // values are never inspected.
        HashMap alreadyReported = new HashMap();
        for (int pos = begin; pos < limit; pos++) {
            String fontName
                = StyleConstants.getFontFamily(getCharacterElement(pos).getAttributes());
            if (0 != TibetanMachineWeb.getTMWFontNumber(fontName))
                continue; // a TMW font; nothing to report

            status = 1;
            CharacterInAGivenFont offender
                = new CharacterInAGivenFont(getText(pos, 1), fontName);
            if (alreadyReported.containsKey(offender))
                continue; // this pair has been announced before

            alreadyReported.put(offender, "yes this character appears once");
            System.out.println("non-TMW character "
                               + offender + " appears first at location " + pos);
        }
    } catch (BadLocationException ble) {
        ble.printStackTrace();
        ThdlDebug.noteIffyCode();
        status = -1;
    }
    return status;
}
/* Replacement data used by
   replaceTahomaCurlyBracesAndBackslashes(int, int) for the three
   characters it repairs.  NOTE(review): the second DuffData
   argument is presumably the number of the TMW font to use --
   confirm against DuffData. */

/** What a Tahoma '{' is replaced with. */
private static final DuffData[] leftCurlyBraceTMW
    = { new DuffData("{", 1) };

/** What a Tahoma '}' is replaced with. */
private static final DuffData[] rightCurlyBraceTMW
    = { new DuffData("}", 1) };

/** What a Tahoma '\\' is replaced with. */
private static final DuffData[] backslashTMW
    = { new DuffData("\\", 2) };
/** A band-aid that helps Jskad repair RTF files that are mostly TMW
    but contain some Tahoma characters that ought to be TMW.  Every
    '{', '}', and '\\' character whose font family is Tahoma is
    replaced with the corresponding TibetanMachineWeb data; all
    other characters are left alone.  Works within the range
    [begin, end).  Be sure to set the size for Tibetan as you like
    it before using this.  A BadLocationException stops the scan;
    it is logged, not rethrown.  SPEED_FIXME: might be faster to
    run over the elements, if they are one per font.
    @param begin index of the first character to examine
    @param end index just past the last character to examine; a
    negative value means "through the end of the document" */
public void replaceTahomaCurlyBracesAndBackslashes(int begin, int end) {
    int limit = (end < 0) ? getLength() : end;
    if (begin >= limit)
        return;

    try {
        for (int pos = begin; pos < limit; pos++) {
            String fontName
                = StyleConstants.getFontFamily(getCharacterElement(pos).getAttributes());
            if (!fontName.equals("Tahoma"))
                continue;

            char ch = getText(pos, 1).charAt(0);
            DuffData[] replacement;
            if ('{' == ch)
                replacement = leftCurlyBraceTMW;
            else if ('}' == ch)
                replacement = rightCurlyBraceTMW;
            else if ('\\' == ch)
                replacement = backslashTMW;
            else
                continue; // some other Tahoma character; keep it

            // Put the replacement in front of the Tahoma character
            // and then delete the character after it.
            // NOTE(review): this presumes insertDuff adds exactly
            // one character at pos, so that the Tahoma original
            // ends up at pos + 1 -- confirm against insertDuff.
            insertDuff(pos, replacement);
            remove(pos + 1, 1);
        }
    } catch (BadLocationException ble) {
        ble.printStackTrace();
        ThdlDebug.noteIffyCode();
    }
}
}

View File

@ -282,8 +282,10 @@ public class TibetanMachineWeb implements THDLWylieConstants {
InputStreamReader isr = new InputStreamReader(url.openStream());
BufferedReader in = new BufferedReader(isr);
System.out.println("Reading Tibetan Machine Web code table "
+ fileName);
if (ThdlOptions.getBooleanOption("thdl.verbose")) {
System.out.println("Reading Tibetan Machine Web code table "
+ fileName);
}
String line;
boolean hashOn = false;
boolean isSanskrit = false; //FIXME: this is never read.
@ -419,6 +421,7 @@ public class TibetanMachineWeb implements THDLWylieConstants {
}
catch (IOException e) {
System.out.println("file Disappeared");
ThdlDebug.noteIffyCode();
}
hasReadData = true;