I've added a command-line converter,
org.thdl.tib.input.TMW_RTF_TO_THDL_WYLIE. It converts RTF files consisting of TMW characters to the corresponding THDL Extended Wylie. It supports --find-some-non-tmw mode, which allows you to ensure that no unusual characters will spoil the conversion. The converter has built-in intelligence that allows it to handle Tahoma '{', '}', and '\\' characters properly. The converter works on mixed Roman/TMW also, but --find-some-non-tmw and --find-all-non-tmw modes are not as useful. Invoke org.thdl.tib.input.TMW_RTF_TO_THDL_WYLIE, which resides in Jskad's jar, with no command-line options to see usage information.
This commit is contained in:
parent
17ea8fdf2a
commit
e2a9720d9b
7 changed files with 345 additions and 17 deletions
|
@ -296,6 +296,13 @@
|
|||
<param name="my.included.source.file"
|
||||
value="org/thdl/tib/text/TibetanHTML.java"/>
|
||||
</antcall>
|
||||
<!-- Put TMW_RTF_TO_THDL_WYLIE in Jskad's jar for those who want
|
||||
to use it. -->
|
||||
<antcall target="our-internal-javac-task">
|
||||
<param name="mybin" value="${jskadbin}"/>
|
||||
<param name="my.included.source.file"
|
||||
value="org/thdl/tib/input/TMW_RTF_TO_THDL_WYLIE.java"/>
|
||||
</antcall>
|
||||
<antcall target="copy-ini-files-to-bin-dir-for-jarring">
|
||||
<param name="mybin" value="${jskadbin}"/>
|
||||
</antcall>
|
||||
|
|
|
@ -23,6 +23,10 @@
|
|||
# system-specific.
|
||||
thdl.user.options.directory =
|
||||
|
||||
# Set this to true if you want more messages (probably on the console)
|
||||
# about what's going on.
|
||||
thdl.verbose = false
|
||||
|
||||
# Set this to the full path of Jskad's working directory. When you go
|
||||
# to open a file or to save a file, this is the path you'll see first.
|
||||
thdl.Jskad.working.directory =
|
||||
|
|
|
@ -62,9 +62,6 @@ import org.thdl.util.ThdlLazyException;
|
|||
* @version 1.0
|
||||
*/
|
||||
public class Jskad extends JPanel implements DocumentListener {
|
||||
|
||||
private static final String rtfErrorMessage = "The Rich Text Format (RTF) file selected contains constructs that\nJskad cannot handle. If you got the RTF file from saving a Word\ndocument as RTF, try saving that same document as RTF in\nWord 2000 instead of Word XP or in Word 97 instead of\nWord 2000. Older versions of Word produce RTF that Jskad\ncan more easily deal with. OpenOffice and StarOffice also\nproduce better-behaved RTF.";
|
||||
|
||||
/** the name of the property a developer should set to see
|
||||
low-level info on how keypresses in "Tibetan" input mode are
|
||||
being interpreted */
|
||||
|
@ -339,6 +336,28 @@ public class Jskad extends JPanel implements DocumentListener {
|
|||
toolsMenu.add(DevelItem);
|
||||
}
|
||||
|
||||
if (ThdlOptions.getBooleanOption("thdl.add.developer.options.to.menu")) {
|
||||
toolsMenu.addSeparator();
|
||||
JMenuItem DevelItem = new JMenuItem("Check for non-TMW characters"); // DLC NOW: do it just in the selection
|
||||
DevelItem.addActionListener(new ThdlActionListener() {
|
||||
public void theRealActionPerformed(ActionEvent e) {
|
||||
((TibetanDocument)dp.getDocument()).findSomeNonTMWCharacters(0, -1); // entire document.
|
||||
}
|
||||
});
|
||||
toolsMenu.add(DevelItem);
|
||||
}
|
||||
|
||||
if (ThdlOptions.getBooleanOption("thdl.add.developer.options.to.menu")) {
|
||||
toolsMenu.addSeparator();
|
||||
JMenuItem DevelItem = new JMenuItem("Fix curly braces RTF problem"); // DLC NOW: do it just in the selection
|
||||
DevelItem.addActionListener(new ThdlActionListener() {
|
||||
public void theRealActionPerformed(ActionEvent e) {
|
||||
((TibetanDocument)dp.getDocument()).replaceTahomaCurlyBracesAndBackslashes(0, -1); // entire document
|
||||
}
|
||||
});
|
||||
toolsMenu.add(DevelItem);
|
||||
}
|
||||
|
||||
menuBar.add(toolsMenu);
|
||||
|
||||
JMenu infoMenu = new JMenu("Info");
|
||||
|
@ -668,7 +687,7 @@ public class Jskad extends JPanel implements DocumentListener {
|
|||
newRTF.dp.rtfEd.read(in, newRTF.dp.getDocument(), 0);
|
||||
} catch (Exception e) {
|
||||
JOptionPane.showMessageDialog(newFrame,
|
||||
rtfErrorMessage);
|
||||
TMW_RTF_TO_THDL_WYLIE.rtfErrorMessage);
|
||||
error = true;
|
||||
}
|
||||
in.close();
|
||||
|
@ -693,7 +712,7 @@ public class Jskad extends JPanel implements DocumentListener {
|
|||
dp.rtfEd.read(in, dp.getDocument(), 0);
|
||||
} catch (Exception e) {
|
||||
JOptionPane.showMessageDialog(this,
|
||||
rtfErrorMessage);
|
||||
TMW_RTF_TO_THDL_WYLIE.rtfErrorMessage);
|
||||
error = true;
|
||||
}
|
||||
|
||||
|
|
125
source/org/thdl/tib/input/TMW_RTF_TO_THDL_WYLIE.java
Normal file
125
source/org/thdl/tib/input/TMW_RTF_TO_THDL_WYLIE.java
Normal file
|
@ -0,0 +1,125 @@
|
|||
/*
|
||||
The contents of this file are subject to the THDL Open Community License
|
||||
Version 1.0 (the "License"); you may not use this file except in compliance
|
||||
with the License. You may obtain a copy of the License on the THDL web site
|
||||
(http://www.thdl.org/).
|
||||
|
||||
Software distributed under the License is distributed on an "AS IS" basis,
|
||||
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
|
||||
License for the specific terms governing rights and limitations under the
|
||||
License.
|
||||
|
||||
The Initial Developer of this software is the Tibetan and Himalayan Digital
|
||||
Library (THDL). Portions created by the THDL are Copyright 2003 THDL.
|
||||
All Rights Reserved.
|
||||
|
||||
Contributor(s): ______________________________________.
|
||||
*/
|
||||
|
||||
package org.thdl.tib.input;
|
||||
|
||||
import java.io.*;
|
||||
|
||||
import org.thdl.util.*;
|
||||
import org.thdl.tib.text.*;
|
||||
|
||||
/**
|
||||
* TMW_RTF_TO_THDL_WYLIE is a command-line utility for converting TMW
|
||||
* to Wylie. It is a TibetanMachineWeb-in-RichTextFormat to THDL
|
||||
* Extended Wylie converter, more specifically. Invoke it with no
|
||||
* parameters for usage information.
|
||||
* @author David Chandler */
|
||||
public class TMW_RTF_TO_THDL_WYLIE {
    /** The message printed (after a "TMW_RTF_TO_THDL_WYLIE:" header)
     *  when reading the RTF input throws an exception.  It is
     *  package-visible rather than private so that other classes in
     *  this package can display the same text. */
    static final String rtfErrorMessage = "The Rich Text Format (RTF) file selected contains constructs that\nJskad cannot handle. If you got the RTF file from saving a Word\ndocument as RTF, try saving that same document as RTF in\nWord 2000 instead of Word XP or in Word 97 instead of\nWord 2000. Older versions of Word produce RTF that Jskad\ncan more easily deal with. OpenOffice and StarOffice may also\nproduce better-behaved RTF.";

    // Runs once, when this class is initialized, i.e. before main()
    // creates the DuffPane below.
    static {
        // No need for the TMW fonts.
        System.setProperty("thdl.rely.on.system.tmw.fonts", "true");
    }

    /**
     * Runs the converter.  This method always terminates the JVM via
     * System.exit; the exit codes used below are:
     * <ul>
     *   <li>0 - conversion mode succeeded;</li>
     *   <li>77 - usage or version information was printed;</li>
     *   <li>3 - the RTF reader threw while reading the input file;</li>
     *   <li>1 - a ThdlLazyException was caught (reported as a bug);</li>
     *   <li>4 - an IOException was caught (e.g. while opening or
     *       closing the input file);</li>
     *   <li>in the two find modes, whatever
     *       TibetanDocument.findAllNonTMWCharacters or
     *       TibetanDocument.findSomeNonTMWCharacters returns.</li>
     * </ul>
     * @param args either a single argument (an option such as -h or
     * -v, or else the path of the RTF file) or a find-mode option
     * followed by the path of the RTF file */
    public static void main(String[] args) {
        try {
            boolean findSomeNonTMWMode = false;
            boolean findAllNonTMWMode = false;
            // Process arguments:
            //
            // Usage is printed when (a) there are neither one nor two
            // arguments, (b) the sole argument is -h or --help, or
            // (c) there are two arguments and the first is not one of
            // the two find-mode options.
            //
            // Note that the mode flags are assigned *inside* the
            // condition, and only when args.length == 2.  Because ||
            // short-circuits, findSomeNonTMWMode is only assigned when
            // the first argument is not "--find-all-non-tmw", so at
            // most one of the two flags ends up true.
            if ((args.length != 1 && args.length != 2)
                || (args.length == 1
                    && (args[0].equals("-h")
                        || args[0].equals("--help")))
                || (args.length == 2
                    && !((findAllNonTMWMode
                          = args[0].equals("--find-all-non-tmw"))
                         || (findSomeNonTMWMode
                             = args[0].equals("--find-some-non-tmw"))))) {
                System.out.println("TMW_RTF_TO_THDL_WYLIE [--find-all-non-tmw | --find-some-non-tmw] RTF_file |");
                System.out.println("TMW_RTF_TO_THDL_WYLIE [--version | -v | --help | -h]");
                System.out.println("");
                System.out.println("Distributed under the terms of the THDL Open Community License Version 1.0.");
                System.out.println("");
                System.out.println("Usage:");
                System.out.println(" -v | --version for version info");
                System.out.println(" -h | --help for this message");
                System.out.println(" --find-all-non-tmw to locate all characters in the input document that are");
                System.out.println(" not in Tibetan Machine Web fonts, exit zero iff none found");
                System.out.println(" --find-some-non-tmw to locate all distinct characters in the input document");
                System.out.println(" not in Tibetan Machine Web fonts, exit zero iff none found");
                System.out.println(" Otherwise, needs one argument, the name of the TibetanMachineWeb RTF file.");
                System.out.println(" Writes the Wylie transliteration of that file to standard output after");
                System.out.println(" dealing with the curly brace problem. Exit code is zero on success,");
                System.out.println(" nonzero otherwise.");
                System.out.println("");
                System.out.println(" You may find it helpful to use `--find-some-non-tmw' mode before doing a");
                System.out.println(" conversion so that you have confidence in the conversion's correctness.");
                System.exit(77);
            }
            // args has one or two elements here, so args[0] exists.
            // Version requests exit with the same code (77) as usage.
            if (args[0].equals("--version") || args[0].equals("-v")) {
                System.out.println("TMW_RTF_TO_THDL_WYLIE version 0.8");
                System.exit(77);
            }
            // The input file is always the last argument, whether or
            // not a find-mode option precedes it.
            String tmwRtfPath = args[args.length - 1];

            DuffPane dp = new DuffPane();
            // Read in the rtf file.
            {
                InputStream in = new FileInputStream(tmwRtfPath);
                try {
                    dp.rtfEd.read(in, dp.getDocument(), 0);
                } catch (Exception e) {
                    // Any failure while reading is reported with the
                    // canned message; the exception itself is not shown.
                    System.out.println("TMW_RTF_TO_THDL_WYLIE:\n"
                                       + rtfErrorMessage);
                    System.exit(3);
                }
                in.close();
            }

            if (findAllNonTMWMode) {
                // 0, -1 is the entire document.
                System.exit(((TibetanDocument)dp.getDocument()).findAllNonTMWCharacters(0, -1));
            } else if (findSomeNonTMWMode) {
                // 0, -1 is the entire document.
                System.exit(((TibetanDocument)dp.getDocument()).findSomeNonTMWCharacters(0, -1));
            } else { // conversion mode
                // Fix curly braces in the entire document:
                ((TibetanDocument)dp.getDocument()).replaceTahomaCurlyBracesAndBackslashes(0, -1);

                // Convert to THDL Wylie:
                dp.toWylie(0, dp.getDocument().getLength());

                // Write to standard output the result:
                ((TibetanDocument)dp.getDocument()).writeRTFOutputStream(System.out);

                // Exit normally:
                System.exit(0);
            }
        } catch (ThdlLazyException e) {
            // Report the exception wrapped by the ThdlLazyException on
            // standard output.
            System.out.println("TMW_RTF_TO_THDL_WYLIE has a BUG:");
            e.getRealException().printStackTrace(System.out);
            System.exit(1);
        } catch (IOException e) {
            e.printStackTrace();
            System.exit(4);
        }
    }
}
|
|
@ -17,17 +17,20 @@
|
|||
<body bgcolor="white">
|
||||
Provides classes and methods for inputting Tibetan text.
|
||||
<p>
|
||||
Designed for use with the Tibetan Computer
|
||||
Company's free cross-platform TibetanMachineWeb fonts, this package
|
||||
contains methods for inputting Tibetan using various keyboard
|
||||
input methods, including true Wylie-based input, as well as
|
||||
user-defined keyboards.
|
||||
Designed for use with the Tibetan Computer Company's free
|
||||
cross-platform Tibetan Machine Web fonts, this package contains
|
||||
methods for inputting Tibetan using various keyboard input methods,
|
||||
including true Wylie-based input, as well as user-defined keyboards.
|
||||
<p>
|
||||
The package includes a simple Tibetan text editor, Jskad,
|
||||
which can be run as an local application or embedded in a
|
||||
web page. Jskad supports a wide range of functions, including
|
||||
conversion back and forth between TibetanMachineWeb and
|
||||
Extended Wylie.
|
||||
The package includes a simple Tibetan text editor, Jskad, which can be
|
||||
run as a local application or embedded in a web page. Jskad
|
||||
supports a wide range of functions, including conversion back and
|
||||
forth between Tibetan Machine Web and Extended Wylie.
|
||||
<p>
|
||||
Also included is TMW_RTF_TO_THDL_WYLIE, a command-line utility for
|
||||
converting Rich Text Format (RTF) documents that use the Tibetan
|
||||
Machine Web fonts into Extended Wylie. This utility is aware of
|
||||
quirks in Java Swing's RTF support and works around them.
|
||||
<p>
|
||||
<h2>Related Documentation</h2>
|
||||
@see <a href="../text/package-summary.html">org.thdl.tib.text</a>
|
||||
|
|
|
@ -26,6 +26,45 @@ import java.io.*;
|
|||
|
||||
import org.thdl.util.ThdlDebug;
|
||||
|
||||
/** Represents a character meant to be rendered in a certain font.
|
||||
* @author David Chandler
|
||||
*/
|
||||
class CharacterInAGivenFont {
    /** The character itself. */
    private char character;

    /** The name of the font the character is meant to be rendered
     *  in.  May be null; toString(), equals(), and hashCode() all
     *  tolerate that. */
    private String fontName;

    /** Creates a character/font pair.
     *  @param ch the character
     *  @param font the name of the font, possibly null */
    public CharacterInAGivenFont(char ch, String font) {
        character = ch;
        fontName = font;
    }

    /** Creates a character/font pair from a one-character string.
     *  @param s a string whose length must be exactly one
     *  @param font the name of the font, possibly null
     *  @throws Error if s is not exactly one character long */
    public CharacterInAGivenFont(String s, String font) {
        if (s.length() != 1)
            throw new Error("character in a given font was given a string "
                            + s + " in a given font");
        character = s.charAt(0);
        fontName = font;
    }

    /** Returns true iff x is a CharacterInAGivenFont with the same
     *  character and the same font name.  Two null font names are
     *  considered equal to each other and unequal to any non-null
     *  font name. */
    public boolean equals(Object x) {
        if (!(x instanceof CharacterInAGivenFont))
            return false;
        CharacterInAGivenFont other = (CharacterInAGivenFont)x;
        if (other.character != character)
            return false;
        // Null-safe comparison: toString() allows a null font name,
        // so equality must not throw for one either.
        return (null == fontName)
            ? (null == other.fontName)
            : fontName.equals(other.fontName);
    }

    /** Returns a hash code consistent with equals(Object); a null
     *  font name contributes zero. */
    public int hashCode() {
        return (int)character
            + ((null == fontName) ? 0 : fontName.hashCode());
    }

    /** Returns a human-readable description such as
     *  <code>'a' in the font Tahoma</code>.  Newlines and carriage
     *  returns are spelled out rather than quoted, and a null font
     *  name is rendered as <code>_ERROR_FINDING_FONT_</code>. */
    public String toString() {
        String characterRepresentation;
        if ('\r' == character)
            characterRepresentation = "carriage return";
        else if ('\n' == character)
            characterRepresentation = "newline";
        else
            characterRepresentation = "'" + String.valueOf(character) + "'";
        return characterRepresentation + " in the font "
            + ((null == fontName)
               ? "_ERROR_FINDING_FONT_"
               : fontName);
    }
}
|
||||
|
||||
/**
|
||||
* A TibetanDocument is a styled document that knows about Tibetan and
|
||||
* will respect line breaks and the like. It allows you to insert
|
||||
|
@ -202,4 +241,132 @@ public class TibetanDocument extends DefaultStyledDocument {
|
|||
|
||||
return "";
|
||||
}
|
||||
|
||||
/** Prints to standard output a list of all the indices of
|
||||
characters that are not in a TMW font within the range [begin,
|
||||
end). Using a negative number for end means that this will
|
||||
run to the end of the document. SPEED_FIXME: might be faster
|
||||
to run over the elements, if they are one per font.
|
||||
@return 1 if at least one non-TMW character was found in
|
||||
the specified range, zero if none were, -1 on error. */
|
||||
public int findAllNonTMWCharacters(int begin, int end) {
|
||||
if (end < 0)
|
||||
end = getLength();
|
||||
if (begin >= end)
|
||||
return 0;
|
||||
int i = begin;
|
||||
int returnValue = 0;
|
||||
try {
|
||||
while (i < end) {
|
||||
AttributeSet attr = getCharacterElement(i).getAttributes();
|
||||
String fontName = StyleConstants.getFontFamily(attr);
|
||||
if ((0 == TibetanMachineWeb.getTMWFontNumber(fontName))) {
|
||||
returnValue = 1;
|
||||
CharacterInAGivenFont cgf
|
||||
= new CharacterInAGivenFont(getText(i, 1), fontName);
|
||||
System.out.println("non-TMW character "
|
||||
+ cgf + " at location " + i);
|
||||
}
|
||||
i++;
|
||||
}
|
||||
} catch (BadLocationException ble) {
|
||||
ble.printStackTrace();
|
||||
ThdlDebug.noteIffyCode();
|
||||
returnValue = -1;
|
||||
}
|
||||
return returnValue;
|
||||
}
|
||||
|
||||
/** Finds the first occurrence of a non-TMW character in a given
|
||||
font and prints it to System.out. If you have a Tahoma
|
||||
newline and an Arial newline, the first occurrence of each
|
||||
will be reported.
|
||||
|
||||
<p>Works within the range [begin, end). Using a negative
|
||||
number for end means that this will run to the end of the
|
||||
document. SPEED_FIXME: might be faster to run over the
|
||||
elements, if they are one per font.
|
||||
@return 1 if at least one non-TMW character was found in
|
||||
the specified range, zero if none were, -1 on error. */
|
||||
public int findSomeNonTMWCharacters(int begin, int end) {
|
||||
if (end < 0)
|
||||
end = getLength();
|
||||
if (begin >= end)
|
||||
return 0;
|
||||
int i = begin;
|
||||
int returnValue = 0;
|
||||
try {
|
||||
HashMap cgfTable = new HashMap();
|
||||
while (i < end) {
|
||||
AttributeSet attr = getCharacterElement(i).getAttributes();
|
||||
String fontName = StyleConstants.getFontFamily(attr);
|
||||
if ((0 == TibetanMachineWeb.getTMWFontNumber(fontName))) {
|
||||
returnValue = 1;
|
||||
CharacterInAGivenFont cgf
|
||||
= new CharacterInAGivenFont(getText(i, 1), fontName);
|
||||
if (!cgfTable.containsKey(cgf)) {
|
||||
cgfTable.put(cgf, "yes this character appears once");
|
||||
System.out.println("non-TMW character "
|
||||
+ cgf + " appears first at location " + i);
|
||||
}
|
||||
}
|
||||
i++;
|
||||
}
|
||||
} catch (BadLocationException ble) {
|
||||
ble.printStackTrace();
|
||||
ThdlDebug.noteIffyCode();
|
||||
returnValue = -1;
|
||||
}
|
||||
return returnValue;
|
||||
}
|
||||
|
||||
private static final DuffData[] leftCurlyBraceTMW
|
||||
= new DuffData[] { new DuffData("{", 1) };
|
||||
private static final DuffData[] rightCurlyBraceTMW
|
||||
= new DuffData[] { new DuffData("}", 1) };
|
||||
private static final DuffData[] backslashTMW
|
||||
= new DuffData[] { new DuffData("\\", 2) };
|
||||
/** This is a band-aid used to help Jskad fix RTF files that are
|
||||
mostly TMW but have some Tahoma characters that should be TMW.
|
||||
Replaces '{', '}', and '\\' characters with the correct
|
||||
TibetanMachineWeb. Works within the range [begin, end).
|
||||
Using a negative number for end means that this will run to
|
||||
the end of the document. Be sure to set the size for Tibetan
|
||||
as you like it before using this. SPEED_FIXME: might be
|
||||
faster to run over the elements, if they are one per font. */
|
||||
public void replaceTahomaCurlyBracesAndBackslashes(int begin, int end) {
|
||||
if (end < 0)
|
||||
end = getLength();
|
||||
if (begin >= end)
|
||||
return;
|
||||
int i = begin;
|
||||
try {
|
||||
while (i < end) {
|
||||
AttributeSet attr = getCharacterElement(i).getAttributes();
|
||||
String fontName = StyleConstants.getFontFamily(attr);
|
||||
if (fontName.equals("Tahoma")) {
|
||||
DuffData[] toReplaceWith = null;
|
||||
switch (getText(i, 1).charAt(0)) {
|
||||
case '{':
|
||||
toReplaceWith = leftCurlyBraceTMW;
|
||||
break;
|
||||
case '}':
|
||||
toReplaceWith = rightCurlyBraceTMW;
|
||||
break;
|
||||
case '\\':
|
||||
toReplaceWith = backslashTMW;
|
||||
break;
|
||||
}
|
||||
if (null != toReplaceWith) {
|
||||
insertDuff(i, toReplaceWith);
|
||||
remove(i+1, 1);
|
||||
}
|
||||
}
|
||||
i++;
|
||||
}
|
||||
} catch (BadLocationException ble) {
|
||||
ble.printStackTrace();
|
||||
ThdlDebug.noteIffyCode();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -282,8 +282,10 @@ public class TibetanMachineWeb implements THDLWylieConstants {
|
|||
InputStreamReader isr = new InputStreamReader(url.openStream());
|
||||
BufferedReader in = new BufferedReader(isr);
|
||||
|
||||
System.out.println("Reading Tibetan Machine Web code table "
|
||||
+ fileName);
|
||||
if (ThdlOptions.getBooleanOption("thdl.verbose")) {
|
||||
System.out.println("Reading Tibetan Machine Web code table "
|
||||
+ fileName);
|
||||
}
|
||||
String line;
|
||||
boolean hashOn = false;
|
||||
boolean isSanskrit = false; //FIXME: this is never read.
|
||||
|
@ -419,6 +421,7 @@ public class TibetanMachineWeb implements THDLWylieConstants {
|
|||
}
|
||||
catch (IOException e) {
|
||||
System.out.println("file Disappeared");
|
||||
ThdlDebug.noteIffyCode();
|
||||
}
|
||||
|
||||
hasReadData = true;
|
||||
|
|
Loading…
Reference in a new issue