I've added a command-line converter,

org.thdl.tib.input.TMW_RTF_TO_THDL_WYLIE.  It converts RTF files
consisting of TMW characters to the corresponding THDL Extended Wylie.

It supports --find-some-non-tmw mode, which allows you to ensure that no
unusual characters will spoil the conversion.  The converter has built-in
intelligence that allows it to handle Tahoma '{', '}', and '\\' characters
properly.

The converter works on mixed Roman/TMW also, but --find-some-non-tmw
and --find-all-non-tmw modes are not as useful.

Invoke org.thdl.tib.input.TMW_RTF_TO_THDL_WYLIE, which resides in
Jskad's jar, with no command-line options to see usage information.
This commit is contained in:
dchandler 2003-05-18 14:14:47 +00:00
parent 17ea8fdf2a
commit e2a9720d9b
7 changed files with 345 additions and 17 deletions

View file

@ -26,6 +26,45 @@ import java.io.*;
import org.thdl.util.ThdlDebug;
/** Represents a character meant to be rendered in a certain font.
* @author David Chandler
*/
class CharacterInAGivenFont {
private char character;
private String fontName;
public CharacterInAGivenFont(char ch, String font) {
character = ch;
fontName = font;
}
public CharacterInAGivenFont(String s, String font) {
if (s.length() != 1)
throw new Error("character in a given font was given a string "
+ s + " in a given font");
character = s.charAt(0);
fontName = font;
}
public boolean equals(Object x) {
return ((x instanceof CharacterInAGivenFont)
&& ((CharacterInAGivenFont)x).character == character
&& ((CharacterInAGivenFont)x).fontName.equals(fontName));
}
public int hashCode() {
return (int)character + fontName.hashCode();
}
public String toString() {
String characterRepresentation
= "'" + new Character(character).toString() + "'";
if ('\n' == character)
characterRepresentation = "newline";
if ('\r' == character)
characterRepresentation = "carriage return";
return characterRepresentation + " in the font "
+ ((null == fontName)
? "_ERROR_FINDING_FONT_"
: fontName);
}
}
/**
* A TibetanDocument is a styled document that knows about Tibetan and
* will respect line breaks and the like. It allows you to insert
@ -202,4 +241,132 @@ public class TibetanDocument extends DefaultStyledDocument {
return "";
}
/** Prints to standard output a list of all the indices of
characters that are not in a TMW font within the range [start,
end). Using a negative number for end means that this will
run to the end of the document. SPEED_FIXME: might be faster
to run over the elements, if they are one per font.
@return 1 if at least one non-TMW character was found in
the specified range, zero if none were, -1 on error. */
public int findAllNonTMWCharacters(int begin, int end) {
if (end < 0)
end = getLength();
if (begin >= end)
return 0;
int i = begin;
int returnValue = 0;
try {
while (i < end) {
AttributeSet attr = getCharacterElement(i).getAttributes();
String fontName = StyleConstants.getFontFamily(attr);
if ((0 == TibetanMachineWeb.getTMWFontNumber(fontName))) {
returnValue = 1;
CharacterInAGivenFont cgf
= new CharacterInAGivenFont(getText(i, 1), fontName);
System.out.println("non-TMW character "
+ cgf + " at location " + i);
}
i++;
}
} catch (BadLocationException ble) {
ble.printStackTrace();
ThdlDebug.noteIffyCode();
returnValue = -1;
}
return returnValue;
}
/** Finds the first occurrence of a non-TMW character in a given
font and prints it to System.out. If you have a Tahoma
newline and an Arial newline, the first occurrence of each
will be reported.
<p>Works within the range [start, end). Using a negative
number for end means that this will run to the end of the
document. SPEED_FIXME: might be faster to run over the
elements, if they are one per font.
@return 1 if at least one non-TMW character was found in
the specified range, zero if none were, -1 on error. */
public int findSomeNonTMWCharacters(int begin, int end) {
if (end < 0)
end = getLength();
if (begin >= end)
return 0;
int i = begin;
int returnValue = 0;
try {
HashMap cgfTable = new HashMap();
while (i < end) {
AttributeSet attr = getCharacterElement(i).getAttributes();
String fontName = StyleConstants.getFontFamily(attr);
if ((0 == TibetanMachineWeb.getTMWFontNumber(fontName))) {
returnValue = 1;
CharacterInAGivenFont cgf
= new CharacterInAGivenFont(getText(i, 1), fontName);
if (!cgfTable.containsKey(cgf)) {
cgfTable.put(cgf, "yes this character appears once");
System.out.println("non-TMW character "
+ cgf + " appears first at location " + i);
}
}
i++;
}
} catch (BadLocationException ble) {
ble.printStackTrace();
ThdlDebug.noteIffyCode();
returnValue = -1;
}
return returnValue;
}
private static final DuffData[] leftCurlyBraceTMW
= new DuffData[] { new DuffData("{", 1) };
private static final DuffData[] rightCurlyBraceTMW
= new DuffData[] { new DuffData("}", 1) };
private static final DuffData[] backslashTMW
= new DuffData[] { new DuffData("\\", 2) };
/** This is a band-aid used to help Jskad fix RTF files that are
mostly TMW but have some Tahoma characters that should be TMW.
Replaces '{', '}', and '\\' characters with the correct
TibetanMachineWeb. Works within the range [start, end).
Using a negative number for end means that this will run to
the end of the document. Be sure to set the size for Tibetan
as you like it before using this. SPEED_FIXME: might be
faster to run over the elements, if they are one per font. */
public void replaceTahomaCurlyBracesAndBackslashes(int begin, int end) {
if (end < 0)
end = getLength();
if (begin >= end)
return;
int i = begin;
try {
while (i < end) {
AttributeSet attr = getCharacterElement(i).getAttributes();
String fontName = StyleConstants.getFontFamily(attr);
if (fontName.equals("Tahoma")) {
DuffData[] toReplaceWith = null;
switch (getText(i, 1).charAt(0)) {
case '{':
toReplaceWith = leftCurlyBraceTMW;
break;
case '}':
toReplaceWith = rightCurlyBraceTMW;
break;
case '\\':
toReplaceWith = backslashTMW;
break;
}
if (null != toReplaceWith) {
insertDuff(i, toReplaceWith);
remove(i+1, 1);
}
}
i++;
}
} catch (BadLocationException ble) {
ble.printStackTrace();
ThdlDebug.noteIffyCode();
}
}
}