I've added a command-line converter,

org.thdl.tib.input.TMW_RTF_TO_THDL_WYLIE. It converts RTF files consisting of TMW characters to the corresponding THDL Extended Wylie. It supports a --find-some-non-tmw mode, which lets you ensure that no unusual characters will spoil the conversion. The converter has built-in intelligence that allows it to handle Tahoma '{', '}', and '\\' characters properly. The converter also works on mixed Roman/TMW documents, but the --find-some-non-tmw and --find-all-non-tmw modes are less useful for such documents. Invoke org.thdl.tib.input.TMW_RTF_TO_THDL_WYLIE, which resides in Jskad's jar, with no command-line options to see usage information.
2003-05-18 14:14:47 +00:00 · 2003-05-18 14:14:47 +00:00 · e2a9720d9b
commit e2a9720d9b
parent 17ea8fdf2a
7 changed files with 345 additions and 17 deletions
--- a/source/org/thdl/tib/text/TibetanDocument.java
+++ b/source/org/thdl/tib/text/TibetanDocument.java
@ -26,6 +26,45 @@ import java.io.*;

 import org.thdl.util.ThdlDebug;

+/** Represents a character meant to be rendered in a certain font.
+ *  Instances are immutable value objects: two instances are equal
+ *  when both the character and the font name match, which makes
+ *  them usable as hash-table keys.  The font name may be null (see
+ *  {@link #toString()}); equals and hashCode tolerate that.
+ *  @author David Chandler
+ */
+class CharacterInAGivenFont {
+    private final char character;
+    private final String fontName;
+
+    /** Creates a pairing of the character ch with the font named
+     *  font.
+     *  @param ch the character
+     *  @param font the name of the font, possibly null */
+    public CharacterInAGivenFont(char ch, String font) {
+        character = ch;
+        fontName = font;
+    }
+
+    /** Creates a pairing of the sole character of s with the font
+     *  named font.
+     *  @param s a string that must contain exactly one character
+     *  @param font the name of the font, possibly null
+     *  @throws Error if s does not have length one */
+    public CharacterInAGivenFont(String s, String font) {
+        if (s.length() != 1)
+            throw new Error("character in a given font was given a string "
+                            + s + " in a given font");
+        character = s.charAt(0);
+        fontName = font;
+    }
+
+    /** Returns true if and only if x is a CharacterInAGivenFont with
+     *  the same character and the same font name.  Two null font
+     *  names are considered the same. */
+    public boolean equals(Object x) {
+        if (this == x)
+            return true;
+        if (!(x instanceof CharacterInAGivenFont))
+            return false;
+        CharacterInAGivenFont other = (CharacterInAGivenFont)x;
+        if (other.character != character)
+            return false;
+        // Null-safe comparison: toString() already treats a null
+        // font name as a legal (if erroneous) state, so equals()
+        // must not throw a NullPointerException for it.
+        return (null == fontName)
+            ? (null == other.fontName)
+            : fontName.equals(other.fontName);
+    }
+
+    /** Returns a hash code consistent with equals; a null font name
+     *  contributes zero. */
+    public int hashCode() {
+        return (int)character
+            + ((null == fontName) ? 0 : fontName.hashCode());
+    }
+
+    /** Returns a human-readable description.  Newlines and carriage
+     *  returns are spelled out rather than printed literally, and a
+     *  null font name is shown as _ERROR_FINDING_FONT_. */
+    public String toString() {
+        String characterRepresentation;
+        if ('\r' == character)
+            characterRepresentation = "carriage return";
+        else if ('\n' == character)
+            characterRepresentation = "newline";
+        else
+            characterRepresentation = "'" + character + "'";
+        return characterRepresentation + " in the font "
+            + ((null == fontName)
+               ? "_ERROR_FINDING_FONT_"
+               : fontName);
+    }
+}
+
 /**
 * A TibetanDocument is a styled document that knows about Tibetan and
 * will respect line breaks and the like.  It allows you to insert
@ -202,4 +241,132 @@ public class TibetanDocument extends DefaultStyledDocument {

 		return "";
 	}
+
+    /** Reports on standard output the index of every character in
+        the range [begin, end) whose font is not a TMW font.  A
+        negative end means "run to the end of the document".
+        SPEED_FIXME: might be faster to run over the elements, if
+        they are one per font.
+        @param begin index of the first character to examine
+        @param end index one past the last character to examine, or
+        a negative number to examine through the end of the document
+        @return 1 if at least one non-TMW character was found in
+        the specified range, zero if none were, -1 on error. */
+    public int findAllNonTMWCharacters(int begin, int end) {
+        int limit = (end < 0) ? getLength() : end;
+        int status = 0;
+        try {
+            // An empty or inverted range simply skips the loop.
+            for (int pos = begin; pos < limit; pos++) {
+                AttributeSet attrs = getCharacterElement(pos).getAttributes();
+                String fontName = StyleConstants.getFontFamily(attrs);
+                if (0 != TibetanMachineWeb.getTMWFontNumber(fontName))
+                    continue; // a TMW font; nothing to report
+                status = 1;
+                System.out.println("non-TMW character "
+                                   + new CharacterInAGivenFont(getText(pos, 1),
+                                                               fontName)
+                                   + " at location " + pos);
+            }
+        } catch (BadLocationException ble) {
+            ble.printStackTrace();
+            ThdlDebug.noteIffyCode();
+            status = -1;
+        }
+        return status;
+    }
+
+    /** Reports on System.out the first occurrence of each distinct
+        non-TMW (character, font) pair.  If you have a Tahoma newline
+        and an Arial newline, the first occurrence of each will be
+        reported.
+
+        <p>Works within the range [begin, end).  A negative end means
+        "run to the end of the document".  SPEED_FIXME: might be
+        faster to run over the elements, if they are one per font.
+        @param begin index of the first character to examine
+        @param end index one past the last character to examine, or
+        a negative number to examine through the end of the document
+        @return 1 if at least one non-TMW character was found in
+        the specified range, zero if none were, -1 on error. */
+    public int findSomeNonTMWCharacters(int begin, int end) {
+        int limit = (end < 0) ? getLength() : end;
+        int status = 0;
+        // Keys are the (character, font) pairs already reported; the
+        // values are never read.
+        HashMap alreadyReported = new HashMap();
+        try {
+            for (int pos = begin; pos < limit; pos++) {
+                AttributeSet attrs = getCharacterElement(pos).getAttributes();
+                String fontName = StyleConstants.getFontFamily(attrs);
+                if (0 != TibetanMachineWeb.getTMWFontNumber(fontName))
+                    continue; // a TMW font; nothing to report
+                status = 1;
+                CharacterInAGivenFont pair
+                    = new CharacterInAGivenFont(getText(pos, 1), fontName);
+                // put() returns null only when the key was absent,
+                // because the stored value is never null.
+                Object previous
+                    = alreadyReported.put(pair, "yes this character appears once");
+                if (null == previous) {
+                    System.out.println("non-TMW character "
+                                       + pair + " appears first at location " + pos);
+                }
+            }
+        } catch (BadLocationException ble) {
+            ble.printStackTrace();
+            ThdlDebug.noteIffyCode();
+            status = -1;
+        }
+        return status;
+    }
+
+    // Replacement data handed to insertDuff() for each of the three
+    // characters handled below.  NOTE(review): the second DuffData
+    // argument is presumably a TMW font number -- confirm against
+    // DuffData.
+    private static final DuffData[] leftCurlyBraceTMW
+        = new DuffData[] { new DuffData("{", 1) };
+    private static final DuffData[] rightCurlyBraceTMW
+        = new DuffData[] { new DuffData("}", 1) };
+    private static final DuffData[] backslashTMW
+        = new DuffData[] { new DuffData("\\", 2) };
+
+    /** Band-aid that helps Jskad repair RTF files that are mostly
+        TMW but contain some Tahoma characters that should be TMW.
+        Every '{', '}', and '\\' character whose font family is
+        Tahoma is replaced with the corresponding TibetanMachineWeb
+        data.  Works within the range [begin, end); a negative end
+        means "run to the end of the document".  Be sure to set the
+        size for Tibetan as you like it before using this.
+        SPEED_FIXME: might be faster to run over the elements, if
+        they are one per font.
+        @param begin index of the first character to examine
+        @param end index one past the last character to examine, or
+        a negative number to examine through the end of the document */
+    public void replaceTahomaCurlyBracesAndBackslashes(int begin, int end) {
+        int limit = (end < 0) ? getLength() : end;
+        try {
+            // An empty or inverted range simply skips the loop.
+            for (int pos = begin; pos < limit; pos++) {
+                AttributeSet attrs = getCharacterElement(pos).getAttributes();
+                String fontName = StyleConstants.getFontFamily(attrs);
+                if (!fontName.equals("Tahoma"))
+                    continue;
+                char ch = getText(pos, 1).charAt(0);
+                DuffData[] replacement;
+                if ('{' == ch)
+                    replacement = leftCurlyBraceTMW;
+                else if ('}' == ch)
+                    replacement = rightCurlyBraceTMW;
+                else if ('\\' == ch)
+                    replacement = backslashTMW;
+                else
+                    continue; // some other Tahoma character; leave it
+                // Insert the replacement at pos, then remove the one
+                // character at pos + 1 (presumably the original
+                // Tahoma character, shifted right by the insertion).
+                insertDuff(pos, replacement);
+                remove(pos + 1, 1);
+            }
+        } catch (BadLocationException ble) {
+            ble.printStackTrace();
+            ThdlDebug.noteIffyCode();
+        }
+    }
 }