Added --find-some-non-tm and --find-all-non-tm modes to the converter to
help ensure worry-free TM->TMW conversions.
2003-06-22 00:14:18 +00:00 · 2003-06-22 00:14:18 +00:00 · dfe64a1927
commit dfe64a1927
parent 80101666c7
2 changed files with 95 additions and 40 deletions
--- a/source/org/thdl/tib/input/TibetanConverter.java
+++ b/source/org/thdl/tib/input/TibetanConverter.java
@ -57,6 +57,8 @@ public class TibetanConverter {
            boolean convertToWylieMode = false;
            boolean findSomeNonTMWMode = false;
            boolean findAllNonTMWMode = false;
+            boolean findSomeNonTMMode = false;
+            boolean findAllNonTMMode = false;
            // Process arguments:
            if ((args.length != 1 && args.length != 2)
                || (args.length == 1
@ -74,7 +76,12 @@ public class TibetanConverter {
                         || (convertToWylieMode
                             = args[0].equals("--to-wylie"))
                         || (findSomeNonTMWMode
-                             = args[0].equals("--find-some-non-tmw"))))) {
+                             = args[0].equals("--find-some-non-tmw"))
+                         || (findSomeNonTMMode
+                             = args[0].equals("--find-some-non-tm"))
+                         || (findAllNonTMMode
+                             = args[0].equals("--find-all-non-tm"))
+                ))) {
                out.println("TibetanConverter [--find-all-non-tmw | --find-some-non-tmw");
                out.println("                  | --to-tibetan-machine | --to-tibetan-machine-web");
                out.println("                  | --to-unicode | --to-wylie] RTF_file");
@ -86,13 +93,18 @@ public class TibetanConverter {
                out.println(" -v | --version for version info");
                out.println(" -h | --help for this message");
                out.println(" --find-all-non-tmw to locate all characters in the input document that are");
-                out.println("   not in Tibetan Machine Web fonts, exit zero iff none found");
+                out.println("   not in Tibetan Machine Web fonts, exit zero if and only if none found");
                out.println(" --find-some-non-tmw to locate all distinct characters in the input document");
-                out.println("   not in Tibetan Machine Web fonts, exit zero iff none found");
+                out.println("   not in Tibetan Machine Web fonts, exit zero if and only if none found");
+                out.println(" --find-all-non-tm to locate all characters in the input document that are");
+                out.println("   not in Tibetan Machine fonts, exit zero if and only if none found");
+                out.println(" --find-some-non-tm to locate all distinct characters in the input document");
+                out.println("   not in Tibetan Machine fonts, exit zero if and only if none found");
                out.println(" --to-tibetan-machine to convert TibetanMachineWeb to TibetanMachine");
                out.println(" --to-unicode to convert TibetanMachineWeb to Unicode");
                out.println(" --to-tibetan-machine-web to convert TibetanMachine to TibetanMachineWeb");
                out.println(" --to-wylie to convert TibetanMachineWeb to THDL Extended Wylie");
+                out.println("");
                out.println(" In --to... modes, needs one argument, the name of the TibetanMachineWeb RTF");
                out.println(" file (for --to-wylie, --to-unicode, and --to-tibetan-machine) or the name of");
                out.println(" the TibetanMachine RTF file (for --to-tibetan-machine-web).  Writes the");
@ -101,9 +113,9 @@ public class TibetanConverter {
                out.println(" glyphs couldn't be converted (in which case the output is just those glyphs),");
                out.println(" nonzero otherwise.");
                out.println("");
-                out.println(" You may find it helpful to use `--find-some-non-tmw' mode before doing a");
+                out.println(" You may find it helpful to use `--find-some-non-tmw' mode (or");
+                out.println(" `--find-some-non-tm' mode for Tibetan Machine input) before doing a");
                out.println(" conversion so that you have confidence in the conversion's correctness.");
-                // DLC add find-some/all-non-tm
                // DLC add Wylie->TMW mode.
                // DLC give error if you have a TM file and try TMW->Unicode.
                return 77;
@ -136,6 +148,12 @@ public class TibetanConverter {
            } else if (findSomeNonTMWMode) {
                // 0, -1 is the entire document.
                return ((TibetanDocument)dp.getDocument()).findSomeNonTMWCharacters(0, -1, out);
+            } else if (findSomeNonTMMode) {
+                // 0, -1 is the entire document.
+                return ((TibetanDocument)dp.getDocument()).findSomeNonTMCharacters(0, -1, out);
+            } else if (findAllNonTMMode) {
+                // 0, -1 is the entire document.
+                return ((TibetanDocument)dp.getDocument()).findAllNonTMCharacters(0, -1, out);
            } else { // conversion {to Wylie or TM} mode
                // Fix curly braces in the entire document if the input is TMW:
                if (!convertToTMWMode) {
--- a/source/org/thdl/tib/text/TibetanDocument.java
+++ b/source/org/thdl/tib/text/TibetanDocument.java
@ -54,11 +54,16 @@ class CharacterInAGivenFont {
    }
    public String toString() {
        String characterRepresentation
-            = "'" + new Character(character).toString() + "'";
+            = "'" + (('\'' == character)
+                     ? "\\'"
+                     : new Character(character).toString())
+            + "' [decimal " + (int)character + "]";
        if ('\n' == character)
-            characterRepresentation = "newline";
+            characterRepresentation
+                = "newline [decimal " + (int)character + "]";
        if ('\r' == character)
-            characterRepresentation = "carriage return";
+            characterRepresentation
+                = "carriage return [decimal " + (int)character + "]";
        return characterRepresentation + " in the font "
            + ((null == fontName)
               ? "_ERROR_FINDING_FONT_"
@ -301,37 +306,29 @@ public class TibetanDocument extends DefaultStyledDocument {

    /** Configurable so that System.out isn't necessarily used. */
    public int findAllNonTMWCharacters(int begin, int end, PrintStream out) {
-        if (end < 0)
-            end = getLength();
-        if (begin >= end)
-            return 0;
-        int i = begin;
-        int returnValue = 0;
-        try {
-            while (i < end) {
-                AttributeSet attr = getCharacterElement(i).getAttributes();
-                String fontName = StyleConstants.getFontFamily(attr);
-                if ((0 == TibetanMachineWeb.getTMWFontNumber(fontName))) {
-                    returnValue = 1;
-                    CharacterInAGivenFont cgf
-                        = new CharacterInAGivenFont(getText(i, 1), fontName);
-                    out.println("non-TMW character "
-                                       + cgf + " at location " + i);
+        return findCharacters(begin, end, out, "Non-TMW", true);
    }
-                i++;
+
+    /** Prints to standard output a list of all the indices of
+        characters that are not in a TM font within the range [begin,
+        end).  Using a negative number for end means that this will
+        run to the end of the document.  SPEED_FIXME: might be faster
+        to run over the elements, if they are one per font.
+        @return 1 if at least one non-TM character was found in
+        the specified range, zero if none were, -1 on error. */
+    public int findAllNonTMCharacters(int begin, int end) {
+        return findAllNonTMCharacters(begin, end, System.out);
    }
-        } catch (BadLocationException ble) {
-            ble.printStackTrace(out);
-            ThdlDebug.noteIffyCode();
-            returnValue = -1;
-        }
-        return returnValue;
+
+    /** Like findAllNonTMCharacters(int, int), but reports to out. */
+    public int findAllNonTMCharacters(int begin, int end, PrintStream out) {
+        return findCharacters(begin, end, out, "Non-TM", true);
    }

    /** Finds the first occurrence of a non-TMW character in a given
        font and prints it to System.out.  If you have a Tahoma
-        newline and an Arial newline, the first occurrence of each
-        will be reported.
+        newline and an Arial newline, e.g., the first occurrence of
+        each will be reported.
        
        <p>Works within the range [start, end).  Using a negative
        number for end means that this will run to the end of the
@ -343,8 +340,39 @@ public class TibetanDocument extends DefaultStyledDocument {
        return findSomeNonTMWCharacters(begin, end, System.out);
    }

+    /** Finds the first occurrence of a non-TM character in a given
+        font and prints it to System.out.  If you have a Tahoma
+        newline and an Arial newline, e.g., the first occurrence of
+        each will be reported.
+        
+        <p>Works within the range [begin, end).  Using a negative
+        number for end means that this will run to the end of the
+        document.  SPEED_FIXME: might be faster to run over the
+        elements, if they are one per font.
+        @return 1 if at least one non-TM character was found in
+        the specified range, zero if none were, -1 on error. */
+    public int findSomeNonTMCharacters(int begin, int end) {
+        return findSomeNonTMCharacters(begin, end, System.out);
+    }
+
    /** Configurable so that System.out isn't necessarily used. */
    public int findSomeNonTMWCharacters(int begin, int end, PrintStream out) {
+        return findCharacters(begin, end, out, "Non-TMW", false);
+    }
+
+    /** Like findSomeNonTMCharacters(int, int), but reports to out. */
+    public int findSomeNonTMCharacters(int begin, int end, PrintStream out) {
+        return findCharacters(begin, end, out, "Non-TM", false);
+    }
+
+    /** Pass in whatKind=="Non-TMW" or whatKind=="Non-TM" for now; see
+        callers and the code to understand the semantics.  Pass in all
+        == true to find all characters or all == false to report each
+        character just once. */
+    private int findCharacters(int begin, int end, PrintStream out,
+                               String whatKind, boolean all) {
+        if (whatKind != "Non-TMW" && whatKind != "Non-TM")
+            throw new IllegalArgumentException("You didn't use an interned string.");
        if (end < 0)
            end = getLength();
        if (begin >= end)
@ -352,19 +380,28 @@ public class TibetanDocument extends DefaultStyledDocument {
        int i = begin;
        int returnValue = 0;
        try {
-            HashMap cgfTable = new HashMap();
+            HashMap cgfTable = null;
+            if (!all) cgfTable = new HashMap();
            while (i < end) {
                AttributeSet attr = getCharacterElement(i).getAttributes();
                String fontName = StyleConstants.getFontFamily(attr);
-                if ((0 == TibetanMachineWeb.getTMWFontNumber(fontName))) {
+                if ((whatKind == "Non-TMW"
+                     && (0 == TibetanMachineWeb.getTMWFontNumber(fontName)))
+                    || (whatKind == "Non-TM"
+                        && (0 == TibetanMachineWeb.getTMFontNumber(fontName)))) {
                    returnValue = 1;
                    CharacterInAGivenFont cgf
                        = new CharacterInAGivenFont(getText(i, 1), fontName);
-                    if (!cgfTable.containsKey(cgf)) {
+                    boolean doOutput = all;
+                    if (!all && !cgfTable.containsKey(cgf)) {
                        cgfTable.put(cgf, "yes this character appears once");
-                        out.println("non-TMW character "
-                                    + cgf + " appears first at location " + i);
+                        doOutput = true;
                    }
+                    if (true == doOutput)
+                        out.println(whatKind + " character "
+                                    + cgf + " appears "
+                                    + ((all) ? "" : "first ")
+                                    + "at location " + i);
                }
                i++;
            }