Patch summary
  Adds the command-line modes "--find-some-non-tm" and "--find-all-non-tm" to
  TibetanConverter, and makes TibetanDocument's find{All,Some}NonTMW/NonTM
  methods delegate to one private scanning loop, findCharacters().
  CharacterInAGivenFont.toString() now also prints the decimal code of the
  character.

Review notes
  * This copy was rebuilt from a whitespace-collapsed paste.  Line structure
    agrees with every hunk header, but indentation and spacing inside lines
    are reconstructed, so apply with whitespace-insensitive matching.
  * The blank line before each "Works within the range" Javadoc line may have
    lost markup (possibly a paragraph tag) in the paste; confirm against the
    tree before applying.
  * Amendments made during review (added lines only; hunk sizes unchanged):
      1. toString(): the carriage-return text lacked " [decimal " and would
         have printed, e.g., "carriage return13]".
      2. findCharacters(): whatKind is compared with equals() instead of
         ==/!=, so it no longer depends on string interning.
      3. findSomeNonTMCharacters(int, int): the @return text said non-TMW;
         it now says non-TM.
    Because of these amendments, the post-image hash on the "index" line for
    TibetanDocument.java no longer matches the patched content.

diff --git a/source/org/thdl/tib/input/TibetanConverter.java b/source/org/thdl/tib/input/TibetanConverter.java
index 52598c3..d91edfa 100644
--- a/source/org/thdl/tib/input/TibetanConverter.java
+++ b/source/org/thdl/tib/input/TibetanConverter.java
@@ -57,6 +57,8 @@ public class TibetanConverter {
         boolean convertToWylieMode = false;
         boolean findSomeNonTMWMode = false;
         boolean findAllNonTMWMode = false;
+        boolean findSomeNonTMMode = false;
+        boolean findAllNonTMMode = false;
         // Process arguments:
         if ((args.length != 1 && args.length != 2)
             || (args.length == 1
@@ -74,7 +76,12 @@ public class TibetanConverter {
                      || (convertToWylieMode
                          = args[0].equals("--to-wylie"))
                      || (findSomeNonTMWMode
-                         = args[0].equals("--find-some-non-tmw"))))) {
+                         = args[0].equals("--find-some-non-tmw"))
+                     || (findSomeNonTMMode
+                         = args[0].equals("--find-some-non-tm"))
+                     || (findAllNonTMMode
+                         = args[0].equals("--find-all-non-tm"))
+                     ))) {
             out.println("TibetanConverter [--find-all-non-tmw | --find-some-non-tmw");
             out.println(" | --to-tibetan-machine | --to-tibetan-machine-web");
             out.println(" | --to-unicode | --to-wylie] RTF_file");
@@ -86,13 +93,18 @@ public class TibetanConverter {
             out.println(" -v | --version for version info");
             out.println(" -h | --help for this message");
             out.println(" --find-all-non-tmw to locate all characters in the input document that are");
-            out.println(" not in Tibetan Machine Web fonts, exit zero iff none found");
+            out.println(" not in Tibetan Machine Web fonts, exit zero if and only if none found");
             out.println(" --find-some-non-tmw to locate all distinct characters in the input document");
-            out.println(" not in Tibetan Machine Web fonts, exit zero iff none found");
+            out.println(" not in Tibetan Machine Web fonts, exit zero if and only if none found");
+            out.println(" --find-all-non-tm to locate all characters in the input document that are");
+            out.println(" not in Tibetan Machine fonts, exit zero if and only if none found");
+            out.println(" --find-some-non-tm to locate all distinct characters in the input document");
+            out.println(" not in Tibetan Machine fonts, exit zero if and only if none found");
             out.println(" --to-tibetan-machine to convert TibetanMachineWeb to TibetanMachine");
             out.println(" --to-unicode to convert TibetanMachineWeb to Unicode");
             out.println(" --to-tibetan-machine-web to convert TibetanMachine to TibetanMachineWeb");
             out.println(" --to-wylie to convert TibetanMachineWeb to THDL Extended Wylie");
+            out.println("");
             out.println(" In --to... modes, needs one argument, the name of the TibetanMachineWeb RTF");
             out.println(" file (for --to-wylie, --to-unicode, and --to-tibetan-machine) or the name of");
             out.println(" the TibetanMachine RTF file (for --to-tibetan-machine-web). Writes the");
@@ -101,9 +113,9 @@ public class TibetanConverter {
             out.println(" glyphs couldn't be converted (in which case the output is just those glyphs),");
             out.println(" nonzero otherwise.");
             out.println("");
-            out.println(" You may find it helpful to use `--find-some-non-tmw' mode before doing a");
+            out.println(" You may find it helpful to use `--find-some-non-tmw' mode (or");
+            out.println(" `--find-some-non-tm' mode for Tibetan Machine input) before doing a");
             out.println(" conversion so that you have confidence in the conversion's correctness.");
-            // DLC add find-some/all-non-tm
             // DLC add Wylie->TMW mode.
             // DLC give error if you have a TM file and try TMW->Unicode.
             return 77;
@@ -136,6 +148,12 @@ public class TibetanConverter {
             } else if (findSomeNonTMWMode) {
                 // 0, -1 is the entire document.
                 return ((TibetanDocument)dp.getDocument()).findSomeNonTMWCharacters(0, -1, out);
+            } else if (findSomeNonTMMode) {
+                // 0, -1 is the entire document.
+                return ((TibetanDocument)dp.getDocument()).findSomeNonTMCharacters(0, -1, out);
+            } else if (findAllNonTMMode) {
+                // 0, -1 is the entire document.
+                return ((TibetanDocument)dp.getDocument()).findAllNonTMCharacters(0, -1, out);
             } else { // conversion {to Wylie or TM} mode
                 // Fix curly braces in the entire document if the input is TMW:
                 if (!convertToTMWMode) {
diff --git a/source/org/thdl/tib/text/TibetanDocument.java b/source/org/thdl/tib/text/TibetanDocument.java
index 49d02a6..c59b0f1 100644
--- a/source/org/thdl/tib/text/TibetanDocument.java
+++ b/source/org/thdl/tib/text/TibetanDocument.java
@@ -54,11 +54,16 @@ class CharacterInAGivenFont {
     }
     public String toString() {
         String characterRepresentation
-            = "'" + new Character(character).toString() + "'";
+            = "'" + (('\'' == character)
+                     ? "\\'"
+                     : new Character(character).toString())
+            + "' [decimal " + (int)character + "]";
         if ('\n' == character)
-            characterRepresentation = "newline";
+            characterRepresentation
+                = "newline [decimal " + (int)character + "]";
         if ('\r' == character)
-            characterRepresentation = "carriage return";
+            characterRepresentation
+                = "carriage return [decimal " + (int)character + "]";
         return characterRepresentation + " in the font "
             + ((null == fontName)
                ? "_ERROR_FINDING_FONT_"
@@ -301,37 +306,29 @@ public class TibetanDocument extends DefaultStyledDocument {
 
     /** Configurable so that System.out isn't necessarily used. */
     public int findAllNonTMWCharacters(int begin, int end, PrintStream out) {
-        if (end < 0)
-            end = getLength();
-        if (begin >= end)
-            return 0;
-        int i = begin;
-        int returnValue = 0;
-        try {
-            while (i < end) {
-                AttributeSet attr = getCharacterElement(i).getAttributes();
-                String fontName = StyleConstants.getFontFamily(attr);
-                if ((0 == TibetanMachineWeb.getTMWFontNumber(fontName))) {
-                    returnValue = 1;
-                    CharacterInAGivenFont cgf
-                        = new CharacterInAGivenFont(getText(i, 1), fontName);
-                    out.println("non-TMW character "
-                                + cgf + " at location " + i);
-                }
-                i++;
-            }
-        } catch (BadLocationException ble) {
-            ble.printStackTrace(out);
-            ThdlDebug.noteIffyCode();
-            returnValue = -1;
-        }
-        return returnValue;
+        return findCharacters(begin, end, out, "Non-TMW", true);
+    }
+
+    /** Prints to standard output a list of all the indices of
+        characters that are not in a TM font within the range [start,
+        end). Using a negative number for end means that this will
+        run to the end of the document. SPEED_FIXME: might be faster
+        to run over the elements, if they are one per font.
+        @return 1 if at least one non-TM character was found in
+        the specified range, zero if none were, -1 on error. */
+    public int findAllNonTMCharacters(int begin, int end) {
+        return findAllNonTMCharacters(begin, end, System.out);
+    }
+
+    /** Configurable so that System.out isn't necessarily used. */
+    public int findAllNonTMCharacters(int begin, int end, PrintStream out) {
+        return findCharacters(begin, end, out, "Non-TM", true);
     }
 
     /** Finds the first occurrence of a non-TMW character in a given
         font and prints it to System.out. If you have a Tahoma
-        newline and an Arial newline, the first occurrence of each
-        will be reported.
+        newline and an Arial newline, e.g., the first occurrence of
+        each will be reported.
 
         Works within the range [start, end). Using a negative
         number for end means that this will run to the end of the
@@ -343,8 +340,39 @@ public class TibetanDocument extends DefaultStyledDocument {
         return findSomeNonTMWCharacters(begin, end, System.out);
     }
 
+    /** Finds the first occurrence of a non-TM character in a given
+        font and prints it to System.out. If you have a Tahoma
+        newline and an Arial newline, e.g., the first occurrence of
+        each will be reported.
+
+        Works within the range [start, end). Using a negative
+        number for end means that this will run to the end of the
+        document. SPEED_FIXME: might be faster to run over the
+        elements, if they are one per font.
+        @return 1 if at least one non-TM character was found in
+        the specified range, zero if none were, -1 on error. */
+    public int findSomeNonTMCharacters(int begin, int end) {
+        return findSomeNonTMCharacters(begin, end, System.out);
+    }
+
     /** Configurable so that System.out isn't necessarily used. */
     public int findSomeNonTMWCharacters(int begin, int end, PrintStream out) {
+        return findCharacters(begin, end, out, "Non-TMW", false);
+    }
+
+    /** Configurable so that System.out isn't necessarily used. */
+    public int findSomeNonTMCharacters(int begin, int end, PrintStream out) {
+        return findCharacters(begin, end, out, "Non-TM", false);
+    }
+
+    /** Pass in whatKind "Non-TMW" or "Non-TM" (compared with equals());
+        see callers and the code to understand the semantics. Pass in
+        all == true to find all characters or all == false to report
+        each character just once. */
+    private int findCharacters(int begin, int end, PrintStream out,
+                               String whatKind, boolean all) {
+        if (!"Non-TMW".equals(whatKind) && !"Non-TM".equals(whatKind))
+            throw new IllegalArgumentException("whatKind must be \"Non-TMW\" or \"Non-TM\".");
         if (end < 0)
             end = getLength();
         if (begin >= end)
@@ -352,19 +380,28 @@ public class TibetanDocument extends DefaultStyledDocument {
         int i = begin;
         int returnValue = 0;
         try {
-            HashMap cgfTable = new HashMap();
+            HashMap cgfTable = null;
+            if (!all) cgfTable = new HashMap();
             while (i < end) {
                 AttributeSet attr = getCharacterElement(i).getAttributes();
                 String fontName = StyleConstants.getFontFamily(attr);
-                if ((0 == TibetanMachineWeb.getTMWFontNumber(fontName))) {
+                if (("Non-TMW".equals(whatKind)
+                     && (0 == TibetanMachineWeb.getTMWFontNumber(fontName)))
+                    || ("Non-TM".equals(whatKind)
+                        && (0 == TibetanMachineWeb.getTMFontNumber(fontName)))) {
                     returnValue = 1;
                     CharacterInAGivenFont cgf
                         = new CharacterInAGivenFont(getText(i, 1), fontName);
-                    if (!cgfTable.containsKey(cgf)) {
+                    boolean doOutput = all;
+                    if (!all && !cgfTable.containsKey(cgf)) {
                         cgfTable.put(cgf, "yes this character appears once");
-                        out.println("non-TMW character "
-                                    + cgf + " appears first at location " + i);
+                        doOutput = true;
                     }
+                    if (true == doOutput)
+                        out.println(whatKind + " character "
+                                    + cgf + " appears "
+                                    + ((all) ? "" : "first ")
+                                    + "at location " + i);
                 }
                 i++;
             }