Added --find-some-non-tm and --find-all-non-tm modes to the converter to help ensure worry-free TM->TMW conversions.
This commit is contained in:
dchandler 2003-06-22 00:14:18 +00:00
parent 80101666c7
commit dfe64a1927
2 changed files with 95 additions and 40 deletions

View file

@ -57,6 +57,8 @@ public class TibetanConverter {
boolean convertToWylieMode = false; boolean convertToWylieMode = false;
boolean findSomeNonTMWMode = false; boolean findSomeNonTMWMode = false;
boolean findAllNonTMWMode = false; boolean findAllNonTMWMode = false;
boolean findSomeNonTMMode = false;
boolean findAllNonTMMode = false;
// Process arguments: // Process arguments:
if ((args.length != 1 && args.length != 2) if ((args.length != 1 && args.length != 2)
|| (args.length == 1 || (args.length == 1
@ -74,7 +76,12 @@ public class TibetanConverter {
|| (convertToWylieMode || (convertToWylieMode
= args[0].equals("--to-wylie")) = args[0].equals("--to-wylie"))
|| (findSomeNonTMWMode || (findSomeNonTMWMode
= args[0].equals("--find-some-non-tmw"))))) { = args[0].equals("--find-some-non-tmw"))
|| (findSomeNonTMMode
= args[0].equals("--find-some-non-tm"))
|| (findAllNonTMMode
= args[0].equals("--find-all-non-tm"))
))) {
out.println("TibetanConverter [--find-all-non-tmw | --find-some-non-tmw"); out.println("TibetanConverter [--find-all-non-tmw | --find-some-non-tmw");
out.println(" | --to-tibetan-machine | --to-tibetan-machine-web"); out.println(" | --to-tibetan-machine | --to-tibetan-machine-web");
out.println(" | --to-unicode | --to-wylie] RTF_file"); out.println(" | --to-unicode | --to-wylie] RTF_file");
@ -86,13 +93,18 @@ public class TibetanConverter {
out.println(" -v | --version for version info"); out.println(" -v | --version for version info");
out.println(" -h | --help for this message"); out.println(" -h | --help for this message");
out.println(" --find-all-non-tmw to locate all characters in the input document that are"); out.println(" --find-all-non-tmw to locate all characters in the input document that are");
out.println(" not in Tibetan Machine Web fonts, exit zero iff none found"); out.println(" not in Tibetan Machine Web fonts, exit zero if and only if none found");
out.println(" --find-some-non-tmw to locate all distinct characters in the input document"); out.println(" --find-some-non-tmw to locate all distinct characters in the input document");
out.println(" not in Tibetan Machine Web fonts, exit zero iff none found"); out.println(" not in Tibetan Machine Web fonts, exit zero if and only if none found");
out.println(" --find-all-non-tm to locate all characters in the input document that are");
out.println(" not in Tibetan Machine fonts, exit zero if and only if none found");
out.println(" --find-some-non-tm to locate all distinct characters in the input document");
out.println(" not in Tibetan Machine fonts, exit zero if and only if none found");
out.println(" --to-tibetan-machine to convert TibetanMachineWeb to TibetanMachine"); out.println(" --to-tibetan-machine to convert TibetanMachineWeb to TibetanMachine");
out.println(" --to-unicode to convert TibetanMachineWeb to Unicode"); out.println(" --to-unicode to convert TibetanMachineWeb to Unicode");
out.println(" --to-tibetan-machine-web to convert TibetanMachine to TibetanMachineWeb"); out.println(" --to-tibetan-machine-web to convert TibetanMachine to TibetanMachineWeb");
out.println(" --to-wylie to convert TibetanMachineWeb to THDL Extended Wylie"); out.println(" --to-wylie to convert TibetanMachineWeb to THDL Extended Wylie");
out.println("");
out.println(" In --to... modes, needs one argument, the name of the TibetanMachineWeb RTF"); out.println(" In --to... modes, needs one argument, the name of the TibetanMachineWeb RTF");
out.println(" file (for --to-wylie, --to-unicode, and --to-tibetan-machine) or the name of"); out.println(" file (for --to-wylie, --to-unicode, and --to-tibetan-machine) or the name of");
out.println(" the TibetanMachine RTF file (for --to-tibetan-machine-web). Writes the"); out.println(" the TibetanMachine RTF file (for --to-tibetan-machine-web). Writes the");
@ -101,9 +113,9 @@ public class TibetanConverter {
out.println(" glyphs couldn't be converted (in which case the output is just those glyphs),"); out.println(" glyphs couldn't be converted (in which case the output is just those glyphs),");
out.println(" nonzero otherwise."); out.println(" nonzero otherwise.");
out.println(""); out.println("");
out.println(" You may find it helpful to use `--find-some-non-tmw' mode before doing a"); out.println(" You may find it helpful to use `--find-some-non-tmw' mode (or");
out.println(" `--find-some-non-tm' mode for Tibetan Machine input) before doing a");
out.println(" conversion so that you have confidence in the conversion's correctness."); out.println(" conversion so that you have confidence in the conversion's correctness.");
// DLC add find-some/all-non-tm
// DLC add Wylie->TMW mode. // DLC add Wylie->TMW mode.
// DLC give error if you have a TM file and try TMW->Unicode. // DLC give error if you have a TM file and try TMW->Unicode.
return 77; return 77;
@ -136,6 +148,12 @@ public class TibetanConverter {
} else if (findSomeNonTMWMode) { } else if (findSomeNonTMWMode) {
// 0, -1 is the entire document. // 0, -1 is the entire document.
return ((TibetanDocument)dp.getDocument()).findSomeNonTMWCharacters(0, -1, out); return ((TibetanDocument)dp.getDocument()).findSomeNonTMWCharacters(0, -1, out);
} else if (findSomeNonTMMode) {
// 0, -1 is the entire document.
return ((TibetanDocument)dp.getDocument()).findSomeNonTMCharacters(0, -1, out);
} else if (findAllNonTMMode) {
// 0, -1 is the entire document.
return ((TibetanDocument)dp.getDocument()).findAllNonTMCharacters(0, -1, out);
} else { // conversion {to Wylie or TM} mode } else { // conversion {to Wylie or TM} mode
// Fix curly braces in the entire document if the input is TMW: // Fix curly braces in the entire document if the input is TMW:
if (!convertToTMWMode) { if (!convertToTMWMode) {

View file

@ -54,11 +54,16 @@ class CharacterInAGivenFont {
} }
public String toString() { public String toString() {
String characterRepresentation String characterRepresentation
= "'" + new Character(character).toString() + "'"; = "'" + (('\'' == character)
? "\\'"
: new Character(character).toString())
+ "' [decimal " + (int)character + "]";
if ('\n' == character) if ('\n' == character)
characterRepresentation = "newline"; characterRepresentation
= "newline [decimal " + (int)character + "]";
if ('\r' == character) if ('\r' == character)
characterRepresentation = "carriage return"; characterRepresentation
= "carriage return" + (int)character + "]";
return characterRepresentation + " in the font " return characterRepresentation + " in the font "
+ ((null == fontName) + ((null == fontName)
? "_ERROR_FINDING_FONT_" ? "_ERROR_FINDING_FONT_"
@ -301,37 +306,29 @@ public class TibetanDocument extends DefaultStyledDocument {
/** Configurable so that System.out isn't necessarily used. */ /** Configurable so that System.out isn't necessarily used. */
public int findAllNonTMWCharacters(int begin, int end, PrintStream out) { public int findAllNonTMWCharacters(int begin, int end, PrintStream out) {
if (end < 0) return findCharacters(begin, end, out, "Non-TMW", true);
end = getLength(); }
if (begin >= end)
return 0; /** Prints to standard output a list of all the indices of
int i = begin; characters that are not in a TM font within the range [start,
int returnValue = 0; end). Using a negative number for end means that this will
try { run to the end of the document. SPEED_FIXME: might be faster
while (i < end) { to run over the elements, if they are one per font.
AttributeSet attr = getCharacterElement(i).getAttributes(); @return 1 if at least one non-TM character was found in
String fontName = StyleConstants.getFontFamily(attr); the specified range, zero if none were, -1 on error. */
if ((0 == TibetanMachineWeb.getTMWFontNumber(fontName))) { public int findAllNonTMCharacters(int begin, int end) {
returnValue = 1; return findAllNonTMCharacters(begin, end, System.out);
CharacterInAGivenFont cgf }
= new CharacterInAGivenFont(getText(i, 1), fontName);
out.println("non-TMW character " /** Configurable so that System.out isn't necessarily used. */
+ cgf + " at location " + i); public int findAllNonTMCharacters(int begin, int end, PrintStream out) {
} return findCharacters(begin, end, out, "Non-TM", true);
i++;
}
} catch (BadLocationException ble) {
ble.printStackTrace(out);
ThdlDebug.noteIffyCode();
returnValue = -1;
}
return returnValue;
} }
/** Finds the first occurrence of a non-TMW character in a given /** Finds the first occurrence of a non-TMW character in a given
font and prints it to System.out. If you have a Tahoma font and prints it to System.out. If you have a Tahoma
newline and an Arial newline, the first occurrence of each newline and an Arial newline, e.g., the first occurrence of
will be reported. each will be reported.
<p>Works within the range [start, end). Using a negative <p>Works within the range [start, end). Using a negative
number for end means that this will run to the end of the number for end means that this will run to the end of the
@ -343,8 +340,39 @@ public class TibetanDocument extends DefaultStyledDocument {
return findSomeNonTMWCharacters(begin, end, System.out); return findSomeNonTMWCharacters(begin, end, System.out);
} }
/** Finds the first occurrence of a non-TM character in a given
font and prints it to System.out. If you have a Tahoma
newline and an Arial newline, e.g., the first occurrence of
each will be reported.
<p>Works within the range [start, end). Using a negative
number for end means that this will run to the end of the
document. SPEED_FIXME: might be faster to run over the
elements, if they are one per font.
@return 1 if at least one non-TM character was found in
the specified range, zero if none were, -1 on error. */
public int findSomeNonTMCharacters(int begin, int end) {
return findSomeNonTMCharacters(begin, end, System.out);
}
/** Configurable so that System.out isn't necessarily used. */ /** Configurable so that System.out isn't necessarily used. */
public int findSomeNonTMWCharacters(int begin, int end, PrintStream out) { public int findSomeNonTMWCharacters(int begin, int end, PrintStream out) {
return findCharacters(begin, end, out, "Non-TMW", false);
}
/** Configurable so that System.out isn't necessarily used. */
public int findSomeNonTMCharacters(int begin, int end, PrintStream out) {
return findCharacters(begin, end, out, "Non-TM", false);
}
/** Pass in whatKind=="Non-TMW" or whatKind=="Non-TM" for now; see
callers and the code to understand the semantics. Pass in all
== true to find all characters or all == false to report each
character just once. */
private int findCharacters(int begin, int end, PrintStream out,
String whatKind, boolean all) {
if (whatKind != "Non-TMW" && whatKind != "Non-TM")
throw new IllegalArgumentException("You didn't use an interned string.");
if (end < 0) if (end < 0)
end = getLength(); end = getLength();
if (begin >= end) if (begin >= end)
@ -352,19 +380,28 @@ public class TibetanDocument extends DefaultStyledDocument {
int i = begin; int i = begin;
int returnValue = 0; int returnValue = 0;
try { try {
HashMap cgfTable = new HashMap(); HashMap cgfTable = null;
if (!all) cgfTable = new HashMap();
while (i < end) { while (i < end) {
AttributeSet attr = getCharacterElement(i).getAttributes(); AttributeSet attr = getCharacterElement(i).getAttributes();
String fontName = StyleConstants.getFontFamily(attr); String fontName = StyleConstants.getFontFamily(attr);
if ((0 == TibetanMachineWeb.getTMWFontNumber(fontName))) { if ((whatKind == "Non-TMW"
&& (0 == TibetanMachineWeb.getTMWFontNumber(fontName)))
|| (whatKind == "Non-TM"
&& (0 == TibetanMachineWeb.getTMFontNumber(fontName)))) {
returnValue = 1; returnValue = 1;
CharacterInAGivenFont cgf CharacterInAGivenFont cgf
= new CharacterInAGivenFont(getText(i, 1), fontName); = new CharacterInAGivenFont(getText(i, 1), fontName);
if (!cgfTable.containsKey(cgf)) { boolean doOutput = all;
if (!all && !cgfTable.containsKey(cgf)) {
cgfTable.put(cgf, "yes this character appears once"); cgfTable.put(cgf, "yes this character appears once");
out.println("non-TMW character " doOutput = true;
+ cgf + " appears first at location " + i);
} }
if (true == doOutput)
out.println(whatKind + " character "
+ cgf + " appears "
+ ((all) ? "" : "first ")
+ "at location " + i);
} }
i++; i++;
} }