Added --find-some-non-tm and --find-all-non-tm modes to the converter to
help ensure worry-free TM->TMW conversions.
This commit is contained in:
parent
80101666c7
commit
dfe64a1927
2 changed files with 95 additions and 40 deletions
|
@ -57,6 +57,8 @@ public class TibetanConverter {
|
||||||
boolean convertToWylieMode = false;
|
boolean convertToWylieMode = false;
|
||||||
boolean findSomeNonTMWMode = false;
|
boolean findSomeNonTMWMode = false;
|
||||||
boolean findAllNonTMWMode = false;
|
boolean findAllNonTMWMode = false;
|
||||||
|
boolean findSomeNonTMMode = false;
|
||||||
|
boolean findAllNonTMMode = false;
|
||||||
// Process arguments:
|
// Process arguments:
|
||||||
if ((args.length != 1 && args.length != 2)
|
if ((args.length != 1 && args.length != 2)
|
||||||
|| (args.length == 1
|
|| (args.length == 1
|
||||||
|
@ -74,7 +76,12 @@ public class TibetanConverter {
|
||||||
|| (convertToWylieMode
|
|| (convertToWylieMode
|
||||||
= args[0].equals("--to-wylie"))
|
= args[0].equals("--to-wylie"))
|
||||||
|| (findSomeNonTMWMode
|
|| (findSomeNonTMWMode
|
||||||
= args[0].equals("--find-some-non-tmw"))))) {
|
= args[0].equals("--find-some-non-tmw"))
|
||||||
|
|| (findSomeNonTMMode
|
||||||
|
= args[0].equals("--find-some-non-tm"))
|
||||||
|
|| (findAllNonTMMode
|
||||||
|
= args[0].equals("--find-all-non-tm"))
|
||||||
|
))) {
|
||||||
out.println("TibetanConverter [--find-all-non-tmw | --find-some-non-tmw");
|
out.println("TibetanConverter [--find-all-non-tmw | --find-some-non-tmw");
|
||||||
out.println(" | --to-tibetan-machine | --to-tibetan-machine-web");
|
out.println(" | --to-tibetan-machine | --to-tibetan-machine-web");
|
||||||
out.println(" | --to-unicode | --to-wylie] RTF_file");
|
out.println(" | --to-unicode | --to-wylie] RTF_file");
|
||||||
|
@ -86,13 +93,18 @@ public class TibetanConverter {
|
||||||
out.println(" -v | --version for version info");
|
out.println(" -v | --version for version info");
|
||||||
out.println(" -h | --help for this message");
|
out.println(" -h | --help for this message");
|
||||||
out.println(" --find-all-non-tmw to locate all characters in the input document that are");
|
out.println(" --find-all-non-tmw to locate all characters in the input document that are");
|
||||||
out.println(" not in Tibetan Machine Web fonts, exit zero iff none found");
|
out.println(" not in Tibetan Machine Web fonts, exit zero if and only if none found");
|
||||||
out.println(" --find-some-non-tmw to locate all distinct characters in the input document");
|
out.println(" --find-some-non-tmw to locate all distinct characters in the input document");
|
||||||
out.println(" not in Tibetan Machine Web fonts, exit zero iff none found");
|
out.println(" not in Tibetan Machine Web fonts, exit zero if and only if none found");
|
||||||
|
out.println(" --find-all-non-tm to locate all characters in the input document that are");
|
||||||
|
out.println(" not in Tibetan Machine fonts, exit zero if and only if none found");
|
||||||
|
out.println(" --find-some-non-tm to locate all distinct characters in the input document");
|
||||||
|
out.println(" not in Tibetan Machine fonts, exit zero if and only if none found");
|
||||||
out.println(" --to-tibetan-machine to convert TibetanMachineWeb to TibetanMachine");
|
out.println(" --to-tibetan-machine to convert TibetanMachineWeb to TibetanMachine");
|
||||||
out.println(" --to-unicode to convert TibetanMachineWeb to Unicode");
|
out.println(" --to-unicode to convert TibetanMachineWeb to Unicode");
|
||||||
out.println(" --to-tibetan-machine-web to convert TibetanMachine to TibetanMachineWeb");
|
out.println(" --to-tibetan-machine-web to convert TibetanMachine to TibetanMachineWeb");
|
||||||
out.println(" --to-wylie to convert TibetanMachineWeb to THDL Extended Wylie");
|
out.println(" --to-wylie to convert TibetanMachineWeb to THDL Extended Wylie");
|
||||||
|
out.println("");
|
||||||
out.println(" In --to... modes, needs one argument, the name of the TibetanMachineWeb RTF");
|
out.println(" In --to... modes, needs one argument, the name of the TibetanMachineWeb RTF");
|
||||||
out.println(" file (for --to-wylie, --to-unicode, and --to-tibetan-machine) or the name of");
|
out.println(" file (for --to-wylie, --to-unicode, and --to-tibetan-machine) or the name of");
|
||||||
out.println(" the TibetanMachine RTF file (for --to-tibetan-machine-web). Writes the");
|
out.println(" the TibetanMachine RTF file (for --to-tibetan-machine-web). Writes the");
|
||||||
|
@ -101,9 +113,9 @@ public class TibetanConverter {
|
||||||
out.println(" glyphs couldn't be converted (in which case the output is just those glyphs),");
|
out.println(" glyphs couldn't be converted (in which case the output is just those glyphs),");
|
||||||
out.println(" nonzero otherwise.");
|
out.println(" nonzero otherwise.");
|
||||||
out.println("");
|
out.println("");
|
||||||
out.println(" You may find it helpful to use `--find-some-non-tmw' mode before doing a");
|
out.println(" You may find it helpful to use `--find-some-non-tmw' mode (or");
|
||||||
|
out.println(" `--find-some-non-tm' mode for Tibetan Machine input) before doing a");
|
||||||
out.println(" conversion so that you have confidence in the conversion's correctness.");
|
out.println(" conversion so that you have confidence in the conversion's correctness.");
|
||||||
// DLC add find-some/all-non-tm
|
|
||||||
// DLC add Wylie->TMW mode.
|
// DLC add Wylie->TMW mode.
|
||||||
// DLC give error if you have a TM file and try TMW->Unicode.
|
// DLC give error if you have a TM file and try TMW->Unicode.
|
||||||
return 77;
|
return 77;
|
||||||
|
@ -136,6 +148,12 @@ public class TibetanConverter {
|
||||||
} else if (findSomeNonTMWMode) {
|
} else if (findSomeNonTMWMode) {
|
||||||
// 0, -1 is the entire document.
|
// 0, -1 is the entire document.
|
||||||
return ((TibetanDocument)dp.getDocument()).findSomeNonTMWCharacters(0, -1, out);
|
return ((TibetanDocument)dp.getDocument()).findSomeNonTMWCharacters(0, -1, out);
|
||||||
|
} else if (findSomeNonTMMode) {
|
||||||
|
// 0, -1 is the entire document.
|
||||||
|
return ((TibetanDocument)dp.getDocument()).findSomeNonTMCharacters(0, -1, out);
|
||||||
|
} else if (findAllNonTMMode) {
|
||||||
|
// 0, -1 is the entire document.
|
||||||
|
return ((TibetanDocument)dp.getDocument()).findAllNonTMCharacters(0, -1, out);
|
||||||
} else { // conversion {to Wylie or TM} mode
|
} else { // conversion {to Wylie or TM} mode
|
||||||
// Fix curly braces in the entire document if the input is TMW:
|
// Fix curly braces in the entire document if the input is TMW:
|
||||||
if (!convertToTMWMode) {
|
if (!convertToTMWMode) {
|
||||||
|
|
|
@ -54,11 +54,16 @@ class CharacterInAGivenFont {
|
||||||
}
|
}
|
||||||
public String toString() {
|
public String toString() {
|
||||||
String characterRepresentation
|
String characterRepresentation
|
||||||
= "'" + new Character(character).toString() + "'";
|
= "'" + (('\'' == character)
|
||||||
|
? "\\'"
|
||||||
|
: new Character(character).toString())
|
||||||
|
+ "' [decimal " + (int)character + "]";
|
||||||
if ('\n' == character)
|
if ('\n' == character)
|
||||||
characterRepresentation = "newline";
|
characterRepresentation
|
||||||
|
= "newline [decimal " + (int)character + "]";
|
||||||
if ('\r' == character)
|
if ('\r' == character)
|
||||||
characterRepresentation = "carriage return";
|
characterRepresentation
|
||||||
|
= "carriage return" + (int)character + "]";
|
||||||
return characterRepresentation + " in the font "
|
return characterRepresentation + " in the font "
|
||||||
+ ((null == fontName)
|
+ ((null == fontName)
|
||||||
? "_ERROR_FINDING_FONT_"
|
? "_ERROR_FINDING_FONT_"
|
||||||
|
@ -301,37 +306,29 @@ public class TibetanDocument extends DefaultStyledDocument {
|
||||||
|
|
||||||
/** Configurable so that System.out isn't necessarily used. */
|
/** Configurable so that System.out isn't necessarily used. */
|
||||||
public int findAllNonTMWCharacters(int begin, int end, PrintStream out) {
|
public int findAllNonTMWCharacters(int begin, int end, PrintStream out) {
|
||||||
if (end < 0)
|
return findCharacters(begin, end, out, "Non-TMW", true);
|
||||||
end = getLength();
|
|
||||||
if (begin >= end)
|
|
||||||
return 0;
|
|
||||||
int i = begin;
|
|
||||||
int returnValue = 0;
|
|
||||||
try {
|
|
||||||
while (i < end) {
|
|
||||||
AttributeSet attr = getCharacterElement(i).getAttributes();
|
|
||||||
String fontName = StyleConstants.getFontFamily(attr);
|
|
||||||
if ((0 == TibetanMachineWeb.getTMWFontNumber(fontName))) {
|
|
||||||
returnValue = 1;
|
|
||||||
CharacterInAGivenFont cgf
|
|
||||||
= new CharacterInAGivenFont(getText(i, 1), fontName);
|
|
||||||
out.println("non-TMW character "
|
|
||||||
+ cgf + " at location " + i);
|
|
||||||
}
|
}
|
||||||
i++;
|
|
||||||
|
/** Prints to standard output a list of all the indices of
|
||||||
|
characters that are not in a TM font within the range [start,
|
||||||
|
end). Using a negative number for end means that this will
|
||||||
|
run to the end of the document. SPEED_FIXME: might be faster
|
||||||
|
to run over the elements, if they are one per font.
|
||||||
|
@return 1 if at least one non-TM character was found in
|
||||||
|
the specified range, zero if none were, -1 on error. */
|
||||||
|
public int findAllNonTMCharacters(int begin, int end) {
|
||||||
|
return findAllNonTMCharacters(begin, end, System.out);
|
||||||
}
|
}
|
||||||
} catch (BadLocationException ble) {
|
|
||||||
ble.printStackTrace(out);
|
/** Configurable so that System.out isn't necessarily used. */
|
||||||
ThdlDebug.noteIffyCode();
|
public int findAllNonTMCharacters(int begin, int end, PrintStream out) {
|
||||||
returnValue = -1;
|
return findCharacters(begin, end, out, "Non-TM", true);
|
||||||
}
|
|
||||||
return returnValue;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Finds the first occurrence of a non-TMW character in a given
|
/** Finds the first occurrence of a non-TMW character in a given
|
||||||
font and prints it to System.out. If you have a Tahoma
|
font and prints it to System.out. If you have a Tahoma
|
||||||
newline and an Arial newline, the first occurrence of each
|
newline and an Arial newline, e.g., the first occurrence of
|
||||||
will be reported.
|
each will be reported.
|
||||||
|
|
||||||
<p>Works within the range [start, end). Using a negative
|
<p>Works within the range [start, end). Using a negative
|
||||||
number for end means that this will run to the end of the
|
number for end means that this will run to the end of the
|
||||||
|
@ -343,8 +340,39 @@ public class TibetanDocument extends DefaultStyledDocument {
|
||||||
return findSomeNonTMWCharacters(begin, end, System.out);
|
return findSomeNonTMWCharacters(begin, end, System.out);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Finds the first occurrence of a non-TM character in a given
|
||||||
|
font and prints it to System.out. If you have a Tahoma
|
||||||
|
newline and an Arial newline, e.g., the first occurrence of
|
||||||
|
each will be reported.
|
||||||
|
|
||||||
|
<p>Works within the range [start, end). Using a negative
|
||||||
|
number for end means that this will run to the end of the
|
||||||
|
document. SPEED_FIXME: might be faster to run over the
|
||||||
|
elements, if they are one per font.
|
||||||
|
@return 1 if at least one non-TM character was found in
|
||||||
|
the specified range, zero if none were, -1 on error. */
|
||||||
|
public int findSomeNonTMCharacters(int begin, int end) {
|
||||||
|
return findSomeNonTMCharacters(begin, end, System.out);
|
||||||
|
}
|
||||||
|
|
||||||
/** Configurable so that System.out isn't necessarily used. */
|
/** Configurable so that System.out isn't necessarily used. */
|
||||||
public int findSomeNonTMWCharacters(int begin, int end, PrintStream out) {
|
public int findSomeNonTMWCharacters(int begin, int end, PrintStream out) {
|
||||||
|
return findCharacters(begin, end, out, "Non-TMW", false);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Configurable so that System.out isn't necessarily used. */
|
||||||
|
public int findSomeNonTMCharacters(int begin, int end, PrintStream out) {
|
||||||
|
return findCharacters(begin, end, out, "Non-TM", false);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Pass in whatKind=="Non-TMW" or whatKind=="Non-TM" for now; see
|
||||||
|
callers and the code to understand the semantics. Pass in all
|
||||||
|
== true to find all characters or all == false to report each
|
||||||
|
character just once. */
|
||||||
|
private int findCharacters(int begin, int end, PrintStream out,
|
||||||
|
String whatKind, boolean all) {
|
||||||
|
if (whatKind != "Non-TMW" && whatKind != "Non-TM")
|
||||||
|
throw new IllegalArgumentException("You didn't use an interned string.");
|
||||||
if (end < 0)
|
if (end < 0)
|
||||||
end = getLength();
|
end = getLength();
|
||||||
if (begin >= end)
|
if (begin >= end)
|
||||||
|
@ -352,19 +380,28 @@ public class TibetanDocument extends DefaultStyledDocument {
|
||||||
int i = begin;
|
int i = begin;
|
||||||
int returnValue = 0;
|
int returnValue = 0;
|
||||||
try {
|
try {
|
||||||
HashMap cgfTable = new HashMap();
|
HashMap cgfTable = null;
|
||||||
|
if (!all) cgfTable = new HashMap();
|
||||||
while (i < end) {
|
while (i < end) {
|
||||||
AttributeSet attr = getCharacterElement(i).getAttributes();
|
AttributeSet attr = getCharacterElement(i).getAttributes();
|
||||||
String fontName = StyleConstants.getFontFamily(attr);
|
String fontName = StyleConstants.getFontFamily(attr);
|
||||||
if ((0 == TibetanMachineWeb.getTMWFontNumber(fontName))) {
|
if ((whatKind == "Non-TMW"
|
||||||
|
&& (0 == TibetanMachineWeb.getTMWFontNumber(fontName)))
|
||||||
|
|| (whatKind == "Non-TM"
|
||||||
|
&& (0 == TibetanMachineWeb.getTMFontNumber(fontName)))) {
|
||||||
returnValue = 1;
|
returnValue = 1;
|
||||||
CharacterInAGivenFont cgf
|
CharacterInAGivenFont cgf
|
||||||
= new CharacterInAGivenFont(getText(i, 1), fontName);
|
= new CharacterInAGivenFont(getText(i, 1), fontName);
|
||||||
if (!cgfTable.containsKey(cgf)) {
|
boolean doOutput = all;
|
||||||
|
if (!all && !cgfTable.containsKey(cgf)) {
|
||||||
cgfTable.put(cgf, "yes this character appears once");
|
cgfTable.put(cgf, "yes this character appears once");
|
||||||
out.println("non-TMW character "
|
doOutput = true;
|
||||||
+ cgf + " appears first at location " + i);
|
|
||||||
}
|
}
|
||||||
|
if (true == doOutput)
|
||||||
|
out.println(whatKind + " character "
|
||||||
|
+ cgf + " appears "
|
||||||
|
+ ((all) ? "" : "first ")
|
||||||
|
+ "at location " + i);
|
||||||
}
|
}
|
||||||
i++;
|
i++;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue