Added a mechanism for end users to have the ACIP/EWTS=>Tibetan converters print either all tsheg bars or all unique tsheg bars to standard output. This is useful, e.g., for getting a list of all the tsheg bars in ACIP texts, which can then go into PackageTest.java. Getting frequency counts would require a lot of postprocessing, but you could do it with a Perl script, awk, etc.
This commit is contained in:
parent
ef24c608bf
commit
7ba1ad0735
1 changed files with 35 additions and 3 deletions
|
@ -18,6 +18,10 @@ Contributor(s): ______________________________________.
|
||||||
|
|
||||||
package org.thdl.tib.text.ttt;
|
package org.thdl.tib.text.ttt;
|
||||||
|
|
||||||
|
import org.thdl.util.ThdlOptions;
|
||||||
|
|
||||||
|
import java.util.HashSet;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A TString is some Latin text and a type, the type stating whether
|
* A TString is some Latin text and a type, the type stating whether
|
||||||
* said text is Latin (usually English) or transliteration of Tibetan,
|
* said text is Latin (usually English) or transliteration of Tibetan,
|
||||||
|
@ -118,10 +122,38 @@ public class TString {
|
||||||
* <i>type</i> being a characterization like {@link #DD}. */
|
* <i>type</i> being a characterization like {@link #DD}. */
|
||||||
public TString(String text, int type) {
|
public TString(String text, int type) {
|
||||||
setType(type);
|
setType(type);
|
||||||
setText((TIBETAN_NON_PUNCTUATION == type)
|
String ftext = (TIBETAN_NON_PUNCTUATION == type)
|
||||||
? MidLexSubstitution.getFinalValueForTibetanNonPunctuationToken(text)
|
? MidLexSubstitution.getFinalValueForTibetanNonPunctuationToken(text)
|
||||||
: text);
|
: text;
|
||||||
|
setText(ftext);
|
||||||
|
if ((outputAllTshegBars || outputUniqueTshegBars) && TIBETAN_NON_PUNCTUATION == type)
|
||||||
|
outputTshegBar(ftext);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Prints x to standard output: always if outputAllTshegBars is set;
|
||||||
|
otherwise, if outputUniqueTshegBars is set, only the first time x is encountered. */
|
||||||
|
private static void outputTshegBar(String x) {
|
||||||
|
if (outputAllTshegBars) {
|
||||||
|
System.out.println(outputTshegBarsPrefix + x);
|
||||||
|
} else if (outputUniqueTshegBars) {
|
||||||
|
if (!tshegBars.contains(x)) {
|
||||||
|
tshegBars.add(x);
|
||||||
|
System.out.println(outputTshegBarsPrefix + x);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static boolean outputAllTshegBars
|
||||||
|
= ThdlOptions.getBooleanOption("org.thdl.tib.text.ttt.OutputAllTshegBars"); // DLC DOC -- use me to generate frequency info
|
||||||
|
|
||||||
|
private static boolean outputUniqueTshegBars
|
||||||
|
= ThdlOptions.getBooleanOption("org.thdl.tib.text.ttt.OutputUniqueTshegBars"); // DLC DOC
|
||||||
|
|
||||||
|
private static String outputTshegBarsPrefix
|
||||||
|
= ThdlOptions.getStringOption("org.thdl.tib.text.ttt.PrefixForOutputTshegBars", ""); // DLC DOC
|
||||||
|
|
||||||
|
private static final HashSet tshegBars = new HashSet();
|
||||||
|
|
||||||
public String toString() {
|
public String toString() {
|
||||||
String typeString = "HUH?????";
|
String typeString = "HUH?????";
|
||||||
if (type == COMMENT) typeString = "COMMENT";
|
if (type == COMMENT) typeString = "COMMENT";
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue