Added a mechanism for end users to customize ACIP/EWTS=>Tibetan conversions by giving a list of substitutions to be performed. E.g., when I invoke Jskad via 'java -Dorg.thdl.tib.text.ttt.VerboseReplacementMap=false -Dorg.thdl.tib.text.ttt.ReplacementMap="KAsh=>K+sh" -jar Jskad.jar', then the ACIP KAsh becomes K+sh automatically.
This mechanism is for Andres (who noticed KAsh=>K+sh in practice) and for power users only -- and not even for power users until I document it somewhere outside of the source code.
This commit is contained in:
parent
6bda550157
commit
ef24c608bf
3 changed files with 229 additions and 3 deletions
224
source/org/thdl/tib/text/ttt/MidLexSubstitution.java
Normal file
224
source/org/thdl/tib/text/ttt/MidLexSubstitution.java
Normal file
|
@ -0,0 +1,224 @@
|
|||
/*
|
||||
The contents of this file are subject to the THDL Open Community License
|
||||
Version 1.0 (the "License"); you may not use this file except in compliance
|
||||
with the License. You may obtain a copy of the License on the THDL web site
|
||||
(http://www.thdl.org/).
|
||||
|
||||
Software distributed under the License is distributed on an "AS IS" basis,
|
||||
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
|
||||
License for the specific terms governing rights and limitations under the
|
||||
License.
|
||||
|
||||
The Initial Developer of this software is the Tibetan and Himalayan Digital
|
||||
Library (THDL). Portions created by the THDL are Copyright 2003 THDL.
|
||||
All Rights Reserved.
|
||||
|
||||
Contributor(s): ______________________________________.
|
||||
*/
|
||||
|
||||
package org.thdl.tib.text.ttt;
|
||||
|
||||
import org.thdl.util.ThdlOptions;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.StringTokenizer;
|
||||
|
||||
// DLC FIXME: document this.
|
||||
|
||||
/** MidLexSubstitution is a hack that lets the end user clumsily fix
|
||||
* the EWTS-to-Tibetan and ACIP-to-Tibetan converters without having
|
||||
* to modify the source code.
|
||||
*
|
||||
* <p>If the converter isn't giving you what you want for some
|
||||
* tsheg bar, then set up a replacement.
|
||||
*
|
||||
* <p>To do so, set the system property
|
||||
* org.thdl.tib.text.ttt.ReplacementMap to be a comma-delimited list
|
||||
* of "x=>y" pairs. For example, if you think BLKU, which parses
|
||||
* as B+L+KU, should parse as B-L+KU, and you want KAsh to be parsed
|
||||
* as K+sh because the input operators mistyped it, then set
|
||||
* org.thdl.tib.text.ttt.ReplacementMap to
|
||||
* "BLKU=>B-L+KU,KAsh=>K+sh". Note that this will not cause
|
||||
* B+L+KU to become B-L+KU -- we are doing the replacement during
|
||||
* lexical analysis of the input file, not during parsing. And it
|
||||
* will cause SBLKU to become SB-L+KU, which is parsed as S+B-L+KU,
|
||||
* probably not what you wanted. If you fear such things, you can
|
||||
* see if they happen by setting the system property
|
||||
* org.thdl.tib.text.ttt.VerboseReplacementMap to "true", which will
|
||||
* cause an informational message to be printed on the Java console
|
||||
* every time a replacement is made.
|
||||
*
|
||||
* <p>Furthermore, you can use the regexp notation
|
||||
* "^BLKU$=>B-L+KU". Note that regular expressions are not
|
||||
* supported -- we're just borrowing the notation.
|
||||
* "^BLKU=>B-L+KU" means that BLKUM and BLKU will both be
|
||||
* replaced, but SBLKU and SBLKUM will not be. The caret, '^', means
|
||||
* that we only match if BLKU is at the beginning. The dollar sign,
|
||||
* '$', means that we only match if the pattern is at the end.
|
||||
* "BLKU$=>B-L+KU" will cause SBLKU to be replaced, but not BLKUM.
|
||||
* Note that performance is far better for ^FOO$ than for ^FOO, FOO$,
|
||||
* or FOO alone.
|
||||
*
|
||||
* <p>Only one substitution is made per tsheg bar. ^FOO$-type
|
||||
* mappings will be tried first, then ^FOO, then FOO$, then FOO.
|
||||
*
|
||||
* <p>Note that you cannot literally replace FOO with BAR using this
|
||||
* -- F is not an ACIP character, so the lex will not get far enough
|
||||
* to use this substitution mechanism. This is not a design flaw --
|
||||
* serious errors require user intervention (and our user can use an
|
||||
* awk script if he or she likes).
|
||||
*
|
||||
* @author David Chandler */
|
||||
final class MidLexSubstitution {
|
||||
|
||||
private MidLexSubstitution() { throw new Error("not instantiable"); }
|
||||
|
||||
/** substitutions that apply to whole tsheg bars only,
|
||||
i.e. ^FOO$=>BAR substitutions */
|
||||
private static HashMap wholeSubstMap = null;
|
||||
|
||||
/** ^FOO=>BAR (but not ^FOO$=>BAR) substitutions */
|
||||
private static ArrayList startSubstMap = null;
|
||||
|
||||
/** FOO$=>BAR (but not ^FOO$=>BAR) substitutions */
|
||||
private static ArrayList endSubstMap = null;
|
||||
|
||||
/** FOO=>BAR (but not ^FOO$=>BAR or FOO$=>BAR or
|
||||
^FOO=>BAR) substitutions */
|
||||
private static ArrayList anywhereSubstMap = null;
|
||||
|
||||
private static boolean verbose = false;
|
||||
|
||||
private static boolean inited = false;
|
||||
|
||||
private static final String ARROW = "=>";
|
||||
|
||||
/** Reads the system properties and initializes based on them. */
|
||||
private static void init() {
|
||||
inited = true;
|
||||
|
||||
verbose = ThdlOptions.getBooleanOption("org.thdl.tib.text.ttt.VerboseReplacementMap");
|
||||
if (verbose) {
|
||||
System.out.println("You have set org.thdl.tib.text.ttt.VerboseReplacementMap to true. You must be a power user.");
|
||||
}
|
||||
|
||||
String rm = ThdlOptions.getStringOption("org.thdl.tib.text.ttt.ReplacementMap", null);
|
||||
if (null != rm) {
|
||||
StringTokenizer stok = new StringTokenizer(rm, ",");
|
||||
while (stok.hasMoreElements()) {
|
||||
String mapping = stok.nextToken();
|
||||
String from, to;
|
||||
int arrowIndex = mapping.indexOf(ARROW);
|
||||
if (arrowIndex < 0) {
|
||||
System.err.println("You went to the trouble of setting the property org.thdl.tib.text.ttt.ReplacementMap, but you had a mapping, \"" + mapping + "\", in it without an arrow (" + ARROW + "). Aborting.");
|
||||
System.exit(1);
|
||||
}
|
||||
|
||||
from = mapping.substring(0, arrowIndex);
|
||||
to = mapping.substring(arrowIndex + ARROW.length());
|
||||
|
||||
boolean atStartOnly = false;
|
||||
boolean atEndOnly = false;
|
||||
if (from.length() > 0 && from.charAt(0) == '^') {
|
||||
atStartOnly = true;
|
||||
from = from.substring(1);
|
||||
}
|
||||
if (from.length() > 0 && from.charAt(from.length() - 1) == '$') {
|
||||
atEndOnly = true;
|
||||
from = from.substring(0, from.length() - 1);
|
||||
}
|
||||
if (from.length() == 0) {
|
||||
System.err.println("You went to the trouble of setting the property org.thdl.tib.text.ttt.ReplacementMap, but you had a mapping, \"" + mapping + "\", in it from the empty string to something. That's nonsense. Aborting.");
|
||||
System.exit(1);
|
||||
}
|
||||
|
||||
if (atStartOnly) {
|
||||
if (atEndOnly) {
|
||||
if (null == wholeSubstMap)
|
||||
wholeSubstMap = new HashMap(2);
|
||||
wholeSubstMap.put(from, to);
|
||||
if (verbose)
|
||||
System.out.println("You have set org.thdl.tib.text.ttt.VerboseReplacementMap to true, so you will want to know that wholeSubstMap maps " + from + " to " + to + ".");
|
||||
} else {
|
||||
if (null == startSubstMap)
|
||||
startSubstMap = new ArrayList(2);
|
||||
startSubstMap.add(new StringMapping(from, to));
|
||||
if (verbose)
|
||||
System.out.println("You have set org.thdl.tib.text.ttt.VerboseReplacementMap to true, so you will want to know that startSubstMap maps " + from + " to " + to + ".");
|
||||
}
|
||||
} else {
|
||||
if (atEndOnly) {
|
||||
if (null == endSubstMap)
|
||||
endSubstMap = new ArrayList(2);
|
||||
endSubstMap.add(new StringMapping(from, to));
|
||||
if (verbose)
|
||||
System.out.println("You have set org.thdl.tib.text.ttt.VerboseReplacementMap to true, so you will want to know that endSubstMap maps " + from + " to " + to + ".");
|
||||
} else {
|
||||
if (null == anywhereSubstMap)
|
||||
anywhereSubstMap = new ArrayList(2);
|
||||
anywhereSubstMap.add(new StringMapping(from, to));
|
||||
if (verbose)
|
||||
System.out.println("You have set org.thdl.tib.text.ttt.VerboseReplacementMap to true, so you will want to know that anywhereSubstMap maps " + from + " to " + to + ".");
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/** Returns the post-substitution value for tok, most often tok
|
||||
itself. See the class comment to understand when tok will
|
||||
change. */
|
||||
public static String getFinalValueForTibetanNonPunctuationToken(String tok) {
|
||||
if (!inited) init();
|
||||
String subst = null;
|
||||
if (null != wholeSubstMap)
|
||||
subst = (String)wholeSubstMap.get(tok);
|
||||
if (null == subst && null != startSubstMap) {
|
||||
for (int i = 0; i < startSubstMap.size(); i++) {
|
||||
StringMapping sm = (StringMapping)startSubstMap.get(i);
|
||||
if (tok.startsWith(sm.from)) {
|
||||
subst = sm.to + tok.substring(sm.from.length());
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (null == subst && null != endSubstMap) {
|
||||
for (int i = 0; i < endSubstMap.size(); i++) {
|
||||
StringMapping sm = (StringMapping)endSubstMap.get(i);
|
||||
if (tok.endsWith(sm.from)) {
|
||||
subst = tok.substring(0, tok.length() - sm.from.length()) + sm.to;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (null == subst && null != anywhereSubstMap) {
|
||||
for (int i = 0; i < anywhereSubstMap.size(); i++) {
|
||||
StringMapping sm = (StringMapping)anywhereSubstMap.get(i);
|
||||
int toki = tok.indexOf(sm.from);
|
||||
if (toki >= 0) {
|
||||
subst = tok.substring(0, toki) + sm.to + tok.substring(toki+sm.from.length(), tok.length());
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (null != subst) {
|
||||
if (verbose && null != subst) {
|
||||
System.out.println("Because org.thdl.tib.text.ttt.VerboseReplacementMap is true, you're being notified that " + tok + " is being replaced with " + subst);
|
||||
}
|
||||
return subst;
|
||||
} else {
|
||||
return tok;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/** Simple, immutable from=>to mapping for non-null Strings. */
class StringMapping {

    /** The text to be replaced; never null. */
    public final String from;

    /** The text that replaces {@link #from}; never null. */
    public final String to;

    /** Creates a mapping from <i>from</i> to <i>to</i>.
     *  @param from the text to be replaced
     *  @param to the replacement text
     *  @throws NullPointerException if either argument is null */
    public StringMapping(String from, String to) {
        // Enforce the documented non-null contract here so a bad
        // mapping fails at construction rather than at first use.
        if (null == from) {
            throw new NullPointerException("from");
        }
        if (null == to) {
            throw new NullPointerException("to");
        }
        this.from = from;
        this.to = to;
    }
}
|
|
@ -242,6 +242,6 @@ class TPairListFactory {
|
|||
}
|
||||
|
||||
|
||||
// DLC strip out [#...] comments; test for nested comments
|
||||
// DLC test for nested comments
|
||||
|
||||
// DLC see Translit directory on ACIP v4 CD-ROM
|
||||
|
|
|
@ -111,14 +111,16 @@ public class TString {
|
|||
text = t;
|
||||
}
|
||||
|
||||
/** Don't instantiate me. */
|
||||
/** Don't instantiate using this constructor. */
|
||||
private TString() { }
|
||||
|
||||
/** Creates a new TString with source text <i>text</i> and type
|
||||
* <i>type</i> being a characterization like {@link #DD}. */
|
||||
public TString(String text, int type) {
|
||||
setType(type);
|
||||
setText(text);
|
||||
setText((TIBETAN_NON_PUNCTUATION == type)
|
||||
? MidLexSubstitution.getFinalValueForTibetanNonPunctuationToken(text)
|
||||
: text);
|
||||
}
|
||||
public String toString() {
|
||||
String typeString = "HUH?????";
|
||||
|
|
Loading…
Reference in a new issue