7198f23361
table exactly and I fear that it makes the ACIP->Tibetan converter code a lot uglier. The TODO(DLC)[EWTS->Tibetan] comments littered throughout are part of the ugliness; they point to the ugliness. If each were addressed, cleanliness could perhaps be achieved. I've largely forgotten exactly what this change does, but it attempts to improve EWTS->Tibetan conversion. The lexer is probably really, really primitive. I concentrate here on converting a single tsheg bar rather than a whole document. Eclipse was used during part of my journey here and some imports were reorganized merely because I could. :) (Eclipse was needed when the usual ant build failed to run a new test EWTSTest. And I wanted its debugger.) Next steps: end-to-end EWTS tests should bring many problems to light. Fix those. Triage all the TODO comments. I don't know that I'll ever really trust the implementation. The tests are valuable, though. A clean implementation of EWTS->Tibetan in Jython might hold enough interest for me; I'd like to learn Python.
224 lines
9.9 KiB
Java
224 lines
9.9 KiB
Java
/*
|
|
The contents of this file are subject to the THDL Open Community License
|
|
Version 1.0 (the "License"); you may not use this file except in compliance
|
|
with the License. You may obtain a copy of the License on the THDL web site
|
|
(http://www.thdl.org/).
|
|
|
|
Software distributed under the License is distributed on an "AS IS" basis,
|
|
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
|
|
License for the specific terms governing rights and limitations under the
|
|
License.
|
|
|
|
The Initial Developer of this software is the Tibetan and Himalayan Digital
|
|
Library (THDL). Portions created by the THDL are Copyright 2003 THDL.
|
|
All Rights Reserved.
|
|
|
|
Contributor(s): ______________________________________.
|
|
*/
|
|
|
|
package org.thdl.tib.text.ttt;
|
|
|
|
import java.util.ArrayList;
|
|
import java.util.HashMap;
|
|
import java.util.StringTokenizer;
|
|
|
|
import org.thdl.util.ThdlOptions;
|
|
|
|
/** MidLexSubstitution is a hack that lets the end user clumsily fix
|
|
* the EWTS-to-Tibetan and ACIP-to-Tibetan converters without having
|
|
* to modify the source code.
|
|
*
|
|
* <p>If the converter isn't giving you what you want for some
|
|
* tsheg bar, then set up a replacement.
|
|
*
|
|
* <p>To do so, set the system property
|
|
* org.thdl.tib.text.ttt.ReplacementMap to be a comma-delimited list
|
|
* of "x=>y" pairs. For example, if you think BLKU, which parses
|
|
* as B+L+KU, should parse as B-L+KU, and you want KAsh to be parsed
|
|
* as K+sh because the input operators mistyped it, then set
|
|
* org.thdl.tib.text.ttt.ReplacementMap to
|
|
* "BLKU=>B-L+KU,KAsh=>K+sh". Note that this will not cause
|
|
* B+L+KU to become B-L+KU -- we are doing the replacement during
|
|
* lexical analysis of the input file, not during parsing. And it
|
|
* will cause SBLKU to become SB-L+KU, which is parsed as S+B-L+KU,
|
|
* probably not what you wanted. If you fear such things, you can
|
|
* see if they happen by setting the system property
|
|
* org.thdl.tib.text.ttt.VerboseReplacementMap to "true", which will
|
|
* cause an informational message to be printed on the Java console
|
|
* every time a replacement is made.
|
|
*
|
|
* <p>Furthermore, you can use the regexp notation
|
|
* "^BLKU$=>B-L+KU". Note that regular expressions are not
|
|
* supported -- we're just borrowing the notation.
|
|
* "^BLKU=>B-L+KU" means that BLKUM and BLKU will both be
|
|
* replaced, but SBLKU and SBLKUM will not be. The caret, '^', means
|
|
* that we only match if BLKU is at the beginning. The dollar sign,
|
|
* '$', means that we only match if the pattern is at the end.
|
|
* "BLKU$=>B-L+KU" will cause SBLKU to be replaced, but not BLKUM.
|
|
* Note that performance is far better for ^FOO$ than for ^FOO, FOO$,
|
|
* or FOO alone.
|
|
*
|
|
* <p>Only one substitution is made per tsheg bar. ^FOO$-type
|
|
* mappings will be tried first, then ^FOO, then FOO$, then FOO.
|
|
*
|
|
* <p>Note that you cannot literally replace FOO with BAR using this
|
|
* -- F is not an ACIP character, so the lex will not get far enough
|
|
* to use this substitution mechanism. This is not a design flaw --
|
|
* serious errors require user intervention (and our user can use an
|
|
* awk script if he or she likes).
|
|
*
|
|
* @author David Chandler */
|
|
final class MidLexSubstitution {
|
|
|
|
private MidLexSubstitution() { throw new Error("not instantiable"); }
|
|
|
|
/** substitutions that apply to whole tsheg bars only,
|
|
i.e. ^FOO$=>BAR substitutions */
|
|
private static HashMap wholeSubstMap = null;
|
|
|
|
/** ^FOO=>BAR (but not ^FOO$=>BAR) substitutions */
|
|
private static ArrayList startSubstMap = null;
|
|
|
|
/** FOO$=>BAR (but not ^FOO$=>BAR) substitutions */
|
|
private static ArrayList endSubstMap = null;
|
|
|
|
/** FOO=>BAR (but not ^FOO$=>BAR or FOO$=>BAR or
|
|
^FOO=>BAR) substitutions */
|
|
private static ArrayList anywhereSubstMap = null;
|
|
|
|
private static boolean verbose = false;
|
|
|
|
private static boolean inited = false;
|
|
|
|
private static final String ARROW = "=>";
|
|
|
|
/** Reads the system properties and initializes based on them. */
|
|
private static void init() {
|
|
inited = true;
|
|
|
|
verbose = ThdlOptions.getBooleanOption("org.thdl.tib.text.ttt.VerboseReplacementMap");
|
|
if (verbose) {
|
|
System.out.println("You have set org.thdl.tib.text.ttt.VerboseReplacementMap to true. You must be a power user.");
|
|
}
|
|
|
|
String rm = ThdlOptions.getStringOption("org.thdl.tib.text.ttt.ReplacementMap", null);
|
|
if (null != rm) {
|
|
StringTokenizer stok = new StringTokenizer(rm, ",");
|
|
while (stok.hasMoreElements()) {
|
|
String mapping = stok.nextToken();
|
|
String from, to;
|
|
int arrowIndex = mapping.indexOf(ARROW);
|
|
if (arrowIndex < 0) {
|
|
System.err.println("You went to the trouble of setting the property org.thdl.tib.text.ttt.ReplacementMap, but you had a mapping, \"" + mapping + "\", in it without an arrow (" + ARROW + "). Aborting.");
|
|
System.exit(1);
|
|
}
|
|
|
|
from = mapping.substring(0, arrowIndex);
|
|
to = mapping.substring(arrowIndex + ARROW.length());
|
|
|
|
boolean atStartOnly = false;
|
|
boolean atEndOnly = false;
|
|
if (from.length() > 0 && from.charAt(0) == '^') {
|
|
atStartOnly = true;
|
|
from = from.substring(1);
|
|
}
|
|
if (from.length() > 0 && from.charAt(from.length() - 1) == '$') {
|
|
atEndOnly = true;
|
|
from = from.substring(0, from.length() - 1);
|
|
}
|
|
if (from.length() == 0) {
|
|
System.err.println("You went to the trouble of setting the property org.thdl.tib.text.ttt.ReplacementMap, but you had a mapping, \"" + mapping + "\", in it from the empty string to something. That's nonsense. Aborting.");
|
|
System.exit(1);
|
|
}
|
|
|
|
if (atStartOnly) {
|
|
if (atEndOnly) {
|
|
if (null == wholeSubstMap)
|
|
wholeSubstMap = new HashMap(2);
|
|
wholeSubstMap.put(from, to);
|
|
if (verbose)
|
|
System.out.println("You have set org.thdl.tib.text.ttt.VerboseReplacementMap to true, so you will want to know that wholeSubstMap maps " + from + " to " + to + ".");
|
|
} else {
|
|
if (null == startSubstMap)
|
|
startSubstMap = new ArrayList(2);
|
|
startSubstMap.add(new StringMapping(from, to));
|
|
if (verbose)
|
|
System.out.println("You have set org.thdl.tib.text.ttt.VerboseReplacementMap to true, so you will want to know that startSubstMap maps " + from + " to " + to + ".");
|
|
}
|
|
} else {
|
|
if (atEndOnly) {
|
|
if (null == endSubstMap)
|
|
endSubstMap = new ArrayList(2);
|
|
endSubstMap.add(new StringMapping(from, to));
|
|
if (verbose)
|
|
System.out.println("You have set org.thdl.tib.text.ttt.VerboseReplacementMap to true, so you will want to know that endSubstMap maps " + from + " to " + to + ".");
|
|
} else {
|
|
if (null == anywhereSubstMap)
|
|
anywhereSubstMap = new ArrayList(2);
|
|
anywhereSubstMap.add(new StringMapping(from, to));
|
|
if (verbose)
|
|
System.out.println("You have set org.thdl.tib.text.ttt.VerboseReplacementMap to true, so you will want to know that anywhereSubstMap maps " + from + " to " + to + ".");
|
|
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/** Returns the post-substitution value for tok, most often tok
|
|
itself. See the class comment to understand when tok will
|
|
change. */
|
|
public static String getFinalValueForTibetanNonPunctuationToken(String tok) {
|
|
if (!inited) init();
|
|
String subst = null;
|
|
if (null != wholeSubstMap)
|
|
subst = (String)wholeSubstMap.get(tok);
|
|
if (null == subst && null != startSubstMap) {
|
|
for (int i = 0; i < startSubstMap.size(); i++) {
|
|
StringMapping sm = (StringMapping)startSubstMap.get(i);
|
|
if (tok.startsWith(sm.from)) {
|
|
subst = sm.to + tok.substring(sm.from.length());
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
if (null == subst && null != endSubstMap) {
|
|
for (int i = 0; i < endSubstMap.size(); i++) {
|
|
StringMapping sm = (StringMapping)endSubstMap.get(i);
|
|
if (tok.endsWith(sm.from)) {
|
|
subst = tok.substring(0, tok.length() - sm.from.length()) + sm.to;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
if (null == subst && null != anywhereSubstMap) {
|
|
for (int i = 0; i < anywhereSubstMap.size(); i++) {
|
|
StringMapping sm = (StringMapping)anywhereSubstMap.get(i);
|
|
int toki = tok.indexOf(sm.from);
|
|
if (toki >= 0) {
|
|
subst = tok.substring(0, toki) + sm.to + tok.substring(toki+sm.from.length(), tok.length());
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
if (null != subst) {
|
|
if (verbose && null != subst) {
|
|
System.out.println("Because org.thdl.tib.text.ttt.VerboseReplacementMap is true, you're being notified that " + tok + " is being replaced with " + subst);
|
|
}
|
|
return subst;
|
|
} else {
|
|
return tok;
|
|
}
|
|
}
|
|
}
|
|
|
|
/** Simple from=>to mapping for non-null Strings. */
class StringMapping {
    /** the text to be replaced */
    public String from;

    /** the text that replaces from */
    public String to;

    /** Creates the mapping from=>to.
        @param from the text to be replaced
        @param to the text that replaces it
        @throws NullPointerException if from or to is null */
    public StringMapping(String from, String to) {
        // Fail here rather than later: a null "to" would otherwise
        // be concatenated into a result as the text "null".
        if (null == from)
            throw new NullPointerException("from");
        if (null == to)
            throw new NullPointerException("to");
        this.from = from;
        this.to = to;
    }
}
|
|
// DLC NOW: defaults: KAsh=>K+sh, A=>?, '=>? (THESE ARE {A} AND {'} ALONE, NOT AS COMPONENTS OF A TSHEG-BAR.)
|
|
|