diff --git a/source/org/thdl/tib/text/ttt/MidLexSubstitution.java b/source/org/thdl/tib/text/ttt/MidLexSubstitution.java new file mode 100644 index 0000000..aae7d9a --- /dev/null +++ b/source/org/thdl/tib/text/ttt/MidLexSubstitution.java @@ -0,0 +1,224 @@ +/* +The contents of this file are subject to the THDL Open Community License +Version 1.0 (the "License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License on the THDL web site +(http://www.thdl.org/). + +Software distributed under the License is distributed on an "AS IS" basis, +WITHOUT WARRANTY OF ANY KIND, either express or implied. See the +License for the specific terms governing rights and limitations under the +License. + +The Initial Developer of this software is the Tibetan and Himalayan Digital +Library (THDL). Portions created by the THDL are Copyright 2003 THDL. +All Rights Reserved. + +Contributor(s): ______________________________________. +*/ + +package org.thdl.tib.text.ttt; + +import org.thdl.util.ThdlOptions; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.StringTokenizer; + +// DLC FIXME: document this. + +/** MidLexSubstitution is a hack that lets the end user clumsily fix + * the EWTS-to-Tibetan and ACIP-to-Tibetan converters without having + * to modify the source code. + * + *
If the converter isn't giving you what you want for some + * tsheg bar, then set up a replacement. + * + *
To do so, set the system property + * org.thdl.tib.text.ttt.ReplacementMap to be a comma-delimited list + * of "x=>y" pairs. For example, if you think BLKU, which parses + * as B+L+KU, should parse as B-L+KU, and you want KAsh to be parsed + * as K+sh because the input operators mistyped it, then set + * org.thdl.tib.text.ttt.ReplacementMap to + * "BLKU=>B-L+KU,KAsh=>K+sh". Note that this will not cause + * B+L+KU to become B-L+KU -- we are doing the replacement during + * lexical analysis of the input file, not during parsing. And it + * will cause SBLKU to become SB-L+KU, which is parsed as S+B-L+KU, + * probably not what you wanted. If you fear such things, you can + * see if they happen by setting the system property + * org.thdl.tib.text.ttt.VerboseReplacementMap to "true", which will + * cause an informational message to be printed on the Java console + * every time a replacement is made. + * + *
Furthermore, you can use the regexp notation + * "^BLKU$=>B-L+KU". Note that regular expressions are not + * supported -- we're just borrowing the notation. + * "^BLKU=>B-L+KU" means that BLKUM and BLKU will both be + * replaced, but SBLKU and SBLKUM will not be. The caret, '^', means + * that we only match if BLKU is at the beginning. The dollar sign, + * '$', means that we only match if the pattern is at the end. + * "BLKU$=>B-L+KU" will cause SBLKU to be replaced, but not BLKUM. + * Note that performance is far better for ^FOO$ than for ^FOO, FOO$, + * or FOO alone. + * + *
Only one substitution is made per tsheg bar. ^FOO$-type + * mappings will be tried first, then ^FOO, then FOO$, then FOO. + * + *
Note that you cannot literally replace FOO with BAR using this + * -- F is not an ACIP character, so the lex will not get far enough + * to use this substitution mechanism. This is not a design flaw -- + * serious errors require user intervention (and our user can use an + * awk script if he or she likes). + * + * @author David Chandler */ +final class MidLexSubstitution { + + private MidLexSubstitution() { throw new Error("not instantiable"); } + + /** substitutions that apply to whole tsheg bars only, + i.e. ^FOO$=>BAR substitutions */ + private static HashMap wholeSubstMap = null; + + /** ^FOO=>BAR (but not ^FOO$=>BAR) substitutions */ + private static ArrayList startSubstMap = null; + + /** FOO$=>BAR (but not ^FOO$=>BAR) substitutions */ + private static ArrayList endSubstMap = null; + + /** FOO=>BAR (but not ^FOO$=>BAR or FOO$=>BAR or + ^FOO=>BAR) substitutions */ + private static ArrayList anywhereSubstMap = null; + + private static boolean verbose = false; + + private static boolean inited = false; + + private static final String ARROW = "=>"; + + /** Reads the system properties and initializes based on them. */ + private static void init() { + inited = true; + + verbose = ThdlOptions.getBooleanOption("org.thdl.tib.text.ttt.VerboseReplacementMap"); + if (verbose) { + System.out.println("You have set org.thdl.tib.text.ttt.VerboseReplacementMap to true. You must be a power user."); + } + + String rm = ThdlOptions.getStringOption("org.thdl.tib.text.ttt.ReplacementMap", null); + if (null != rm) { + StringTokenizer stok = new StringTokenizer(rm, ","); + while (stok.hasMoreElements()) { + String mapping = stok.nextToken(); + String from, to; + int arrowIndex = mapping.indexOf(ARROW); + if (arrowIndex < 0) { + System.err.println("You went to the trouble of setting the property org.thdl.tib.text.ttt.ReplacementMap, but you had a mapping, \"" + mapping + "\", in it without an arrow (" + ARROW + "). 
Aborting."); + System.exit(1); + } + + from = mapping.substring(0, arrowIndex); + to = mapping.substring(arrowIndex + ARROW.length()); + + boolean atStartOnly = false; + boolean atEndOnly = false; + if (from.length() > 0 && from.charAt(0) == '^') { + atStartOnly = true; + from = from.substring(1); + } + if (from.length() > 0 && from.charAt(from.length() - 1) == '$') { + atEndOnly = true; + from = from.substring(0, from.length() - 1); + } + if (from.length() == 0) { + System.err.println("You went to the trouble of setting the property org.thdl.tib.text.ttt.ReplacementMap, but you had a mapping, \"" + mapping + "\", in it from the empty string to something. That's nonsense. Aborting."); + System.exit(1); + } + + if (atStartOnly) { + if (atEndOnly) { + if (null == wholeSubstMap) + wholeSubstMap = new HashMap(2); + wholeSubstMap.put(from, to); + if (verbose) + System.out.println("You have set org.thdl.tib.text.ttt.VerboseReplacementMap to true, so you will want to know that wholeSubstMap maps " + from + " to " + to + "."); + } else { + if (null == startSubstMap) + startSubstMap = new ArrayList(2); + startSubstMap.add(new StringMapping(from, to)); + if (verbose) + System.out.println("You have set org.thdl.tib.text.ttt.VerboseReplacementMap to true, so you will want to know that startSubstMap maps " + from + " to " + to + "."); + } + } else { + if (atEndOnly) { + if (null == endSubstMap) + endSubstMap = new ArrayList(2); + endSubstMap.add(new StringMapping(from, to)); + if (verbose) + System.out.println("You have set org.thdl.tib.text.ttt.VerboseReplacementMap to true, so you will want to know that endSubstMap maps " + from + " to " + to + "."); + } else { + if (null == anywhereSubstMap) + anywhereSubstMap = new ArrayList(2); + anywhereSubstMap.add(new StringMapping(from, to)); + if (verbose) + System.out.println("You have set org.thdl.tib.text.ttt.VerboseReplacementMap to true, so you will want to know that anywhereSubstMap maps " + from + " to " + to + "."); + + } + } 
+ } + } + } + + /** Returns the post-substitution value for tok, most often tok + itself. See the class comment to understand when tok will + change. */ + public static String getFinalValueForTibetanNonPunctuationToken(String tok) { + if (!inited) init(); + String subst = null; + if (null != wholeSubstMap) + subst = (String)wholeSubstMap.get(tok); + if (null == subst && null != startSubstMap) { + for (int i = 0; i < startSubstMap.size(); i++) { + StringMapping sm = (StringMapping)startSubstMap.get(i); + if (tok.startsWith(sm.from)) { + subst = sm.to + tok.substring(sm.from.length()); + break; + } + } + } + if (null == subst && null != endSubstMap) { + for (int i = 0; i < endSubstMap.size(); i++) { + StringMapping sm = (StringMapping)endSubstMap.get(i); + if (tok.endsWith(sm.from)) { + subst = tok.substring(0, tok.length() - sm.from.length()) + sm.to; + break; + } + } + } + if (null == subst && null != anywhereSubstMap) { + for (int i = 0; i < anywhereSubstMap.size(); i++) { + StringMapping sm = (StringMapping)anywhereSubstMap.get(i); + int toki = tok.indexOf(sm.from); + if (toki >= 0) { + subst = tok.substring(0, toki) + sm.to + tok.substring(toki+sm.from.length(), tok.length()); + break; + } + } + } + if (null != subst) { + if (verbose && null != subst) { + System.out.println("Because org.thdl.tib.text.ttt.VerboseReplacementMap is true, you're being notified that " + tok + " is being replaced with " + subst); + } + return subst; + } else { + return tok; + } + } +} + +/** Simple from=>to mapping for non-null Strings. 
*/ +class StringMapping { + /** from is the text to look for; to is the text put in its place. */ public String from, to; + /** Stores both arguments as given; no null check is performed here. */ public StringMapping(String from, String to) { + this.from = from; + this.to = to; + } +} diff --git a/source/org/thdl/tib/text/ttt/TPairListFactory.java b/source/org/thdl/tib/text/ttt/TPairListFactory.java index ef149d4..5895555 100644 --- a/source/org/thdl/tib/text/ttt/TPairListFactory.java +++ b/source/org/thdl/tib/text/ttt/TPairListFactory.java @@ -242,6 +242,6 @@ class TPairListFactory { } -// DLC strip out [#...] comments; test for nested comments +// DLC test for nested comments // DLC see Translit directory on ACIP v4 CD-ROM diff --git a/source/org/thdl/tib/text/ttt/TString.java b/source/org/thdl/tib/text/ttt/TString.java index 4bfc16a..94de609 100644 --- a/source/org/thdl/tib/text/ttt/TString.java +++ b/source/org/thdl/tib/text/ttt/TString.java @@ -111,14 +111,16 @@ public class TString { text = t; } - /** Don't instantiate me. */ + /** Don't instantiate using this constructor. */ private TString() { } /** Creates a new TString with source text text and type * type being a characterization like {@link #DD}. */ public TString(String text, int type) { setType(type); - setText(text); + setText((TIBETAN_NON_PUNCTUATION == type) + ? MidLexSubstitution.getFinalValueForTibetanNonPunctuationToken(text) + : text); } public String toString() { String typeString = "HUH?????";