8ccd68789a
imports. It made two errors, but the compiler found them. I've cvs tagged the tree before doing this, just in case.
575 lines
25 KiB
Java
575 lines
25 KiB
Java
/*
The contents of this file are subject to the THDL Open Community License
Version 1.0 (the "License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License on the THDL web site
(http://www.thdl.org/).

Software distributed under the License is distributed on an "AS IS" basis,
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
License for the specific terms governing rights and limitations under the
License.

The Initial Developer of this software is the Tibetan and Himalayan Digital
Library (THDL). Portions created by the THDL are Copyright 2004 THDL.
All Rights Reserved.

Contributor(s): ______________________________________.
*/
|
|
|
|
// TODO(DLC)[EWTS->Tibetan]: TibetanMachineWeb has duplication of much of this!
|
|
|
|
package org.thdl.tib.text.ttt;
|
|
|
|
import java.util.ArrayList;
|
|
import java.util.HashMap;
|
|
|
|
import org.thdl.tib.text.DuffCode;
|
|
import org.thdl.tib.text.THDLWylieConstants;
|
|
import org.thdl.tib.text.TibTextUtils;
|
|
import org.thdl.tib.text.TibetanMachineWeb;
|
|
import org.thdl.tib.text.tshegbar.UnicodeUtils;
|
|
import org.thdl.util.ThdlDebug;
|
|
|
|
/** A singleton class that should contain (but due to laziness and
|
|
* ignorance probably does not contain) all the traits that make EWTS
|
|
* transliteration different from other (say, ACIP) transliteration
|
|
* schemes. */
|
|
public final class EWTSTraits implements TTraits {
|
|
/** sole instance of this class */
|
|
private static EWTSTraits singleton = null;
|
|
|
|
/** Just a constructor. */
|
|
private EWTSTraits() { }
|
|
|
|
/** */
|
|
public static synchronized EWTSTraits instance() {
|
|
if (null == singleton) {
|
|
singleton = new EWTSTraits();
|
|
}
|
|
return singleton;
|
|
}
|
|
|
|
/** Returns ".". */
|
|
public String disambiguator() { return "."; }
|
|
|
|
/** Returns '.'. */
|
|
public char disambiguatorChar() { return '.'; }
|
|
|
|
// TODO(DLC)[EWTS->Tibetan]: isClearlyIllegal and hasSimpleError are different why?
|
|
public boolean hasSimpleError(TPair p) {
|
|
if (pairHasBadWowel(p)) return true;
|
|
return (("a".equals(p.getLeft()) && null == p.getRight())
|
|
|| ("a".equals(p.getLeft())
|
|
&& null != p.getRight()
|
|
&& TibetanMachineWeb.isWylieVowel(p.getRight()))); // TODO(DLC)[EWTS->Tibetan]: or Unicode wowels? test "a\u0f74" and "a\u0f7e"
|
|
// TODO(DLC)[EWTS->Tibetan]: (a.e) is bad, one of (.a) or (a.) is bad
|
|
}
|
|
|
|
/** {tsh}, the longest consonant, has 3 characters, so this is
|
|
* three. */
|
|
public int maxConsonantLength() { return 3; }
|
|
|
|
/** Wowels can be arbitrarily long via stacking. But each
|
|
* component is no longer, in characters, than this. [~M`] is
|
|
* the current winner. */
|
|
public int maxWowelLength() { return 3; }
|
|
|
|
public boolean isUnicodeConsonant(char ch) {
|
|
return ((ch != '\u0f48' && ch >= '\u0f40' && ch <= '\u0f6a')
|
|
|| (ch != '\u0f98' && ch >= '\u0f90' && ch <= '\u0fbc')
|
|
// NOTE: \u0f88 is questionable, but we want EWTS
|
|
// [\u0f88+kha] to become "\u0f88\u0f91" and this does
|
|
// the trick.
|
|
|| ch == '\u0f88');
|
|
}
|
|
|
|
public boolean isUnicodeWowel(char ch) {
|
|
// TODO(DLC)[EWTS->Tibetan]: what about combiners that combine only with digits? TEST
|
|
return ((ch >= '\u0f71' && ch <= '\u0f84')
|
|
|| '\u0f39' == ch
|
|
|| isUnicodeWowelThatRequiresAChen(ch));
|
|
}
|
|
|
|
// TODO(DLC)[EWTS->Tibetan]: u,e,i,o? If not, document the special treatment in this function's comment
|
|
public boolean isConsonant(String s) {
|
|
if (s.length() == 1 && isUnicodeConsonant(s.charAt(0))) return true;
|
|
if (aVowel().equals(s)) return false; // In EWTS, "a" is both a consonant and a vowel, but we treat it as just a vowel and insert the implied a-chen if you have a TPair ( . a) (TODO(DLC)[EWTS->Tibetan]: right?)
|
|
|
|
// TODO(DLC)[EWTS->Tibetan]: numbers are consonants?
|
|
|
|
// TODO(DLC)[EWTS->Tibetan]: just g for now
|
|
return TibetanMachineWeb.isWylieChar(s);
|
|
}
|
|
|
|
public boolean isWowel(String s) {
|
|
return (getUnicodeForWowel(s) != null);
|
|
}
|
|
|
|
public String aVowel() { return THDLWylieConstants.WYLIE_aVOWEL; }
|
|
|
|
public boolean isPostsuffix(String s) {
|
|
return ("s".equals(s) || "d".equals(s));
|
|
}
|
|
|
|
public boolean isPrefix(String l) {
|
|
return (THDLWylieConstants.ACHUNG.equals(l)
|
|
|| THDLWylieConstants.MA.equals(l)
|
|
|| THDLWylieConstants.BA.equals(l)
|
|
|| THDLWylieConstants.DA.equals(l)
|
|
|| THDLWylieConstants.GA.equals(l));
|
|
}
|
|
|
|
public boolean isSuffix(String l) {
|
|
return (isPrefix(l)
|
|
|| THDLWylieConstants.SA.equals(l)
|
|
|| THDLWylieConstants.NGA.equals(l)
|
|
|| THDLWylieConstants.NA.equals(l)
|
|
|| THDLWylieConstants.LA.equals(l)
|
|
|| THDLWylieConstants.RA.equals(l));
|
|
}
|
|
|
|
/** Returns the best EWTS for l, which is often l but not always
|
|
* thanks to Unicode escapes. NOTE: For "\u0f42", you don't want
|
|
* to return "g" lest "\\u0f42ya " become the wrong thing under
|
|
* EWTS->Unicode. */
|
|
public String getEwtsForConsonant(String l) {
|
|
return helpGetEwts(l);
|
|
}
|
|
|
|
/** Returns the best EWTS for l, which is often l but not always
|
|
* thanks to Unicode escapes. */
|
|
public String getEwtsForOther(String l) {
|
|
return helpGetEwts(l);
|
|
}
|
|
|
|
private String helpGetEwts(String l) {
|
|
if (l.length() == 1
|
|
&& ((l.charAt(0) >= THDLWylieConstants.PUA_MIN
|
|
&& l.charAt(0) <= THDLWylieConstants.PUA_MAX)
|
|
|| 0 <= "\u0F01\u0F09\u0F0A\u0F10\u0F12\u0F13\u0F15\u0F16\u0F17\u0F18\u0F19\u0F1A\u0F1B\u0F1C\u0F1D\u0F1E\u0F1F\u0F2A\u0F2B\u0F2C\u0F2D\u0F2E\u0F2F\u0F30\u0F31\u0F32\u0F33\u0F36\u0F38\u0F86\u0F87\u0F88\u0F89\u0F8A\u0F8B\u0FBE\u0FBF\u0FC0\u0FC1\u0FC2\u0FC3\u0FC4\u0FC5\u0FC6\u0FC7\u0FC8\u0FC9\u0FCA\u0FCB\u0FCC\u0FCF\u5350\u534D".indexOf(l.charAt(0)))) {
|
|
return UnicodeUtils.unicodeCodepointToString(l.charAt(0), false, "\\u", true);
|
|
}
|
|
if (false) { // TODO(dchandler): it's too late in the game to do this. EWTS->TMW is broken for \u0f00, \u0f02, and \u0f03 right now, fix that.
|
|
if ("\u0f02".equals(l)) return "u~M`H"; // too long for a single hash key, see?
|
|
if ("\u0f03".equals(l)) return "u~M`:"; // ditto
|
|
}
|
|
return l;
|
|
}
|
|
|
|
/** Returns l, since this is EWTS's traits class. */
|
|
public String getEwtsForWowel(String l) { return l; }
|
|
|
|
public TTshegBarScanner scanner() { return EWTSTshegBarScanner.instance(); }
|
|
|
|
/** If needle is found in haystack, then haystack without the
|
|
* first instance of needle is returned. Otherwise haystack
|
|
* itself is returned. */
|
|
private static String removeFirstMatch(String haystack, String needle) {
|
|
int ix;
|
|
if ((ix = haystack.indexOf(needle)) >= 0) {
|
|
StringBuffer sb = new StringBuffer(haystack);
|
|
sb.replace(ix, ix + needle.length(), "");
|
|
return sb.toString();
|
|
}
|
|
return haystack;
|
|
}
|
|
|
|
private static HashMap bestEwtsMap = null;
|
|
private static String getBestEwtsForSingleWowel(String wowel) {
|
|
// NOTE: Not MT-safe
|
|
if (null == bestEwtsMap) {
|
|
bestEwtsMap = new HashMap(20);
|
|
// Unicode-escape sequences are handled early. To be
|
|
// correct, we must "unescape" here any Unicode escape to
|
|
// whatever tibwn.ini has. (TODO(dchandler): tibwn.ini
|
|
// has this info, use that instead of duplicating it in
|
|
// this code.)
|
|
bestEwtsMap.put("\u0f18", THDLWylieConstants.U0F18);
|
|
bestEwtsMap.put("\u0f19", THDLWylieConstants.U0F19);
|
|
bestEwtsMap.put("\u0f35", THDLWylieConstants.U0F35);
|
|
bestEwtsMap.put("\u0f37", THDLWylieConstants.U0F37);
|
|
bestEwtsMap.put("\u0f39", THDLWylieConstants.WYLIE_TSA_PHRU);
|
|
bestEwtsMap.put("\u0f3e", THDLWylieConstants.U0F3E);
|
|
bestEwtsMap.put("\u0f3f", THDLWylieConstants.U0F3F);
|
|
bestEwtsMap.put("\u0f84", THDLWylieConstants.U0F84);
|
|
bestEwtsMap.put("\u0f86", THDLWylieConstants.U0F86);
|
|
bestEwtsMap.put("\u0f87", THDLWylieConstants.U0F87);
|
|
bestEwtsMap.put("\u0fc6", THDLWylieConstants.U0FC6);
|
|
|
|
bestEwtsMap.put("\u0f71", THDLWylieConstants.A_VOWEL);
|
|
bestEwtsMap.put("\u0f72", THDLWylieConstants.i_VOWEL);
|
|
bestEwtsMap.put("\u0f74", THDLWylieConstants.u_VOWEL);
|
|
bestEwtsMap.put("\u0f7a", THDLWylieConstants.e_VOWEL);
|
|
bestEwtsMap.put("\u0f7b", THDLWylieConstants.ai_VOWEL);
|
|
bestEwtsMap.put("\u0f7c", THDLWylieConstants.o_VOWEL);
|
|
bestEwtsMap.put("\u0f7d", THDLWylieConstants.au_VOWEL);
|
|
bestEwtsMap.put("\u0f7e", THDLWylieConstants.BINDU);
|
|
bestEwtsMap.put("\u0f80", THDLWylieConstants.reverse_i_VOWEL);
|
|
bestEwtsMap.put("\u0f81", THDLWylieConstants.reverse_I_VOWEL);
|
|
|
|
bestEwtsMap.put("\u0f73", THDLWylieConstants.I_VOWEL); // not in tibwn.ini
|
|
bestEwtsMap.put("\u0f75", THDLWylieConstants.U_VOWEL); // not in tibwn.ini
|
|
}
|
|
String mapping = (String)bestEwtsMap.get(wowel);
|
|
if (null != mapping)
|
|
return mapping;
|
|
else
|
|
return wowel;
|
|
}
|
|
|
|
public void getDuffForWowel(ArrayList duff, DuffCode preceding,
|
|
String wowel)
|
|
throws ConversionException
|
|
{
|
|
boolean preceding_added[] = new boolean[] { false };
|
|
String[] wowels = wowel.split("\\+");
|
|
for (int i = 0; i < wowels.length; i++) {
|
|
getDuffForSingleWowel(duff, preceding,
|
|
getBestEwtsForSingleWowel(wowels[i]),
|
|
preceding_added);
|
|
}
|
|
}
|
|
|
|
/** Wowels can stack. This works on a single wowel. */
|
|
private void getDuffForSingleWowel(ArrayList duff, DuffCode preceding,
|
|
String wowel, boolean preceding_added[])
|
|
throws ConversionException
|
|
{
|
|
if (wowel.equals(THDLWylieConstants.WYLIE_aVOWEL)) { // TODO(dchandler): ka+o deserves at least a warning. kaM, though, does not. Do we handle it?
|
|
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.WYLIE_aVOWEL, preceding_added);
|
|
wowel = "";
|
|
} else {
|
|
// We call these combining because the TMW font treats
|
|
// such a vowel specially depending on the preceding glyph
|
|
// with which it combines.
|
|
String combining_wowels[] = new String[] {
|
|
// order does not matter
|
|
THDLWylieConstants.U_VOWEL,
|
|
THDLWylieConstants.reverse_I_VOWEL,
|
|
THDLWylieConstants.I_VOWEL,
|
|
THDLWylieConstants.A_VOWEL,
|
|
THDLWylieConstants.ai_VOWEL,
|
|
THDLWylieConstants.reverse_i_VOWEL,
|
|
THDLWylieConstants.i_VOWEL,
|
|
THDLWylieConstants.e_VOWEL,
|
|
THDLWylieConstants.o_VOWEL,
|
|
THDLWylieConstants.au_VOWEL,
|
|
THDLWylieConstants.u_VOWEL
|
|
};
|
|
for (int i = 0; i < combining_wowels.length; i++) {
|
|
if (wowel.equals(combining_wowels[i])) {
|
|
TibTextUtils.getVowel(duff, preceding, combining_wowels[i],
|
|
preceding_added);
|
|
wowel = removeFirstMatch(wowel, combining_wowels[i]);
|
|
}
|
|
}
|
|
}
|
|
// FIXME: Use TMW9.61, the "o'i" special combination, when appropriate.
|
|
|
|
if (wowel.equals(THDLWylieConstants.BINDU)) {
|
|
DuffCode last = null;
|
|
if (!preceding_added[0]) {
|
|
last = preceding;
|
|
} else if (duff.size() > 0) {
|
|
last = (DuffCode)duff.get(duff.size() - 1);
|
|
duff.remove(duff.size() - 1); // getBindu will add it back...
|
|
// TODO(DLC)[EWTS->Tibetan]: is this okay???? when is a bindu okay to be alone???
|
|
}
|
|
TibTextUtils.getBindu(duff, last);
|
|
preceding_added[0] = true;
|
|
wowel = removeFirstMatch(wowel, THDLWylieConstants.BINDU);
|
|
}
|
|
|
|
if (!preceding_added[0]) {
|
|
duff.add(preceding);
|
|
preceding_added[0] = true;
|
|
}
|
|
|
|
String standalone_wowels[] = new String[] {
|
|
// order does not matter
|
|
|
|
// This likely won't look good! TMW has glyphs for [va]
|
|
// and [fa], so use that transliteration if you care, not
|
|
// [ph^] or [b^].
|
|
THDLWylieConstants.WYLIE_TSA_PHRU,
|
|
THDLWylieConstants.U0F35,
|
|
THDLWylieConstants.U0F37,
|
|
THDLWylieConstants.U0F7F,
|
|
THDLWylieConstants.U0F82,
|
|
THDLWylieConstants.U0F83,
|
|
THDLWylieConstants.U0F86,
|
|
THDLWylieConstants.U0F87,
|
|
THDLWylieConstants.U0F19,
|
|
THDLWylieConstants.U0F18,
|
|
THDLWylieConstants.U0FC6,
|
|
THDLWylieConstants.U0F3E,
|
|
THDLWylieConstants.U0F3F,
|
|
THDLWylieConstants.U0F84,
|
|
};
|
|
for (int i = 0; i < standalone_wowels.length; i++) {
|
|
if (wowel.equals(standalone_wowels[i])) {
|
|
ThdlDebug.verify(preceding_added[0]);
|
|
duff.add(TibetanMachineWeb.getGlyph(standalone_wowels[i]));
|
|
wowel = removeFirstMatch(wowel, standalone_wowels[i]);
|
|
}
|
|
}
|
|
|
|
// We verify that no part of wowel is discarded.
|
|
if (wowel.length() > 0) {
|
|
throw new ConversionException(
|
|
"Full wowel was not handled, there remains: " + wowel);
|
|
}
|
|
|
|
// TODO(DLC)[EWTS->Tibetan]:: are bindus are screwed up in the
|
|
// unicode output? i see (with tmuni font) lone bindus
|
|
// without glyphs to stack on
|
|
}
|
|
|
|
public String getUnicodeForWowel(String wowel) {
|
|
if (THDLWylieConstants.WYLIE_aVOWEL.equals(wowel))
|
|
return "";
|
|
return helpGetUnicodeForWowel(wowel);
|
|
}
|
|
|
|
private String helpGetUnicodeForWowel(String wowel) {
|
|
if (THDLWylieConstants.WYLIE_aVOWEL.equals(wowel))
|
|
return null; // ko+a+e is invalid, e.g.
|
|
if (wowel.length() == 1 && isUnicodeWowel(wowel.charAt(0))) {
|
|
if ("\u0f75".equals(wowel))
|
|
return "\u0f71\u0f74"; // \u0f75 is discouraged
|
|
if ("\u0f81".equals(wowel))
|
|
return "\u0f71\u0f80"; // \u0f81 is discouraged
|
|
if ("\u0f73".equals(wowel))
|
|
return "\u0f71\u0f72"; // \u0f73 is discouraged
|
|
if ("\u0f79".equals(wowel))
|
|
return "\u0fb3\u0f81"; // \u0f79 is discouraged
|
|
if ("\u0f78".equals(wowel))
|
|
return "\u0fb3\u0f80"; // \u0f78 is discouraged
|
|
return wowel;
|
|
}
|
|
// handle o+u, etc.
|
|
int i;
|
|
if ((i = wowel.indexOf("+")) >= 0) {
|
|
// recurse.
|
|
|
|
// Chris Fynn says \u0f7c\u0f7c is different from \u0f7d.
|
|
// So o+o is not the same as au. e+e is not the same as
|
|
// ai.
|
|
String left = helpGetUnicodeForWowel(wowel.substring(0, i));
|
|
String right = helpGetUnicodeForWowel(wowel.substring(i + 1));
|
|
if (null != left && null != right)
|
|
return left + right;
|
|
else
|
|
return null;
|
|
} else {
|
|
// Handle vowels. (TODO(dchandler): tibwn.ini has this
|
|
// info, use that instead of duplicating it in this code.)
|
|
if (THDLWylieConstants.i_VOWEL.equals(wowel)) return "\u0f72";
|
|
if (THDLWylieConstants.u_VOWEL.equals(wowel)) return "\u0f74";
|
|
if (THDLWylieConstants.A_VOWEL.equals(wowel)) return "\u0f71";
|
|
if (THDLWylieConstants.U_VOWEL.equals(wowel)) return "\u0f71\u0f74"; // \u0f75 is discouraged
|
|
if (THDLWylieConstants.e_VOWEL.equals(wowel)) return "\u0f7a";
|
|
if (THDLWylieConstants.o_VOWEL.equals(wowel)) return "\u0f7c";
|
|
if (THDLWylieConstants.reverse_i_VOWEL.equals(wowel)) return "\u0f80";
|
|
if (THDLWylieConstants.ai_VOWEL.equals(wowel)) return "\u0f7b";
|
|
if (THDLWylieConstants.au_VOWEL.equals(wowel)) return "\u0f7d";
|
|
if (THDLWylieConstants.reverse_I_VOWEL.equals(wowel)) return "\u0f71\u0f80"; // \u0f81 is discouraged
|
|
if (THDLWylieConstants.I_VOWEL.equals(wowel)) return "\u0f71\u0f72"; // \u0f73 is discouraged
|
|
|
|
// TODO(DLC)[EWTS->Tibetan]: what about \u0f3e and \u0f3f!!!!
|
|
if (THDLWylieConstants.BINDU.equals(wowel)) return "\u0f7e";
|
|
if (THDLWylieConstants.U0F7F.equals(wowel)) return "\u0f7f";
|
|
if (THDLWylieConstants.U0F84.equals(wowel)) return "\u0f84";
|
|
if (THDLWylieConstants.U0F83.equals(wowel)) return "\u0f83";
|
|
if (THDLWylieConstants.U0F82.equals(wowel)) return "\u0f82";
|
|
if (THDLWylieConstants.U0F37.equals(wowel)) return "\u0f37";
|
|
if (THDLWylieConstants.U0F35.equals(wowel)) return "\u0f35";
|
|
if (THDLWylieConstants.WYLIE_TSA_PHRU.equals(wowel)) return "\u0f39";
|
|
|
|
return null;
|
|
}
|
|
}
|
|
|
|
public String getUnicodeFor(String l, boolean subscribed) {
|
|
|
|
// First, handle "\u0f71\u0f84\u0f86", "", "\u0f74", etc.
|
|
{
|
|
boolean already_done = true;
|
|
for (int i = 0; i < l.length(); i++) {
|
|
char ch = l.charAt(i);
|
|
if ((ch < '\u0f00' || ch > '\u0fff')
|
|
&& THDLWylieConstants.SAUVASTIKA != ch
|
|
&& THDLWylieConstants.SWASTIKA != ch
|
|
&& (ch < THDLWylieConstants.PUA_MIN || ch > THDLWylieConstants.PUA_MAX) // TODO(DLC)[EWTS->Tibetan]: give a warning, though? PUA isn't specified by the unicode standard after all.
|
|
&& '\n' != ch
|
|
&& '\r' != ch) {
|
|
// TODO(DLC)[EWTS->Tibetan]: Is this the place
|
|
// where we want to interpret how newlines work???
|
|
already_done = false;
|
|
break;
|
|
}
|
|
}
|
|
if (already_done)
|
|
return l; // TODO(dchandler): \u0fff etc. are not valid code points, though. Do we handle that well?
|
|
}
|
|
|
|
// TODO(DLC)[EWTS->Tibetan]:: vowels !subscribed could mean (a . i)???? I doubt it but test "i"->"\u0f68\u0f72" etc.
|
|
|
|
if (subscribed) {
|
|
if ("R".equals(l)) return "\u0fbc";
|
|
if ("Y".equals(l)) return "\u0fbb";
|
|
if ("W".equals(l)) return "\u0fba";
|
|
|
|
// TODO(dchandler): use tibwn.ini -- it has this same info.
|
|
|
|
// g+h etc. should not be inputs to this function, but for
|
|
// completeness they're here.
|
|
if ("k".equals(l)) return "\u0F90";
|
|
if ("kh".equals(l)) return "\u0F91";
|
|
if ("g".equals(l)) return "\u0F92";
|
|
if ("g+h".equals(l)) return "\u0F93";
|
|
if ("ng".equals(l)) return "\u0F94";
|
|
if ("c".equals(l)) return "\u0F95";
|
|
if ("ch".equals(l)) return "\u0F96";
|
|
if ("j".equals(l)) return "\u0F97";
|
|
if ("ny".equals(l)) return "\u0F99";
|
|
if ("T".equals(l)) return "\u0F9A";
|
|
if ("Th".equals(l)) return "\u0F9B";
|
|
if ("D".equals(l)) return "\u0F9C";
|
|
if ("D+h".equals(l)) return "\u0F9D";
|
|
if ("N".equals(l)) return "\u0F9E";
|
|
if ("t".equals(l)) return "\u0F9F";
|
|
if ("th".equals(l)) return "\u0FA0";
|
|
if ("d".equals(l)) return "\u0FA1";
|
|
if ("d+h".equals(l)) return "\u0FA2";
|
|
if ("n".equals(l)) return "\u0FA3";
|
|
if ("p".equals(l)) return "\u0FA4";
|
|
if ("ph".equals(l)) return "\u0FA5";
|
|
if ("b".equals(l)) return "\u0FA6";
|
|
if ("b+h".equals(l)) return "\u0FA7";
|
|
if ("m".equals(l)) return "\u0FA8";
|
|
if ("ts".equals(l)) return "\u0FA9";
|
|
if ("tsh".equals(l)) return "\u0FAA";
|
|
if ("dz".equals(l)) return "\u0FAB";
|
|
if ("dz+h".equals(l)) return "\u0FAC";
|
|
if ("w".equals(l)) return "\u0FAD"; // TODO(DLC)[EWTS->Tibetan]:: ???
|
|
if ("zh".equals(l)) return "\u0FAE";
|
|
if ("z".equals(l)) return "\u0FAF";
|
|
if ("'".equals(l)) return "\u0FB0";
|
|
if ("y".equals(l)) return "\u0FB1";
|
|
if ("r".equals(l)) return "\u0FB2";
|
|
if ("l".equals(l)) return "\u0FB3";
|
|
if ("sh".equals(l)) return "\u0FB4";
|
|
if ("Sh".equals(l)) return "\u0FB5";
|
|
if ("s".equals(l)) return "\u0FB6";
|
|
if ("h".equals(l)) return "\u0FB7";
|
|
if ("a".equals(l)) return "\u0FB8";
|
|
if ("k+Sh".equals(l)) return "\u0FB9";
|
|
|
|
if ("f".equals(l)) return "\u0FA5\u0F39";
|
|
if ("v".equals(l)) return "\u0FA6\u0F39";
|
|
return null;
|
|
} else {
|
|
if ("R".equals(l)) return "\u0f6a";
|
|
if ("Y".equals(l)) return "\u0f61";
|
|
if ("W".equals(l)) return "\u0f5d";
|
|
if ("//".equals(l)) return "\u0f0e";
|
|
|
|
if (!TibetanMachineWeb.isKnownHashKey(l)) {
|
|
// System.err.println("Getting unicode for the following is hard: '"
|
|
// + l + "' (pretty string: '"
|
|
// + UnicodeUtils.unicodeStringToPrettyString(l)
|
|
// + "'");
|
|
ThdlDebug.noteIffyCode();
|
|
return null;
|
|
}
|
|
String s = TibetanMachineWeb.getUnicodeForWylieForGlyph(l);
|
|
if (null == s)
|
|
ThdlDebug.noteIffyCode();
|
|
return s;
|
|
}
|
|
}
|
|
|
|
public String shortTranslitName() { return "EWTS"; }
|
|
|
|
private boolean pairHasBadWowel(TPair p) {
|
|
return (null != p.getRight()
|
|
&& !disambiguator().equals(p.getRight())
|
|
&& !"+".equals(p.getRight())
|
|
&& null == getUnicodeForWowel(p.getRight()));
|
|
}
|
|
public boolean isClearlyIllegal(TPair p) {
|
|
if (pairHasBadWowel(p)) return true;
|
|
if (p.getLeft() == null
|
|
&& (p.getRight() == null ||
|
|
(!disambiguator().equals(p.getRight())
|
|
&& !isWowel(p.getRight()))))
|
|
return true;
|
|
if ("+".equals(p.getLeft()))
|
|
return true;
|
|
if (p.getLeft() != null && isWowel(p.getLeft())
|
|
&& !aVowel().equals(p.getLeft())) // achen
|
|
return true;
|
|
return false;
|
|
}
|
|
|
|
public TPairList[] breakTshegBarIntoChunks(String tt, boolean sh) {
|
|
if (sh) throw new IllegalArgumentException("Don't do that, silly!");
|
|
try {
|
|
return TPairListFactory.breakEWTSIntoChunks(tt);
|
|
} catch (StackOverflowError e) {
|
|
throw new IllegalArgumentException("Input too large[1]: " + tt);
|
|
} catch (OutOfMemoryError e) {
|
|
throw new IllegalArgumentException("Input too large[2]: " + tt);
|
|
}
|
|
}
|
|
|
|
public boolean isACIP() { return false; }
|
|
|
|
public boolean vowelAloneImpliesAChen() { return true; }
|
|
|
|
public boolean vowelsMayStack() { return true; }
|
|
|
|
public boolean isWowelThatRequiresAChen(String s) {
|
|
// TODO(DLC)[EWTS->Tibetan]: not sure why we pick this subset.
|
|
// Why don't we use a negative set of regular vowels like "i",
|
|
// "o", etc.?
|
|
return ((s.length() == 1
|
|
&& (isUnicodeWowelThatRequiresAChen(s.charAt(0))))
|
|
|| THDLWylieConstants.BINDU.equals(s)
|
|
|| THDLWylieConstants.U0F35.equals(s)
|
|
|| THDLWylieConstants.U0F37.equals(s)
|
|
|| THDLWylieConstants.U0F7F.equals(s)
|
|
|| THDLWylieConstants.U0F82.equals(s)
|
|
|| THDLWylieConstants.U0F83.equals(s)
|
|
|| THDLWylieConstants.U0F84.equals(s)
|
|
|| THDLWylieConstants.WYLIE_TSA_PHRU.equals(s));
|
|
}
|
|
|
|
public boolean isUnicodeWowelThatRequiresAChen(char ch) {
|
|
// TODO(DLC)[EWTS->Tibetan]: ask if 18 19 3e 3f combine only with digits
|
|
return ("\u0f39\u0f35\u0f37\u0f18\u0f19\u0f3e\u0f3f\u0f86\u0f87\u0fc6".indexOf(ch) >= 0);
|
|
}
|
|
|
|
public boolean couldBeValidStack(TPairList pl) {
|
|
StringBuffer hashKey = new StringBuffer();
|
|
boolean allHavePlus = true;
|
|
for (int i = 0; i < pl.size(); i++) {
|
|
if (i + 1 < pl.size() && !"+".equals(pl.get(i).getRight()))
|
|
allHavePlus = false;
|
|
if (0 != hashKey.length())
|
|
hashKey.append('-');
|
|
hashKey.append(pl.get(i).getLeft());
|
|
}
|
|
return (allHavePlus
|
|
|| TibetanMachineWeb.hasGlyph(hashKey.toString())); // TODO(DLC)[EWTS->Tibetan]: test with smra and tsma and bdgya
|
|
}
|
|
|
|
public boolean stackingMustBeExplicit() { return true; }
|
|
|
|
public String U0F7F() { return THDLWylieConstants.U0F7F; }
|
|
|
|
public String U0F35() { return THDLWylieConstants.U0F35; }
|
|
|
|
public String U0F37() { return THDLWylieConstants.U0F37; }
|
|
}
|