TMW->ACIP is much improved. V and W were confused, # and * were

confused; many glyphs that should have yielded errors were not.

I've added a test case that transforms every TMW glyph (save the one with
no TM mapping) to ACIP.  I hand-checked that the output was correct.

ACIP->TMW is fixed for # and *.  I never noticed it, but each needed an
extra swoosh (U+0F05).

Round-tripping would be good, as would testing real-world use of
TMW->ACIP.
This commit is contained in:
dchandler 2004-04-14 05:44:51 +00:00
parent 244a9d1370
commit 1bfd3772e6
10 changed files with 1110 additions and 85 deletions

View file

@ -112,17 +112,28 @@ public class TGCPair implements THDLWylieConstants {
public String getACIP() {
return getACIP(null);
}
/** Like {@link #getWylie(String)} but for ACIP transliteration, not EWTS. */
/** Like {@link #getWylie(String)} but for ACIP transliteration,
not EWTS. */
public String getACIP(String previousTranslitIfAppendaged) {
// DLC FIXME: has the EWTS change affected Manipulate.acipToWylie?
StringBuffer b = new StringBuffer();
if (consonantWylie != null) {
String consonantACIP
= org.thdl.tib.text.ttt.ACIPRules.getACIPForEWTS(consonantWylie);
= null;
if ("w".equals(consonantWylie)
&& (SANSKRIT_WITHOUT_VOWEL == classification
|| SANSKRIT_WITH_VOWEL == classification))
consonantACIP = "V";
else
consonantACIP
= org.thdl.tib.text.ttt.ACIPRules.getACIPForEWTS(consonantWylie);
if (null == consonantACIP) {
return TibetanMachineWeb.getTMWToACIPErrorString("glyph with THDL Extended Wylie " + consonantWylie);
if (null != consonantWylie && consonantWylie.startsWith("R+"))
return TibetanMachineWeb.getTMWToACIPErrorString("glyph with THDL Extended Wylie " + consonantWylie, " because the ACIP R+... could imply the short superscribed form, but this most likely intends the full form (i.e., Unicode character U+0F6A)");
return TibetanMachineWeb.getTMWToACIPErrorString("glyph with THDL Extended Wylie " + consonantWylie, "");
} else {
// Think of pa'am... we want 'am, not 'm; 'ang, not 'ng. But we want 'ur, not 'uar, 'is, not 'ias.
// Think of pa'am... we want 'am, not 'm; 'ang, not
// 'ng. But we want 'ur, not 'uar, 'is, not 'ias.
if (null != previousTranslitIfAppendaged
&& "'".equals(previousTranslitIfAppendaged)) {
b.append("A");
@ -140,7 +151,7 @@ public class TGCPair implements THDLWylieConstants {
String vowelACIP
= org.thdl.tib.text.ttt.ACIPRules.getACIPForEWTS(vowelWylie);
if (null == vowelACIP) {
return TibetanMachineWeb.getTMWToACIPErrorString("glyph with THDL Extended Wylie " + vowelWylie);
return TibetanMachineWeb.getTMWToACIPErrorString("glyph with THDL Extended Wylie " + vowelWylie, "");
} else {
b.append(vowelACIP);
}

View file

@ -900,10 +900,13 @@ public class TibTextUtils implements THDLWylieConstants {
/** Returns "a"/"A", unless wylie (which really is EWTS, not ACIP)
is already "a". */
private static String aVowelToUseAfter(boolean EWTSNotACIP, String wylie) {
if (wylie.equals(ACHEN))
return ""; // it's a, not aa, for achen alone.
else
return (EWTSNotACIP) ? WYLIE_aVOWEL : "A";
if (wylie.equals(ACHEN) && EWTSNotACIP) {
/* it's EWTS{a}, not EWTS{aa}, for achen alone. But it's
ACIP{AA}. */
return "";
} else
return ((EWTSNotACIP)
? WYLIE_aVOWEL : "A" /* hard-coded ACIP constant */);
}
private static String unambiguousPostAVowelTranslit(boolean EWTSNotACIP,
@ -929,7 +932,7 @@ public class TibTextUtils implements THDLWylieConstants {
* EWTSNotACIP is true, or the ACIP otherwise.
* @param EWTSNotACIP true if you want THDL Extended Wylie, false if
* you want ACIP
* @param dcs an array of glyphs
* @param dcs an array of TMW glyphs
* @param noSuch an array which will not be touched if this is
* successful; however, if there is no THDL Extended Wylie/ACIP
* corresponding to these glyphs, then noSuch[0] will be set to true
@ -959,9 +962,9 @@ public class TibTextUtils implements THDLWylieConstants {
// DLC FIXME: {H}, U+0F7F, is part of a grapheme cluster!
// David Chapman and I both need a comprehensive list of these
// guys. Get it from Unicode 4.0 spec?
/** Scans the glyphs in glyphList and creates the returned list of
grapheme clusters based on them. A grapheme cluster is a
consonant or consonant stack with optional adornment or a
/** Scans the TMW glyphs in glyphList and creates the returned
list of grapheme clusters based on them. A grapheme cluster
is a consonant or consonant stack with optional adornment or a
number (possibly super- or subscribed) or some other glyph
alone. */
private static TGCList breakTshegBarIntoGraphemeClusters(java.util.List glyphList,
@ -986,7 +989,12 @@ public class TibTextUtils implements THDLWylieConstants {
String wylie = TibetanMachineWeb.getWylieForGlyph(dc, noSuchWylie);
boolean buildingUpSanskritNext = false;
if ((buildingUpSanskritNext
= TibetanMachineWeb.isWylieSanskritConsonantStack(wylie))
= (TibetanMachineWeb.isWylieSanskritConsonantStack(wylie)
||
/* U+0FAD, which should become ACIP "V", not "W",
though the EWTS is "w" just as it is for
TMW(fontNum==1).53: */
(8 == dc.getFontNum() && 69 == dc.getCharNum())))
|| TibetanMachineWeb.isWylieTibetanConsonantOrConsonantStack(wylie)) {
if (buildingUpVowel.length() > 0 || null != nonVowelWylie) {
gcs.add(new TGCPair(nonVowelWylie,
@ -1612,7 +1620,7 @@ public class TibTextUtils implements THDLWylieConstants {
ArrayList glyphList = new ArrayList();
StringBuffer translitBuffer = new StringBuffer();
// DLC FIXME: " " should become " ", and test with ACIP # and *.
// DLC FIXME: " " should become " " for ACIP
for (int i=0; i<dcs.length; i++) {
char ch = dcs[i].getCharacter();
int k = dcs[i].getCharNum();
@ -1650,13 +1658,18 @@ public class TibTextUtils implements THDLWylieConstants {
((i+1<dcs.length)
? dcs[i+1]
: null),
((i+2<dcs.length)
? dcs[i+2]
: null),
noSuch,
howManyConsumed);
if (howManyConsumed[0] == 1) {
// nothing to do
} else {
ThdlDebug.verify(howManyConsumed[0] == 2);
} else if (howManyConsumed[0] == 2) {
++i;
} else {
ThdlDebug.verify(howManyConsumed[0] == 3);
++i; ++i;
}
}
if (TibetanMachineWeb.isWyliePunc(wylie)
@ -1683,8 +1696,9 @@ public class TibTextUtils implements THDLWylieConstants {
warnings.append("The stretch of Tibetan ended without final punctuation.");
}
if (translitBuffer.length() > 0)
if (translitBuffer.length() > 0) {
return translitBuffer.toString();
}
else
return null;
}

View file

@ -966,9 +966,12 @@ public static boolean isWylieTibetanConsonantOrConsonantStack(String s) {
}
/**
* Returns true if and only if s is the THDL Extended Wylie for a
* Sanskrit multi-consonant stack.
*/
* Returns true if and only if s is necessarily the THDL Extended Wylie
* for a Sanskrit (non-Tibetan, to be more correct) multi-consonant
* stack. If s is "w", then it might be the EWTS for TMW7.69, and that
* glyph is only used in non-Tibetan stacks, but "w" also stands for
* TMW.53, which is Tibetan, so this will return false for such a
* glyph. */
public static boolean isWylieSanskritConsonantStack(String s) {
return sanskritStackSet.contains(s);
}
@ -1909,11 +1912,18 @@ public static String wylieForGlyph(String hashKey) {
return sb.toString();
}
// DLC DOC
/** Returns the ACIP transliteration for a glyph with hash key
hashKey, or returns null if there is none. */
private static String acipForGlyph(String hashKey) {
String ACIP // DLC FIXME: test this.
= org.thdl.tib.scanner.Manipulate.wylieToAcip(hashKey);
return ACIP;
if (1 == hashKey.length()
// ~X is a special case because the EWTS is 2 characters in
// length
|| "~X".equals(hashKey)) // hard-coded EWTS value
return org.thdl.tib.text.ttt.ACIPRules.getACIPForEWTS(hashKey);
else
// else we are not able to use it because it's not smart
// about stacks (e.g., W+W)
return org.thdl.tib.scanner.Manipulate.wylieToAcip(hashKey);
}
/** Error that appears in a document when some TMW cannot be
@ -1927,15 +1937,15 @@ private static String getTMWToWylieErrorString(DuffCode dc) {
}
/** Error that appears in a document when some TMW cannot be
* transcribed in ACIP. This error message is
* documented in www/htdocs/TMW_RTF_TO_THDL_WYLIE.html (DLC NOT YET), so change
* them both when you change this. */
static String getTMWToACIPErrorString(String it) {
return "[# JSKAD_TMW_TO_ACIP_ERROR_NO_SUCH_ACIP: Cannot convert " + it + " to ACIP. Please transcribe this yourself.]";
* transcribed in ACIP. This error message is documented in
* www/htdocs/TMW_or_TM_To_X_Converters.html, so change them both
* when you change this. */
static String getTMWToACIPErrorString(String it, String explanation) {
return "[# JSKAD_TMW_TO_ACIP_ERROR_NO_SUCH_ACIP: Cannot convert " + it + " to ACIP" + explanation + ". Please transcribe this yourself.]";
}
private static String getTMWToACIPErrorString(DuffCode dc) {
return getTMWToACIPErrorString(dc.toString(true));
private static String getTMWToACIPErrorString(DuffCode dc, String explanation) {
return getTMWToACIPErrorString(dc.toString(true), explanation);
}
/**
@ -1979,65 +1989,103 @@ public static String getWylieForGlyph(DuffCode dc, boolean noSuchWylie[]) {
}
/** Returns ACIP transliteration or an error message stating why no
ACIP transliteration exists for the sole glyph dc or the two
glyphs dc and optionalNextDC as a whole. noSuchACIP[0] will be
set (to true) if and only if there is no ACIP representation for
dc; in that case, an error message is returned rather than valid
ACIP. optionalNextDC should be null if there is no context
information available (such as if dc is the last DuffCode being
converted from TMW to ACIP) or the DuffCode following dc
otherwise. If the ACIP (or error message) returned captures both
dc and the nonnull optionalNextDC, then howManyGlyphsUsed[0] will
be set to 2, otherwise it will be set to 1.
ACIP transliteration exists for one, two, or three TMW glyphs.
This gobbles up three TMW glyphs when and only when "#" is
returned; this gobbles up two TMW glyphs when and only when "@" is
returned; this gobbles up one TMW glyph otherwise. The number
gobbled is stored into howManyGlyphsUsed[0]. Always pass in as
many glyphs as possible.
<p>noSuchACIP[0] will be set (to true) if and only if there is no
ACIP representation; in that case, an error message is returned
rather than valid ACIP. dc2 and/or dc3 should be null if there is
no context information available (i.e., if dc1 or dc2 is the last
DuffCode being converted from TMW to ACIP). Otherwise, dc2 should
be the DuffCode following dc1 and dc3 should be the DuffCode
following dc2. If the ACIP (or error message) returned captures
both dc1 and the (nonnull) dc2 and the (nonnull) dc3, then
howManyGlyphsUsed[0] will be set to 3. If the ACIP (or error
message) returned captures both dc1 and the nonnull dc2, then
howManyGlyphsUsed[0] will be set to 2. Otherwise it will be set
to 1.
<p>This would be more straightforward if it were not the case that
a TMW-&gt;ACIP conversion requires context information in the case
of U+0F04 and U+0F05. Because it does, two DuffCodes, not one,
of U+0F04 and U+0F05. Because it does, three DuffCodes, not one,
must be passed in whenever possible.
<p>We opt to treat a lone U+0F05 as an error in TMW-&gt;ACIP
conversions rather than return the pseudo-ACIP Unicode character
escape for U+0F05. After all, the conversion is TMW-&gt;ACIP, not
TMW-&gt;pseudo-ACIP.
<p>We opt to treat a lone U+0F05 or U+0F04 as an error in
TMW-&gt;ACIP conversions rather than return the pseudo-ACIP
Unicode character escape. After all, the conversion is
TMW-&gt;ACIP, not TMW-&gt;pseudo-ACIP.
@return error message or valid ACIP, never pseudo-ACIP like
Unicode character escapes
@param dc the leftmost DuffCode if optionalNextDC is nonnull, or
the sole DuffCode
@param optionalNextDC null if dc is the last (rightmost) DuffCode
in the sequence, or the DuffCode following dc. If you pass in dc
equal to the DuffCode for U+0F04, and optionalNextDC null, then
"*" will be returned, so don't leave this out unless dc is the
rightmost DuffCode.
@param dc1 the leftmost TMW DuffCode if dc2 is nonnull,
or the sole TMW DuffCode
@param dc2 null if dc1 is the last (rightmost) TMW DuffCode in the
sequence, or the TMW DuffCode following dc1. If you pass in dc1
equal to the TMW DuffCode for U+0F04, and dc2 null, then "*" will
be returned, so don't leave this out unless dc1 is the rightmost
TMW DuffCode.
@param dc3 null if dc2 is null or is the last (rightmost) TMW
DuffCode in the sequence, or the TMW DuffCode following dc2
otherwise.
@param noSuchACIP an array whose first element will be set to true
if and only if an error message is returned instead of valid ACIP;
the first element is never set to false, so nominally caller will
initialize the first element to false
@param howManyGlyphsUsed an array whose first element will be set
to 2 if valid ACIP that describes both dc and optionalNextDC is
returned, or 1 otherwise */
public static String getACIPForGlyph(DuffCode dc,
DuffCode optionalNextDC,
to 3 if valid ACIP that describes dc1, dc2, and dc3 is returned, to
2 if valid ACIP that describes both dc1 and dc2 is returned, or to
1 otherwise */
public static String getACIPForGlyph(DuffCode dc1,
DuffCode dc2,
DuffCode dc3,
boolean noSuchACIP[],
int howManyGlyphsUsed[]) {
String hashKey = getHashKeyForGlyph(dc);
// DLC FIXME: TMW.53 is probably going to come out all wrong (VA
// vs. WA) from this function, but
// ACIPRules.getACIPForEWTS(String) seems to come through... will
// it always?
String hashKey = getHashKeyForGlyph(dc1);
if (null != hashKey && hashKey.equals("@")) { // hard-coded EWTS value
String nextHashKey
= ((null == optionalNextDC)
? null : getHashKeyForGlyph(optionalNextDC));
= ((null == dc2)
? null : getHashKeyForGlyph(dc2));
if (null != nextHashKey && nextHashKey.equals("#")) { // hard-coded EWTS value
String nextNextHashKey
= ((null == dc3)
? null : getHashKeyForGlyph(dc3));
if (null != nextNextHashKey && nextNextHashKey.equals("#")) { // hard-coded EWTS value
howManyGlyphsUsed[0] = 3;
return "#"; // hard-coded ACIP value
}
howManyGlyphsUsed[0] = 2;
return "#"; // hard-coded ACIP value
} else {
howManyGlyphsUsed[0] = 1;
return "*"; // hard-coded ACIP value
}
} // else fall through
}
if (null != hashKey && hashKey.equals("@#")) { // hard-coded EWTS value
String nextHashKey
= ((null == dc2)
? null : getHashKeyForGlyph(dc2));
if (null != nextHashKey && nextHashKey.equals("#")) { // hard-coded EWTS value
howManyGlyphsUsed[0] = 2; // not 3
return "#"; // hard-coded ACIP value
}
howManyGlyphsUsed[0] = 1; // not 2
return "*"; // hard-coded ACIP value
}
howManyGlyphsUsed[0] = 1;
String ans = (hashKey == null) ? null : acipForGlyph(hashKey);
if (hashKey == null || ans == null) {
if (null == ans) {
noSuchACIP[0] = true;
return getTMWToACIPErrorString(dc);
if (null != hashKey && hashKey.startsWith("R+"))
return getTMWToACIPErrorString(dc1, " because the ACIP R+... could imply the short superscribed form, but this most likely intends the full form (i.e., Unicode character U+0F6A)");
return getTMWToACIPErrorString(dc1, "");
}
return ans;
}

View file

@ -23,6 +23,8 @@
// glyphs from TMW. 0F6A is not listed here (DLC FIXME: should it be?),
// but the glyph for it is the glyph for 0F62.
//
// The EWTS is not a unique key -- see "r", for example.
//
// DuffPaneTest ensures that the na-ro column truly contains na-ros,
// by the way.
//
@ -70,7 +72,8 @@ __TILDE__X~102,5~~9,102~~~~~~~0F35
// though, and we let it become U+0F7E when you convert TMW->Unicode.
// That is, we treat them as interchangeable except for in TMW->TM
// mappings, where [8,91] does not map to any TM glyph (though you
// could argue that it should become what [8,90] becomes).
// could argue that it should become what [8,90] becomes -- DLC
// FIXME).
M~~~8,91~~~~~~~0F7E
__TILDE__M~241,1~~8,94~~~~~~~0F83

View file

@ -628,9 +628,16 @@ public class ACIPConverter {
tdocLocation[0] += s.getText().length();
continue; // FIXME: this means the unicode above doesn't go into the output if null != writer && null != tdoc?
} else {
String wy = ACIPRules.getWylieForACIPOther(s.getText());
if (null == wy) throw new Error("No wylie for ACIP " + s.getText());
duff = new Object[] { TibetanMachineWeb.getGlyph(wy) };
if ("#".equals(s.getText())) { // hard-coded ACIP value
duff = new Object[] {
TibetanMachineWeb.getGlyph("@#"),
TibetanMachineWeb.getGlyph("#")
}; // hard-coded EWTS values
} else {
String wy = ACIPRules.getWylieForACIPOther(s.getText());
if (null == wy) throw new Error("No wylie for ACIP " + s.getText());
duff = new Object[] { TibetanMachineWeb.getGlyph(wy) };
}
}
}
}

View file

@ -157,6 +157,9 @@ public class ACIPRules {
getWylieForACIPOther(null);
getWylieForACIPVowel(null);
String ans = (String)wylieToACIP.get(EWTS);
boolean useCapitalW = false;
if (EWTS.startsWith("w"))
useCapitalW = true; // We want W+NA, not V+NA; we want WA, not VA.
if (null == ans) {
StringBuffer finalAns = new StringBuffer(EWTS.length());
StringTokenizer sTok = new StringTokenizer(EWTS, "-+", true);
@ -182,9 +185,14 @@ public class ACIPRules {
if (null == part) return null;
finalAns.append(part);
}
if (useCapitalW)
finalAns.setCharAt(0, 'W');
return finalAns.toString();
}
return ans;
if (useCapitalW)
return "W" + ans.substring(1);
else
return ans;
}
/** Registers acip->wylie mappings in toWylie; registers
@ -193,6 +201,12 @@ public class ACIPRules {
toWylie.put(ACIP, EWTS);
if (null == wylieToACIP) {
wylieToACIP = new HashMap(75);
// We don't want to put "/" in toWylie:
wylieToACIP.put("(", "/");
wylieToACIP.put(")", "/");
wylieToACIP.put("?", "\\");
wylieToACIP.put("_", " "); // oddball.
wylieToACIP.put("o'i", "O'I"); // oddball for TMW9.61.
}
@ -307,14 +321,20 @@ public class ACIPRules {
if (acipOther2wylie == null) {
acipOther2wylie = new HashMap(20);
// don't use putMapping for this. We don't want TMW->ACIP
// to produce "." for a U+0F0C because ACIP doesn't say
// that "." means U+0F0C. It just seems to in practice
// for ACIP Release IV texts.
acipOther2wylie.put(".", "*");
putMapping(acipOther2wylie, "m", "M");
putMapping(acipOther2wylie, ":", "H");
putMapping(acipOther2wylie, ",", "/");
putMapping(acipOther2wylie, " ", " ");
putMapping(acipOther2wylie, ".", "*");
putMapping(acipOther2wylie, "|", "|");
putMapping(acipOther2wylie, ";", "|");
putMapping(acipOther2wylie, "`", "!");
putMapping(acipOther2wylie, ";", ";");
putMapping(acipOther2wylie, "*", "@");
putMapping(acipOther2wylie, "#", "@#");
putMapping(acipOther2wylie, "*", "@#");
// There is no glyph in TMW with the EWTS @##, so we don't do this: putMapping(acipOther2wylie, "#", "@##");
putMapping(acipOther2wylie, "%", "~X");
putMapping(acipOther2wylie, "o", "X");
putMapping(acipOther2wylie, "&", "&");

View file

@ -359,6 +359,7 @@ class TParseTree {
}
}
if (stackSize > 1 && tp.getLeft() != null && tp.getLeft().length() > 1) {
// DLC FIXME: gives a false positive warning for Rsh
hasAmbiguousConsonant = true;
}
}