ACIP->Unicode was broken for KshR, ndRY, ndY, YY, and RY -- those stacks that use full-form subjoined RA and YA consonants.

ACIP {RVA} was converting to the wrong things.

The TMW for {RVA} was converting to the wrong ACIP.

Checked all the 'DLC' tags in the ttt (ACIP->Tibetan) package.
This commit is contained in:
dchandler 2003-11-09 01:07:45 +00:00
parent 8193cef5d1
commit 04816acb74
11 changed files with 213 additions and 123 deletions

View file

@ -36,9 +36,6 @@ import org.thdl.tib.text.DuffCode;
* @author David Chandler
*/
public class ACIPConverter {
// DLC NOW: (KA)'s info is lost when you convert to Unicode text instead of Unicode RTF. Give an ERROR.
// DLC NOW: BAo isn't converting.
/** Command-line converter. Gives error messages on standard
* output about why we can't convert the document perfectly and
@ -148,7 +145,7 @@ public class ACIPConverter {
/** Turns the list of TStrings scan into TibetanMachineWeb and
Roman warnings and error messages that are inserted at
position loc in tdoc. DLC DOC better
position loc in tdoc. FIXME: DOC better
@param loc an input-output parameter. On input, loc[0] is the
offset from zero inside tdoc at which conversion results will
@ -270,7 +267,7 @@ public class ACIPConverter {
{
try {
if (toUnicode && toRTF)
throw new Error("DLC NOW FIXME: support this ACIP->Unicode.rtf mode so that KA (GA) shows up in two different font sizes.");
throw new Error("FIXME: support this ACIP->Unicode.rtf mode so that KA (GA) shows up in two different font sizes. See RFE 838591.");
if (!toUnicode && !toRTF)
throw new IllegalArgumentException("ACIP->Uni.rtf, ACIP->Uni.txt, and ACIP->TMW.rtf are supported, but not ACIP->TMW.txt");
if (toUnicode && toRTF && null == tdoc)
@ -524,7 +521,7 @@ public class ACIPConverter {
// not used after a GA in Tibetan
// typesetting.
boolean done = false;
// DLC what about after numbers? marks?
// what about after numbers? marks? FIXME: test
TPairList lpl = null;
if (s.getText().equals(" ")) {
if (!lastGuyWasNonPunct
@ -556,12 +553,6 @@ public class ACIPConverter {
tdocLocation[0] += x.length();
continue;
}
// DLC AM I DOING THIS? By normal Tibetan & Dzongkha spelling, writing, and input rules
// Tibetan script stacks should be entered and written: 1 headline
// consonant (0F40->0F6A), any subjoined consonant(s) (0F90->
// 0F9C), achung (0F71), shabkyu (0F74), any above headline
// vowel(s) (0F72 0F7A 0F7B 0F7C 0F7D and 0F80) ; any ngaro (0F7E,
// 0F82 and 0F83)
}
} else if (s.getText().equals(",")
&& lastGuyWasNonPunct

View file

@ -64,10 +64,11 @@ public class ACIPRules {
if (null == acipVowels) {
acipVowels = new HashSet(baseVowels.length * 8);
for (int i = 0; i < baseVowels.length; i++) {
// DLC I'm on my own with 'O and 'E and 'OO and 'EE, but
// GANG'O appears and I wonder... so here they are. It's
// consistent with 'I and 'A and 'U, at least: all the vowels
// may appear as K'vowel. DLC FIMXE: ask.
// I'm on my own with 'O and 'E and 'OO and 'EE, but
// GANG'O appears and I wonder... so here they are.
// It's consistent with 'I and 'A and 'U, at least:
// all the vowels may appear as K'vowel. DLC FIXME:
// ask.
acipVowels.add(baseVowels[i][0]);
acipVowels.add('\'' + baseVowels[i][0]);
@ -77,10 +78,10 @@ public class ACIPRules {
acipVowels.add('\'' + baseVowels[i][0] + ':');
acipVowels.add(baseVowels[i][0] + "m:");
acipVowels.add('\'' + baseVowels[i][0] + "m:");
// DLC keep this code in sync with getUnicodeFor.
// DLC keep this code in sync with getWylieForACIPVowel
// DLC '\' for virama? how shall we do \ the virama? like a vowel or not?
// Keep this code in sync with getUnicodeFor.
// Keep this code in sync with getWylieForACIPVowel.
}
}
return (acipVowels.contains(s));
@ -141,6 +142,8 @@ public class ACIPRules {
return consonants.contains(acip);
}
/** A map from wylie to ACIP. Note that the Wylie "w" maps to
both "V" and "W". */
private static HashMap wylieToACIP = null;
/** Returns the ACIP transliteration corresponding to the THDL
Extended Wylie <em>atom</em> EWTS, or null if EWTS is not
@ -157,8 +160,21 @@ public class ACIPRules {
String part, tok = sTok.nextToken();
if (tok.equals("-") || tok.equals("+"))
part = tok;
else
part = (String)wylieToACIP.get(tok);
else {
if ("w".equals(tok)) {
// There are only two stacks in TMW that have
// U+0FBA: r+wa and w+wa. TMW->ACIP fails for
// these unless we handle it here. (FIXME:
// add an automated test for this).
if ("r+w".equals(EWTS) || "w+w".equals(EWTS)) {
part = "W";
} else {
part = "V";
}
} else {
part = (String)wylieToACIP.get(tok);
}
}
if (null == part) return null;
finalAns.append(part);
}
@ -271,7 +287,6 @@ public class ACIPRules {
if (acipOther2wylie == null) {
acipOther2wylie = new HashMap(20);
// DLC FIXME: check all these again.
putMapping(acipOther2wylie, ",", "/");
putMapping(acipOther2wylie, " ", " ");
putMapping(acipOther2wylie, ".", "*");
@ -477,7 +492,7 @@ public class ACIPRules {
superACIP2unicode.put("8", "\u0F28");
superACIP2unicode.put("9", "\u0F29");
// DLC punctuation
// punctuation
superACIP2unicode.put("&", "\u0F85");
superACIP2unicode.put(",", "\u0F0D");
superACIP2unicode.put(" ", "\u0F0B");
@ -486,17 +501,17 @@ public class ACIPRules {
superACIP2unicode.put("`", "\u0F08");
superACIP2unicode.put("*", "\u0F04\u0F05");
superACIP2unicode.put("#", "\u0F04\u0F05\u0F05");
superACIP2unicode.put("%", "\u0F35");
superACIP2unicode.put("%", "\u0F35"); // FIXME: could be U+0F37 or U+0F35 according to RC if I understand correctly.
superACIP2unicode.put(";", "\u0F11");
superACIP2unicode.put("\r", "\r");
superACIP2unicode.put("\t", "\t");
superACIP2unicode.put("\r\n", "\r\n");
superACIP2unicode.put("\n", "\n");
superACIP2unicode.put("\\", "\u0F84"); // DLC FIXME: make this like a vowel
// DLC FIXME: what's the Unicode for caret, ^?
// DLC FIXME: what's the Unicode for o?
// DLC FIXME: what's the Unicode for x?
superACIP2unicode.put("\\", "\u0F84");
superACIP2unicode.put("^", "\u0F38");
// DLC FIXME: "^ GONG" is "^GONG", right?
// DLC FIXME: what's the Unicode for x? for o? RC said there is none in plain-text Unicode for x. But what about in RTF Unicode?
}
if (subscribed) {
String u = (String)subACIP2unicode.get(acip);
@ -546,7 +561,7 @@ public class ACIPRules {
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.reverse_i_VOWEL, context_added);
}
}
// DLC FIXME: Use TMW9.61, the "o'i" special combination, when appropriate.
// FIXME: Use TMW9.61, the "o'i" special combination, when appropriate.
if (vowel.indexOf('m') >= 0) {
DuffCode last = (DuffCode)duff.get(duff.size() - 1);

View file

@ -1,4 +1,3 @@
// DLC NOW: KAsh ->Ksh here! optionally!
/*
The contents of this file are subject to the THDL Open Community License
Version 1.0 (the "License"); you may not use this file except in compliance
@ -118,7 +117,7 @@ public class ACIPTshegBarScanner {
* followed by a '\n'.
* @param s the ACIP text
* @param errors if non-null, the buffer to which to append error
* messages (DLC FIXME: cludge, just get this info by scanning
* messages (FIXME: kludge, just get this info by scanning
* the result for TString.ERROR (and maybe TString.WARNING,
* if you care about warnings), but then we'd have to put the
* Offset info in the TString)
@ -340,7 +339,7 @@ public class ACIPTshegBarScanner {
if (!foundOne && i+1 < sl && s.charAt(i+1) == '*') {
// Identify [*LINE BREAK?] as an English
// correction. Every correction not on this
// list is considered to be Tibetan. DLC
// list is considered to be Tibetan.
// FIXME: make this extensible via a config
// file or at least a System property (which
// could be a comma-separated list of these
@ -537,7 +536,7 @@ public class ACIPTshegBarScanner {
errors.append("Offset " + i + ((numNewlines == 0) ? "" : (" or maybe " + (i-numNewlines))) + ": "
+ "Found an illegal at sign, @ (in context, this is " + inContext + "). This folio marker is not followed by whitespace, as is expected.\n");
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
startOfString = i+1; // DLC FIXME: skip over more?
startOfString = i+1; // FIXME: skip over more? test this code.
currentType = TString.ERROR;
break;
}
@ -651,8 +650,7 @@ public class ACIPTshegBarScanner {
if (startSlashIndex + 1 == i) {
/* //NYA\\ appears in ACIP input, and I think
* it means /NYA/. We warn about // for this
* reason. \\ causes a tsheg-bar error (DLC
* FIXME: verify this is so). */
* reason. \\ causes a tsheg-bar error. */
al.add(new TString("Found //, which could be legal (the Unicode would be \\u0F3C\\u0F3D), but is likely in an illegal construct like //NYA\\\\.",
TString.ERROR));
if (errors != null) {
@ -941,7 +939,7 @@ public class ACIPTshegBarScanner {
|| ch == 'm'
|| ch == ':'
|| ch == '^'
// DLC FIXME: we must treat this guy like a vowel, a special vowel that numerals can take on. Until then, warn. || ch == '\\'
// FIXME: we must treat this guy like a vowel, a special vowel that numerals can take on. Until then, warn. See bug 838588 || ch == '\\'
|| ch == '-'
|| ch == '+'

View file

@ -24,8 +24,6 @@ import java.util.ArrayList;
import java.util.HashMap;
import java.util.StringTokenizer;
// DLC FIXME: document this.
/** MidLexSubstitution is a hack that lets the end user clumsily fix
* the EWTS-to-Tibetan and ACIP-to-Tibetan converters without having
* to modify the source code.
@ -222,3 +220,5 @@ class StringMapping {
this.to = to;
}
}
// DLC NOW: defaults: KAsh=>K+sh, A=>?, '=>? (THESE ARE {A} AND {'} ALONE, NOT AS COMPONENTS OF A TSHEG-BAR.)

View file

@ -178,7 +178,6 @@ public class PackageTest extends TestCase {
if (decentParses.size() == 1) {
System.out.println("ACIPNoLegalParseError: NO LEGAL PARSE for the unambiguous ACIP {" + acip + "} (i.e., exactly one illegal parse exists, so it's unambiguous)");
// DLC FIXME: it's really unambiguous if one illegal parse has fewer glyphs than any other? {shthA'I} might be an example... but NO! because you go left-to-right to make stacks, except think of BRTAN vs. BRAN, they break that rule... ???? DLC ????
} else {
System.out.println("ACIPNoLegalParseError: NO PARSES for ACIP {" + acip + "}, decent parses are " + decentParses);
}
@ -244,8 +243,6 @@ public class PackageTest extends TestCase {
}
}
// DLC FIXME: warn if we have to use the "what stacks take a GA prefix?" rules to get a unique legal parse.
public void testCutoff() {
// this would once be exponential running time, so we'd cut it off:
tstHelper("BRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTN");
@ -362,7 +359,7 @@ tstHelper("MSTAN"); // ambiguous with regard to prefix rules
tstHelper("KA'", "[(K . A), (' . )]",
new String[] { "{KA}{'}" },
new String[] { "{KA}{'}" },
"{KA}{'}"); // DLC NOW
"{KA}{'}");
tstHelper("A'AAMA", "{A}{'}{AA}{MA}"); // FIXME: how should we parse this?
@ -390,17 +387,6 @@ tstHelper("KA'", "[(K . A), (' . )]",
new String[] { "{B}{DEm}" },
"{B}{DEm}");
/* DLC FIXME DOC TEST
tstHelper("BDA:", "{B}{DA:}",
new String[] { "{B+DA:}", "{B}{DA:}" },
new String[] { "{B}{DA:}" },
"{B}{DA:}");
tstHelper("BDEm:", "{B}{DEm:}",
new String[] { "{B+DEm:}", "{B}{DEm:}" },
new String[] { "{B}{DEm:}" },
"{B}{DEm:}"); */
tstHelper("NA+YA", "{N+}{YA}",
new String[] { "{N+YA}" },
new String[] { "{N+YA}" },
@ -502,10 +488,6 @@ tstHelper("KA'", "[(K . A), (' . )]",
"{SH+R'I}");
// DLC FIXME: test EWTS {pouM}
// DLC FIXME: do TMW->ACIP->TMW->ACIP round-trip.
tstHelper("DRUG", "{D}{RU}{G}",
new String[] { "{D+RU}{G}", "{D}{RU}{G}" },
new String[] { "{D+RU}{G}" },
@ -566,7 +548,7 @@ tstHelper("KA'", "[(K . A), (' . )]",
tstHelper("BHE");
tstHelper("BH'I");
tstHelper("BH'Im");
tstHelper("BH'Im:"); // DLC FIXME: make TibetanMachineWeb see EWTS {H} as an adornment.
tstHelper("BH'Im:"); // LOW-PRIORITY FIXME: make TibetanMachineWeb see EWTS {H} as an adornment.
tstHelper("D-VA");
tstHelper("DVA");
tstHelper("SRAS", "{S}{RA}{S}",
@ -603,7 +585,10 @@ tstHelper("KA'", "[(K . A), (' . )]",
// that we know the
// keyboardist was
// aware of the plus
// operator. DLC FIXME: warn in this case!
// operator. We warn
// in such a case.
// Though R0021F.ACE
// recommends D+GRA etc.
tstHelper("BRTN--GA",
"{B}{R}{T}{N-}{-}{GA}",
@ -680,7 +665,6 @@ tstHelper("KA'", "[(K . A), (' . )]",
new String[] { /* No parses exist. "K:" is illegal. */ });
tstHelper("'AO", "[(' . ), (A . O)]");
tstHelper("'AOM", "[(' . ), (A . O), (M . )]");
// DLC CHECK: S6814M6.ACT for BA'I and 'AO
tstHelper("BTZVA", "[(B . ), (TZ . ), (V . A)]");
@ -723,7 +707,7 @@ tstHelper("KA'", "[(K . A), (' . )]",
tstHelper("G'A'I", "[(G . 'A), (' . I)]");
tstHelper("G'I'I", "[(G . 'I), (' . I)]");
tstHelper("G'Im:'I", "[(G . 'Im:), (' . I)]");
tstHelper("G'Im:'ANG'AM'I", "[(G . 'Im:), (' . A), (NG . 'A), (M . 'I)]"); // DLC FIXME: 'ANG'A is what? 'ANG-'A or 'A-NG'A?
tstHelper("G'Im:'ANG'AM'I", "[(G . 'Im:), (' . A), (NG . 'A), (M . 'I)]"); // FIXME: test that 'ANG'A is {'A}{NG'A}?
tstHelper("BA'AM", "[(B . A), (' . A), (M . )]");
tstHelper("B'AM", "[(B . 'A), (M . )]");
@ -2287,7 +2271,6 @@ tstHelper("DBANG");
tstHelper("DBAR");
tstHelper("DBAS");
tstHelper("DBE");
// DLC NOW: TMW->ACIP doesn't do {KHA (KA)}.
tstHelper("DBEN");
tstHelper("DBER");
tstHelper("DBES");
@ -7227,7 +7210,6 @@ tstHelper("ZUR");
shelp("^ GONG SA,",
"",
"[TIBETAN_NON_PUNCTUATION:{^}, TIBETAN_PUNCTUATION:{ }, TIBETAN_NON_PUNCTUATION:{GONG}, TIBETAN_PUNCTUATION:{ }, TIBETAN_NON_PUNCTUATION:{SA}, TIBETAN_PUNCTUATION:{,}]");
// DLC FIXME: test that ^ and ^GONG are handled correctly on the whole.
shelp("", "", "[]");
shelp("[DD]", "");
shelp("[",
@ -7273,7 +7255,6 @@ tstHelper("ZUR");
shelp("[*DATA INCOMPLETE HERE?]", "", "[CORRECTION_START:{[*}, LATIN:{DATA INCOMPLETE HERE}, POSSIBLE_CORRECTION:{?]}]");
shelp("[*THIS\r\nWAS SUPPOSED TO BE THE SIXTH CATEGORY; THE CATEGORIES MENTIONED\r\nABOVE SEEM TO BE OUT OF ORDER THROUGH THIS SECTION]\r\n", "");
// DLC test ACIP files containing just "x", "o", ":", "m" and "%"
shelp("x o % : m", "");
shelp("AAx AAo AA% AA: AAm", "");
@ -7285,10 +7266,6 @@ tstHelper("ZUR");
shelp("[* Correction with []]",
"Offset 5: Found an illegal character, r, with ordinal 114.\nOffset 6: Found an illegal character, r, with ordinal 114.\nOffset 7: Found an illegal character, e, with ordinal 101.\nOffset 8: Found an illegal character, c, with ordinal 99.\nOffset 14: Found an illegal character, w, with ordinal 119.\nOffset 19: Found an illegal open bracket (in context, this is []]). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\nOffset 21: Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.\n");
// DLC DOC: the line SDIG PA'I GROGS PO'I LAG TU SON PAR 'GYUR PA is followed by a blank line. Note that it's "PA", not "PA ", ending it. We autocorrect to the latter.
// DLC FIXME: @0B1 isn't handled correctly!
shelp(",NGES ? PA", "", "[TIBETAN_PUNCTUATION:{,}, TIBETAN_NON_PUNCTUATION:{NGES}, TIBETAN_PUNCTUATION:{ }, QUESTION:{?}, TIBETAN_PUNCTUATION:{ }, TIBETAN_NON_PUNCTUATION:{PA}]");
@ -7321,8 +7298,10 @@ tstHelper("ZUR");
shelp("[text missing]", "", "[COMMENT:{[#text missing]}]");
{
// DLC FIXME: in one case, it's a tsheg. In the other,
// it's not a tsheg. That requires parsing, but test it.
// In {G'EEm: ,MDO}, is the space a tsheg? We say no
// right now. In the other, {G'EEm ,MDO}, it's not a
// tsheg because you don't need a tsheg after GA. But in
// the first, do you need a tsheg after {:}? (FIXME)
shelp("G'EEm: ,MDO",
"",
"[TIBETAN_NON_PUNCTUATION:{G'EEm:}, TIBETAN_PUNCTUATION:{ }, TIBETAN_PUNCTUATION:{,}, TIBETAN_NON_PUNCTUATION:{MDO}]");
@ -7344,7 +7323,7 @@ tstHelper("ZUR");
shelp("@001 ", "", "[FOLIO_MARKER:{@001}, TIBETAN_PUNCTUATION:{ }]");
shelp("@19-20A",
"Offset 0: Found an illegal at sign, @ (in context, this is @19-20A). @012B is an example of a legal folio marker.\n",
"[ERROR:{Found an illegal at sign, @ (in context, this is @19-20A). @012B is an example of a legal folio marker.}, TIBETAN_NON_PUNCTUATION:{19-20A}]"); // DLC FIXME: yes it occurs in the kangyur.
"[ERROR:{Found an illegal at sign, @ (in context, this is @19-20A). @012B is an example of a legal folio marker.}, TIBETAN_NON_PUNCTUATION:{19-20A}]"); // FIXME: yes it occurs in the kangyur.
shelp("@[7B]", "");
shelp("@012A.3KA",
"",
@ -7366,6 +7345,10 @@ tstHelper("ZUR");
shelp("{ DD }", "", "[DD:{{ DD }}]"); // TD3790E2.ACT
shelp("{ BP }", "", "[BP:{{ BP }}]"); // TD3790E2.ACT
// LOW-PRIORITY FIXME: support nested comments.
shelp("[# This is a [# nested comment] don't you know?]KA KHA GA NGA",
"Offset 13: Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\nOffset 38: Found an illegal character, y, with ordinal 121.\nOffset 40: Found an illegal character, u, with ordinal 117.\nOffset 42: Found an illegal character, k, with ordinal 107.\nOffset 45: Found an illegal character, w, with ordinal 119.\nOffset 47: Found a truly unmatched close bracket, ] or }.\nOffset 47: Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.\n",
"[ERROR:{Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n}, COMMENT:{[# This is a [# nested comment]}, TIBETAN_PUNCTUATION:{ }, TIBETAN_NON_PUNCTUATION:{d}, TSHEG_BAR_ADORNMENT:{o}, TIBETAN_NON_PUNCTUATION:{n't}, TIBETAN_PUNCTUATION:{ }, ERROR:{Found an illegal character, y, with ordinal 121.}, ERROR:{The ACIP o must be glued to the end of a tsheg bar, but this one was not}, ERROR:{Found an illegal character, u, with ordinal 117.}, TIBETAN_PUNCTUATION:{ }, ERROR:{Found an illegal character, k, with ordinal 107.}, TIBETAN_NON_PUNCTUATION:{n}, TSHEG_BAR_ADORNMENT:{o}, ERROR:{Found an illegal character, w, with ordinal 119.}, QUESTION:{?}, ERROR:{Found a truly unmatched close bracket, ]}, ERROR:{Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.}, TIBETAN_NON_PUNCTUATION:{KA}, TIBETAN_PUNCTUATION:{ }, TIBETAN_NON_PUNCTUATION:{KHA}, TIBETAN_PUNCTUATION:{ }, TIBETAN_NON_PUNCTUATION:{GA}, TIBETAN_PUNCTUATION:{ }, TIBETAN_NON_PUNCTUATION:{NGA}]");
shelp("//NYA\\\\",
"Offset 1: Found //, which could be legal (the Unicode would be \\u0F3C\\u0F3D), but is likely in an illegal construct like //NYA\\\\.\nOffset 5: Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.\nOffset 6: Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.\n",
"[START_SLASH:{/}, ERROR:{Found //, which could be legal (the Unicode would be \\u0F3C\\u0F3D), but is likely in an illegal construct like //NYA\\\\.}, END_SLASH:{/}, TIBETAN_NON_PUNCTUATION:{NYA}, ERROR:{Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.}, ERROR:{Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.}]");
@ -7417,7 +7400,6 @@ M+NA
uhelp("BGLA", "\u0f56\u0f42\u0fb3");
uhelp("BLCAG", "\u0f56\u0f63\u0f95\u0f42");
uhelp("DBA", "[#WARNING CONVERTING ACIP DOCUMENT: The ACIP DBA has been interpreted as two stacks, not one, but you may wish to confirm that the original text had two stacks as it would be an easy mistake to make to see one stack and forget to input it with '+' characters.]\u0f51\u0f56");
// DLC FIXME uhelp("BDEm:", "[#WARNING CONVERTING ACIP DOCUMENT: The ACIP BDEm: has been interpreted as two stacks, not one, but you may wish to confirm that the original text had two stacks as it would be an easy mistake to make to see one stack and forget to input it with '+' characters.]\u0f56DLC\u0f7a\u0f7e\u0f7f");
uhelp("DMAR", "[#WARNING CONVERTING ACIP DOCUMENT: The ACIP DMAR has been interpreted as two stacks, not one, but you may wish to confirm that the original text had two stacks as it would be an easy mistake to make to see one stack and forget to input it with '+' characters.]\u0f51\u0f58\u0f62");
uhelp("D+BA", "\u0f51\u0fa6");
uhelp("MNA", "[#WARNING CONVERTING ACIP DOCUMENT: The ACIP MNA has been interpreted as two stacks, not one, but you may wish to confirm that the original text had two stacks as it would be an easy mistake to make to see one stack and forget to input it with '+' characters.]\u0f58\u0f53");
@ -7433,7 +7415,7 @@ M+NA
uhelp(":", "[#ERROR CONVERTING ACIP DOCUMENT: THE TSHEG BAR (\"SYLLABLE\") : HAS THESE ERRORS: Cannot convert ACIP : because : is not an ACIP consonant]");
uhelp("m", "[#ERROR CONVERTING ACIP DOCUMENT: THE TSHEG BAR (\"SYLLABLE\") m HAS THESE ERRORS: Cannot convert ACIP m because m is not an ACIP consonant]");
uhelp("NA+YA", "\u0f53\u0fb1"); // DLC FIXME: warn about the extra A
uhelp("NA+YA", "\u0f53\u0fb1"); // FIXME: warn about the extra A
uhelp("NE+YA", "[#ERROR CONVERTING ACIP DOCUMENT: THE TSHEG BAR (\"SYLLABLE\") NE+YA HAS THESE ERRORS: Cannot convert ACIP NE+-YA because + is not an ACIP consonant]");
uhelp("tRAStA", "\u0f4a\u0fb2\u0f66\u0f9a");
uhelp("DZHDZHA", "[#WARNING CONVERTING ACIP DOCUMENT: There is a chance that the ACIP DZHDZHA was intended to represent more consonants than we parsed it as representing -- NNYA, e.g., means N+NYA, but you can imagine seeing N+N+YA and typing NNYA for it too.]\u0f5c\u0fac"); // tricky because DZHDZA is not in TMW but DZHDZHA is
@ -7483,13 +7465,34 @@ M+NA
uhelp("K'EE:", "\u0f40\u0f71\u0f7b\u0f7f");
uhelp("K'A:", "\u0f40\u0f71\u0f7f");
uhelp("RVA", "\u0f62\u0fad");
uhelp("R+VA", "\u0f62\u0fad");
uhelp("RWA", "\u0f62\u0fba");
uhelp("R+WA", "\u0f62\u0fba");
uhelp("WWA", "\u0f5d\u0fba");
uhelp("W+WA", "\u0f5d\u0fba");
uhelp("/NY'EE/", "\u0f3C\u0f49\u0F71\u0F7B\u0f3D");
uhelp("*#HUm: G+DHOO GRO`;.,",
"\u0f04\u0f05\u0f04\u0f05\u0f05\u0f67\u0f74\u0f7e\u0f7f\u0f0b\u0f42\u0fa2\u0f7d\u0f0b\u0f42\u0fb2\u0f7c\u0f08\u0f11\u0f0c\u0f0d");
uhelp("*#HUm: K+DHA GRO`;.,",
"\u0f04\u0f05\u0f04\u0f05\u0f05\u0f67\u0f74\u0f7e\u0f7f\u0f0b\u0f40\u0fa2\u0f0b\u0f42\u0fb2\u0f7c\u0f08\u0f11\u0f0c\u0f0d");
// DLC FIXME: the file ACIP_SHRI should be made into an ACIP->TMW automated test case
}
public void testFixedFormSubjoinedConsonants() {
// Usual subjoined RA:
uhelp("n+d+R", "\u0f4e\u0f9c\u0fb2");
// Full-form subjoined RA:
uhelp("K+sh+R", "\u0f40\u0fb5\u0fbc");
uhelp("n+d+R+Y",
// ... with usual subjoined YA:
"\u0f4e\u0f9c\u0fbc\u0fb1");
// Full-form subjoined YA:
uhelp("n+d+Y", "\u0f4e\u0f9c\u0fbb");
uhelp("Y+Y", "\u0f61\u0fbb");
uhelp("R+Y", "\u0f62\u0fbb");
}
/** Tests some more tsheg bars, these from Dr. Lacey's critical
@ -9055,3 +9058,55 @@ tstHelper("shKA");
}
}
// S0011N.ACT contains [SMON TSIG 'DI'I RTZOM MING MI GSAL,], why the brackets? IS all this really a correction? Or were parentheses and not brackets intended? FIXME
// FIXME: [THE FOLLOWIN... appears, so [#comment] or [comment] is possible. [BLANK PAGE] [MISSING PAGE] [FIRST] [SECOND] [DD1] [DD2] [46A.2] [THE ... [FOLLOWING... [PAGE ... [THESE ... @[7B] [SW: OK] [A FIRST... [ADDENDUM... [END ... [Additional [Some [Note [MISSING [DDD] [INCOMPLETE [LINE [DATA
// [A pair of ... which is part of the text! S0200A.ACE
// [D] is a correction, eh?
// FIXME -- HOW DO YOU TREAT THESE?
// BDE 'BA' ZHIG RGYUN DU BSTEN, ,YENGS KYANG THUB NA [GNYEN PO,)
// 'BYONGS [BLO,) S0375M.ACT
/* FIXME: BDEm: is different than BDE for us, is that OK?
uhelp("BDEm:", "[#WARNING CONVERTING ACIP DOCUMENT: The ACIP BDEm: has been interpreted as two stacks, not one, but you may wish to confirm that the original text had two stacks as it would be an easy mistake to make to see one stack and forget to input it with '+' characters.]PLACEHOLDER");
tstHelper("BDA:", "{B}{DA:}",
new String[] { "{B+DA:}", "{B}{DA:}" },
new String[] { "{B}{DA:}" },
"{B}{DA:}");
tstHelper("BDEm:", "{B}{DEm:}",
new String[] { "{B+DEm:}", "{B}{DEm:}" },
new String[] { "{B}{DEm:}" },
"{B}{DEm:}"); */
// FIXME DOC: the line SDIG PA'I GROGS PO'I LAG TU SON PAR 'GYUR PA is followed by a blank line. Note that it's "PA", not "PA ", ending it. We autocorrect to the latter.
// DLC FIXME: @0B1 isn't handled correctly! DLC
// FIXME: S6814M6.ACT have BA'I and 'AO, what should these convert to?
// FIXME: test EWTS {pouM}
// FIXME: TMW->ACIP doesn't do {KHA (KA)}. Bug 838486
// FIXME: VERIFY WE DO THIS.
//
// By normal Tibetan & Dzongkha spelling, writing, and input rules
// Tibetan script stacks should be entered and written: 1 headline
// consonant (0F40->0F6A), any subjoined consonant(s) (0F90->
// 0F9C), achung (0F71), shabkyu (0F74), any above headline
// vowel(s) (0F72 0F7A 0F7B 0F7C 0F7D and 0F80) ; any ngaro (0F7E,
// 0F82 and 0F83)
// FIXME: KAo isn't converting. See bug #838594
// FIXME: NYAx isn't converting. See bug #838595
// FIXME : handle ^GONG, and "^ GONG". See Bug #838593
// FIXME: the file ACIP_SHRI should be made into an ACIP->TMW automated test case

View file

@ -27,7 +27,7 @@ import java.util.ArrayList;
/** An ordered pair used in ACIP-to-TMW conversion. The left side is
* the consonant or empty; the right side is the vowel, '+', or '-'.
* @author David Chandler */
/* DLC BIG FIXME: make this package work for EWTS, not just ACIP. */
/* BIG FIXME: make this package work for EWTS, not just ACIP. */
class TPair {
/** The left side, or null if there is no left side. That is, the
* non-vowel, non-'m', non-':', non-'-', non-'+' guy. */
@ -118,22 +118,22 @@ class TPair {
return (null != l
&& ((null == r || "".equals(r))
|| "-".equals(r)
|| "A".equals(r)) // DLC FIXME: though check for BASKYABS and warn because BSKYABS is more common
|| "A".equals(r)) // FIXME: though check for BASKYABS and warn because BSKYABS is more common
&& ACIPRules.isACIPPrefix(l));
}
/** Returns true if and only if this pair could be a Tibetan
* secondary sufffix. */
* secondary suffix. */
boolean isPostSuffix() {
return (null != l
&& ((null == r || "".equals(r))
|| "-".equals(r)
|| "A".equals(r)) // DLC FIXME: though warn about GAMASA vs. GAMS
|| "A".equals(r)) // FIXME: though warn about GAMASA vs. GAMS
&& ACIPRules.isACIPPostsuffix(l));
}
/** Returns true if and only if this pair could be a Tibetan
* sufffix. DLC FIXME: ACIP specific, just like isPostSuffix() and isPrefix() */
* suffix. FIXME: ACIP specific, just like isPostSuffix() and isPrefix() */
boolean isSuffix() {
return (null != l
&& ((null == r || "".equals(r))
@ -208,7 +208,7 @@ class TPair {
}
/** Appends legal Unicode corresponding to this (possible
* subscribed) pair to sb. DLC FIXME: which normalization form,
* subscribed) pair to sb. FIXME: which normalization form,
* if any? */
void getUnicode(StringBuffer sb, boolean subscribed) {
if (null != getLeft()) {

View file

@ -147,24 +147,11 @@ class TPairList {
return false;
}
// DLC [THE FOLLOWIN... appears, so [#comment] or [comment] is possible. [BLANK PAGE] [MISSING PAGE] [FIRST] [SECOND] [DD1] [DD2] [46A.2] [THE ... [FOLLOWING... [PAGE ... [THESE ... @[7B] [SW: OK] [A FIRST... [ADDENDUM... [END ... [Additional [Some [Note [MISSING [DDD] [INCOMPLETE [LINE [DATA
// [A pair of ... which is part of the text! S0200A.ACE
// [D] is a correction, eh?
// DLC BDE 'BA' ZHIG RGYUN DU BSTEN, ,YENGS KYANG THUB NA [GNYEN PO,)
// 'BYONGS [BLO,) S0375M.ACT
// S0011N.ACT contains [SMON TSIG 'DI'I RTZOM MING MI GSAL,], why the brackets? IS all this really a correction? DLC?
// DLC: what are () for?
/** Finds errors so simple that they can be detected without using
* the rules of Tibetan spelling (i.e., tsheg bar syntax).
* Returns an error message, or null if there is no error that
* you can find without the help of tsheg bar syntax rules. */
// DLC RENAME
// DLC FIXME: 9BLTA is an error, numbers are all or nothing
// FIXME: This is needlessly ACIP specific -- rename and change text of messages
String getACIPError() {
int sz = size();
if (0 == sz)
@ -238,7 +225,7 @@ class TPairList {
}
}
// DLC really this is a warning, not an error:
// FIXME: really this is a warning, not an error:
if ("-".equals(get(sz - 1).getRight())) {
if (first) {
first = false;
@ -277,8 +264,6 @@ class TPairList {
private static final int ALWAYS_KEEP_STACKING = 2;
private static final int ALWAYS_STOP_STACKING = 3;
// DLC TEST: BA'I has exactly two syntactically legal parses but just one TStackList.
/** Returns a set (as as ArrayList) of all possible TStackLists.
* Uses knowledge of Tibetan spelling rules (i.e., tsheg bar
* syntax) to do so. If this list of pairs has something clearly
@ -290,8 +275,7 @@ class TPairList {
// We treat [(B . ), (G . +), (K . ), (T . A)] as if it could
// be {B+G+K+T} or {B}{G+K+T}; we handle prefixes specially
// this way. [(T . ), (G . +), (K . ), (T . A)] is clearly
// {T+G+K+TA} (and, DLC FIXME, we should warn that there are
// some pluses but not all)
// {T+G+K+TA}
//
// We don't care if T+G+K+T is in TMW or not -- there is no
// master list of stacks.
@ -411,7 +395,7 @@ class TPairList {
stackStart = i+1;
}
}
// DLC FIXME: we no longer need all these breakLocations -- we can handle SAM'AM'ANG
// FIXME: we no longer need all these breakLocations -- we can handle SAM'AM'ANG without them.
// Now go from hard break (i.e., (* . VOWEL or -)) to hard
// break (and there's a hard break after the last pair, of
@ -424,7 +408,7 @@ class TPairList {
// TStackListList per hard break to pt, the parse tree.
int startLoc = 0; // which pair starts this hard break?
// DLC FIXME: assert this
// FIXME: assert this
if ((breakLocations[1] >= 0 && breakLocations[1] <= breakLocations[0])
|| (breakLocations[2] >= 0 && breakLocations[2] <= breakLocations[1]))
throw new Error("breakLocations is monotonically increasing, ain't it?");
@ -570,17 +554,20 @@ class TPairList {
lWylie.append(thislWylie);
StringBuffer ll = new StringBuffer(lWylie.toString());
int ww;
// DLC NOW: what about fixed-form RA on top??? test it.
while ((ww = ll.indexOf("+")) >= 0)
ll.deleteCharAt(ww);
boolean isTibetan = TibetanMachineWeb.isWylieTibetanConsonantOrConsonantStack(ll.toString());
boolean isSanskrit = TibetanMachineWeb.isWylieSanskritConsonantStack(lWylie.toString());
if (ddebug && !isTibetan && !isSanskrit && !isNumeric) {
System.out.println("DLC: OTHER for " + lWylie + " with vowel " + ACIPRules.getWylieForACIPVowel(p.getRight()) + " and p.getRight()=" + p.getRight());
System.out.println("OTHER for " + lWylie + " with vowel " + ACIPRules.getWylieForACIPVowel(p.getRight()) + " and p.getRight()=" + p.getRight());
}
if (isTibetan && isSanskrit) {
// RVA, e.g. It must be Tibetan because RWA is what
// you'd use for RA over fixed-form WA.
isSanskrit = false;
}
if (isTibetan && isSanskrit) isSanskrit = false; // RVA, e.g.
if (ddebug && hasNonAVowel && ACIPRules.getWylieForACIPVowel(p.getRight()) == null) {
System.out.println("DLC: vowel " + ACIPRules.getWylieForACIPVowel(p.getRight()) + " and p.getRight()=" + p.getRight());
System.out.println("vowel " + ACIPRules.getWylieForACIPVowel(p.getRight()) + " and p.getRight()=" + p.getRight());
}
TGCPair tp;
indexList.add(new Integer(index));
@ -603,7 +590,7 @@ class TPairList {
}
}
/** Appends legal Unicode corresponding to this stack to sb. DLC
/** Appends legal Unicode corresponding to this stack to sb.
* FIXME: which normalization form, if any? */
void getUnicode(StringBuffer sb) {
boolean subscribed = false;
@ -626,6 +613,21 @@ class TPairList {
TPair lastPair = get(size() - 1);
wylieForConsonant.append(lastPair.getWylie(true));
String hashKey = wylieForConsonant.toString();
// r-w and r+w are both known hash keys. Sort 'em out. They
// are the only things like this according to bug report
// #800166.
if ("r+w".equals(hashKey)) {
boolean sawWazur = false;
for (int x = 0; x < size(); x++) {
TPair p = get(x);
if ("V".equals(get(x).getLeft())) {
sawWazur = true;
break;
}
}
if (sawWazur)
hashKey = "r-w";
}
if (!TibetanMachineWeb.isKnownHashKey(hashKey)) {
hashKey = hashKey.replace('+', '-');
if (!TibetanMachineWeb.isKnownHashKey(hashKey)) {
@ -641,7 +643,7 @@ class TPairList {
lastPair.getRight());
}
if (previousSize == duffsAndErrors.size())
throw new Error("TPairList with no duffs? " + toString()); // DLC FIXME: change to assertion.
throw new Error("TPairList with no duffs? " + toString()); // FIXME: change to assertion.
}
}
// DLC FIXME: handle 'o' and 'x', e.g. KAo and NYAx.

View file

@ -242,6 +242,6 @@ class TPairListFactory {
}
// DLC test for nested comments
// FIXME: test for nested comments
// DLC see Translit directory on ACIP v4 CD-ROM
// FIXME: see Translit directory on ACIP v4 CD-ROM

View file

@ -117,8 +117,6 @@ class TParseTree {
* unique legal parse, you get it. If there's not, but there is
* a unique non-illegal parse, you get it. If there's not a
* unique answer, null is returned. */
// {TZANDRA} is not solved by this, DLC NOW. Solve PADMA PROBLEM!
// DLC by using this we can get rid of single-sanskrit-gc, eh?
public TStackList getBestParse() {
TStackListList up = getUniqueParse(false);
if (up.size() == 1)
@ -292,7 +290,7 @@ class TParseTree {
TStackListList up = getUniqueParse(false);
if (null == up || up.size() != 1) {
// DLC FIXME: code duplication
// FIXME: code duplication
boolean isLastStack[] = new boolean[1];
TStackListList nip = getNonIllegalParses();
if (nip.size() != 1) {

View file

@ -23,6 +23,7 @@ import org.thdl.tib.text.TGCList;
import org.thdl.tib.text.DuffCode;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.ListIterator;
/** A list of {@link TPairList TPairLists}, each of which is for
@ -131,7 +132,7 @@ class TStackList {
* stack can take every prefix, which is not the case in
* reality */
public BoolTriple isLegalTshegBar(boolean noPrefixTests) {
// DLC Should we handle PADMA and other Tibetanized Sanskrit fellows consistently? Right now we only treat single-stack Sanskrit guys as legal.
// FIXME: Should we handle PADMA and other Tibetanized Sanskrit fellows consistently? Right now we only treat single-stack Sanskrit guys as legal.
TTGCList tgcList = new TTGCList(this);
StringBuffer warnings = new StringBuffer();
@ -199,10 +200,9 @@ class TStackList {
* @param isLastStack if non-null, then isLastStack[0] will be
* set to true if and only if the very last stack is the only
* stack not to have a vowel or disambiguator on it */
// DLC FIXME: DELETE THIS WARNING and this code unless EWTS will need it...
boolean hasStackWithoutVowel(TPairList opl, boolean[] isLastStack) {
int runningSize = 0;
// DLC FIXME: MARDA is MARD==MAR-D to us, but is probably MAR+DA, warn
// FIXME: MARDA is MARD==MAR-D to us, but is probably MAR+DA, warn -- see 838470
for (int i = 0; i < size(); i++) {
TPairList pl = get(i);
String l;
@ -225,14 +225,39 @@ class TStackList {
return false;
}
/** Returns legal Unicode corresponding to this tsheg bar. DLC FIXME: which normalization form, if any? */
private static HashMap unicodeExceptionsMap = null;
/** Returns legal Unicode corresponding to this tsheg bar. FIXME: which normalization form, if any? */
String getUnicode() {
// The question is this: U+0FB1 or U+0FBB? U+0FB2 or
// U+0FBC? The answer: always the usual form, not the
// full form, except for a few known stacks (all the ones
// with full form subjoined consonants in TMW). Note that
// wa-zur, U+0FAD, is never confused for U+0FBA because
// "V" and "W" are different transliterations.
StringBuffer u = new StringBuffer(size());
for (int i = 0; i < size(); i++) {
get(i).getUnicode(u);
}
return u.toString();
String us = u.toString();
if (null == unicodeExceptionsMap) {
unicodeExceptionsMap = new HashMap();
unicodeExceptionsMap.put("\u0f69\u0fb2", "\u0f69\u0fbc"); // KshR
unicodeExceptionsMap.put("\u0f40\u0fb5\u0fb2", "\u0f40\u0fb5\u0fbc"); // KshR
unicodeExceptionsMap.put("\u0f4e\u0f9c\u0fb2\u0fb1", "\u0f4e\u0f9c\u0fbc\u0fb1"); // ndRY
unicodeExceptionsMap.put("\u0f4e\u0f9c\u0fb1", "\u0f4e\u0f9c\u0fbb"); // ndY
unicodeExceptionsMap.put("\u0f61\u0fb1", "\u0f61\u0fbb"); // YY
unicodeExceptionsMap.put("\u0f62\u0fb1", "\u0f62\u0fbb"); // RY
}
String mapEntry = (String)unicodeExceptionsMap.get(us);
if (null != mapEntry)
return mapEntry;
else
return us;
}
/** Returns the DuffCodes and errors corresponding to this stack
list. Each element of the array is a DuffCode or a String, the
latter if and only if the TMW font cannot represent the

View file

@ -47,7 +47,7 @@ public class TString {
&& type != END_SLASH);
}
/** For ACIP [#COMMENTS] and EWTS (DLC FIXME) */
/** For ACIP [#COMMENTS] and EWTS (DLC FIXME: what are EWTS comments?) */
public static final int COMMENT = 0;
/** For Folio markers like @012B in ACIP */
public static final int FOLIO_MARKER = 1;
@ -85,7 +85,7 @@ public class TString {
public static final int END_PAREN = 16;
/** For things that may not be legal syntax, such as {KA . KHA} */
public static final int WARNING = 17;
/** For ACIP %, o, and x or EWTS (DLC FIXME) */
/** For ACIP %, o, and x or EWTS (DLC FIXME: what are EWTS adornments?) */
public static final int TSHEG_BAR_ADORNMENT = 18;
/** For things that are not legal syntax, such as a file that
* contains just "[# HALF A COMMEN" */
@ -144,14 +144,20 @@ public class TString {
}
}
/** For generating frequency info: */
private static boolean outputAllTshegBars
= ThdlOptions.getBooleanOption("org.thdl.tib.text.ttt.OutputAllTshegBars"); // DLC DOC -- use me to generate frequency info
= ThdlOptions.getBooleanOption("org.thdl.tib.text.ttt.OutputAllTshegBars");
/** For generating info about which tsheg bars were converted, but
not how many times: */
private static boolean outputUniqueTshegBars
= ThdlOptions.getBooleanOption("org.thdl.tib.text.ttt.OutputUniqueTshegBars"); // DLC DOC
= ThdlOptions.getBooleanOption("org.thdl.tib.text.ttt.OutputUniqueTshegBars");
/** Affects what appears on the console when either {@link
#outputUniqueTshegBars} or {@link #outputAllTshegBars} is in
use. */
private static String outputTshegBarsPrefix
= ThdlOptions.getStringOption("org.thdl.tib.text.ttt.PrefixForOutputTshegBars", ""); // DLC DOC
= ThdlOptions.getStringOption("org.thdl.tib.text.ttt.PrefixForOutputTshegBars", "");
private static final HashSet tshegBars = new HashSet();