The ACIP {NYA%} is supported. {NYAo} and {NYAx} are confusing to me
because I don't know which glyphs o and x correspond to.  For that
reason, they cause ERRORs.

The proposed THDL Extended Wylie ~X and X are now used for U+0F35 and
U+0F37, respectively.
This commit is contained in:
dchandler 2003-09-07 16:19:50 +00:00
parent f57cdda867
commit 07e360d9a8
8 changed files with 96 additions and 32 deletions

View file

@ -144,7 +144,7 @@ public class TMW_RTF_TO_THDL_WYLIETest extends TestCase {
/** Tests the --to-wylie converter mode of {@link
* org.thdl.tib.input.TibetanConverter}. */
public void testConverterMode() {
helper("--to-wylie", "Conversion", 44);
helper("--to-wylie", "Conversion", 0);
}
/** Tests the --to-tibetan-machine converter mode of {@link

View file

@ -37,8 +37,8 @@ rgyal ba kun dngos mtsho skyes rdo rje bstan pa'i rtsa lag thams cad mkhyen pa z
bka' drin gzugs can dbyig 'dzin lto 'dir shong 'gyur min na kun mkhyen srang las gang gis gzhal//\par
\par
li khri'i lcug phran mkhyen pa'i snang ba can//\par
'jam mgon blo<<[[JSKAD_TMW_TO_WYLIE_ERROR_NO_SUCH_WYLIE: Cannot convert DuffCode <duffcode font=TibetanMachineWeb8 charNum=101 character=e/> to THDL Extended Wylie. Please see the documentation for the TMW font and transcribe this yourself.]]>>. yi lang tsho baza<<[[JSKAD_TMW_TO_WYLIE_ERROR_NO_SUCH_WYLIE: Cannot convert DuffCode <duffcode font=TibetanMachineWeb8 charNum=101 character=e/> to THDL Extended Wylie. Please see the documentation for the TMW font and transcribe this yourself.]]>>.nga po'i tshon//\par
kha dog so sor bkra ba'i graga<<[[JSKAD_TMW_TO_WYLIE_ERROR_NO_SUCH_WYLIE: Cannot convert DuffCode <duffcode font=TibetanMachineWeb8 charNum=101 character=e/> to THDL Extended Wylie. Please see the documentation for the TMW font and transcribe this yourself.]]>>.sa pa<<[[JSKAD_TMW_TO_WYLIE_ERROR_NO_SUCH_WYLIE: Cannot convert DuffCode <duffcode font=TibetanMachineWeb8 charNum=101 character=e/> to THDL Extended Wylie. Please see the documentation for the TMW font and transcribe this yourself.]]>>.'i rgyan//\par
'jam mgon bloX. yi lang tsho bazaX.nga po'i tshon//\par
kha dog so sor bkra ba'i gragaX.sa paX.'i rgyan//\par
phyogs bral rna lung 'god mkhas rtag tu rgyal//\f2\fs44\i0\b0\ul0\cf0\par
\par
\f1\fs28\i0\b0\ul0 dpal ldan chos kyi rang bzhin ngos yangs par//\par

View file

@ -5,6 +5,10 @@
// - initial // marks a comment
// - blank lines should be ignored
// - <?x?> marks a command
//
// If you change the Wylie here, it can break the ACIP->TMW and
// ACIP->Unicode conversion. So keep ACIPRules in sync with this, and be
// sure to run 'ant clean check' after your change.
<?Input:Punctuation?>
//_~32,1~0,32
@ -34,6 +38,10 @@ $~38,5~~9,41~~~~~~~0F06
// dbu.khang.g-yas: (If this changes, edit ACIPConverter)
)~209,1~~9,94~~~~~~~0F3D
H~239,1~~8,92~~~~~~~0F7F
// mtshan.rtags:
X~101,5~~9,101~~~~~~~0F37
// mtshan.rtags zhes.sa:
__TILDE__X~102,5~~9,102~~~~~~~0F35
// 8,91 is the small bindu. We say that this, and not 8,90 (large
// anusvara) is the glyph that M yields. This is because [8,90] is
@ -971,10 +979,6 @@ r~176,4~~8,71~~~~~~~0FB2
// mchan rtags leading:
\tmw8100~100,5~~9,100~~~~~~~none
// mtshan.rtags:
\tmw8101~101,5~~9,101~~~~~~~0F37
// mtshan.rtags zhes.sa:
\tmw8102~102,5~~9,102~~~~~~~0F35
// che.mgo:
\tmw8103~103,5~~9,103~~~~~~~0F38
// kuruka:

View file

@ -186,10 +186,10 @@ public class ACIPConverter {
ByteArrayOutputStream sw = new ByteArrayOutputStream();
ArrayList al = ACIPTshegBarScanner.scan(acip, errors, -1);
try {
if (null != al
&& convertToUnicode(al, sw, errors,
warnings, writeWarningsToResult,
warningLevel)) {
if (null != al) {
convertToUnicode(al, sw, errors,
warnings, writeWarningsToResult,
warningLevel);
return sw.toString("UTF-8");
} else {
return null;
@ -282,6 +282,33 @@ public class ACIPConverter {
String text = "[#ERROR CONVERTING ACIP DOCUMENT: Lexical error: " + s.getText() + "]";
if (null != writer) writer.write(text);
if (null != tdoc) tdoc.appendRoman(text, Color.RED);
} else if (stype == ACIPString.TSHEG_BAR_ADORNMENT) {
if (lastGuyWasNonPunct) {
String err = "[#ERROR CONVERTING ACIP DOCUMENT: This converter cannot yet convert " + s.getText() + " because the converter's author is unclear what the result should be.]";
if (null != writer) {
String uni = ACIPRules.getUnicodeFor(s.getText(), false);
if (null == uni) {
hasErrors = true;
uni = err;
}
if (null != writer) writer.write(uni);
}
if (null != tdoc) {
String wylie
= ACIPRules.getWylieForACIPOther(s.getText());
if (null == wylie) {
hasErrors = true;
tdoc.appendRoman(err, Color.RED);
} else {
tdoc.appendDuffCodes(new DuffCode[] { TibetanMachineWeb.getGlyph(wylie) },
Color.BLACK);
}
}
} else {
hasErrors = true;
}
lastGuyWasNonPunct = true; // this stuff is not really punctuation
lastGuy = null;
} else if (stype == ACIPString.WARNING) {
lastGuyWasNonPunct = false;
lastGuy = null;
@ -408,10 +435,10 @@ public class ACIPConverter {
&& (lpl = lastGuy.get(lastGuy.size() - 1)).size() == 1
&& lpl.get(0).getLeft().equals("G")
&& // it's (G . anything)
// followed by some number
// of spaces (at least one,
// this one) and then a
// comma:
// followed by some number
// of spaces (at least one,
// this one) and then a
// comma:
peekaheadFindsSpacesAndComma(scan, i+1))) {
if (null != writer) {
unicode = " ";

View file

@ -236,7 +236,7 @@ class ACIPRules {
acipOther2wylie.put(";", ";");
acipOther2wylie.put("*", "@");
acipOther2wylie.put("#", "@#");
acipOther2wylie.put("%", "%");
acipOther2wylie.put("%", "~X");
acipOther2wylie.put("&", "&");
acipOther2wylie.put("0", "0");

View file

@ -77,9 +77,11 @@ public class ACIPString {
public static final int END_PAREN = 16;
/** For things that may not be legal syntax, such as {KA . KHA} */
public static final int WARNING = 17;
/** For ACIP %, o, and x */
public static final int TSHEG_BAR_ADORNMENT = 18;
/** For things that are not legal syntax, such as a file that
* contains just "[# HALF A COMMEN" */
public static final int ERROR = 18;
public static final int ERROR = 19;
/** Returns true if and only if this string is Latin (usually
* English). Returns false if this string is transliteration of
@ -135,6 +137,7 @@ public class ACIPString {
if (type == START_PAREN) typeString = "START_PAREN";
if (type == END_PAREN) typeString = "END_PAREN";
if (type == WARNING) typeString = "WARNING";
if (type == TSHEG_BAR_ADORNMENT) typeString = "TSHEG_BAR_ADORNMENT";
if (type == ERROR) typeString = "ERROR";
return typeString + ":{" + getText() + "}";
}

View file

@ -767,9 +767,16 @@ public class ACIPTshegBarScanner {
case ';':
case '`':
case '#':
case '%':
case 'x':
case 'o':
boolean legalTshegBarAdornment = false;
// The tsheg bar ends here; new token.
if (startOfString < i) {
if (currentType == ACIPString.TIBETAN_NON_PUNCTUATION
&& isTshegBarAdornment(ch))
legalTshegBarAdornment = true;
al.add(new ACIPString(s.substring(startOfString, i),
currentType));
}
@ -780,7 +787,8 @@ public class ACIPTshegBarScanner {
if (('\r' == ch
|| ('\n' == ch && i > 0 && s.charAt(i - 1) != '\r'))
&& !al.isEmpty()
&& ((ACIPString)al.get(al.size() - 1)).getType() == ACIPString.TIBETAN_NON_PUNCTUATION) {
&& (((ACIPString)al.get(al.size() - 1)).getType() == ACIPString.TIBETAN_NON_PUNCTUATION
|| ((ACIPString)al.get(al.size() - 1)).getType() == ACIPString.TSHEG_BAR_ADORNMENT)) {
al.add(new ACIPString(" ", ACIPString.TIBETAN_PUNCTUATION));
}
@ -788,7 +796,8 @@ public class ACIPTshegBarScanner {
if (('\r' == ch
|| ('\n' == ch && i > 0 && s.charAt(i - 1) != '\r'))
&& !al.isEmpty()
&& ((ACIPString)al.get(al.size() - 1)).getType() == ACIPString.TIBETAN_PUNCTUATION
&& (((ACIPString)al.get(al.size() - 1)).getType() == ACIPString.TIBETAN_PUNCTUATION
|| ((ACIPString)al.get(al.size() - 1)).getType() == ACIPString.TSHEG_BAR_ADORNMENT)
&& ((ACIPString)al.get(al.size() - 1)).getText().equals(",")
&& s.charAt(i-1) == ','
&& (i + (('\r' == ch) ? 2 : 1) < sl
@ -804,9 +813,17 @@ public class ACIPTshegBarScanner {
|| (realNewline
= ((rn = ('\n' == ch && i >= 3 && s.charAt(i-3) == '\r' && s.charAt(i-2) == '\n' && s.charAt(i-1) == '\r'))
|| ('\n' == ch && i >= 1 && s.charAt(i-1) == '\n')))) {
for (int h = 0; h < (realNewline ? 2 : 1); h++)
al.add(new ACIPString(rn ? s.substring(i - 1, i+1) : s.substring(i, i+1),
ACIPString.TIBETAN_PUNCTUATION));
for (int h = 0; h < (realNewline ? 2 : 1); h++) {
if (isTshegBarAdornment(ch) && !legalTshegBarAdornment) {
al.add(new ACIPString("The ACIP " + ch + " must be glued to the end of a tsheg bar, but this one was not",
ACIPString.ERROR));
} else {
al.add(new ACIPString(rn ? s.substring(i - 1, i+1) : s.substring(i, i+1),
(legalTshegBarAdornment
? ACIPString.TSHEG_BAR_ADORNMENT
: ACIPString.TIBETAN_PUNCTUATION)));
}
}
}
startOfString = i+1;
currentType = ACIPString.ERROR;
@ -910,15 +927,17 @@ public class ACIPTshegBarScanner {
return ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n';
}
/**
 * Returns true if and only if ch is one of the three ACIP tsheg bar
 * adornment characters: '%', 'o', or 'x'.
 */
private static boolean isTshegBarAdornment(char ch) {
    switch (ch) {
    case '%':
    case 'o':
    case 'x':
        return true;
    default:
        return false;
    }
}
/** See implementation. */
private static boolean isAlpha(char ch) {
return ch == '\'' // 23rd consonant
// combining punctuation, vowels:
|| ch == '%'
|| ch == 'o'
|| ch == 'm'
|| ch == 'x'
|| ch == ':'
|| ch == '^'
// DLC FIXME: we must treat this guy like a vowel, a special vowel that numerals can take on. Until then, warn. || ch == '\\'

View file

@ -7171,7 +7171,13 @@ tstHelper("ZUR");
"[TIBETAN_NON_PUNCTUATION:{K}, ERROR:{Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.}, TIBETAN_PUNCTUATION:{,}]");
shelp("MTHAR%", "", "[TIBETAN_NON_PUNCTUATION:{MTHAR%}]");
shelp("MTHAR%", "", "[TIBETAN_NON_PUNCTUATION:{MTHAR}, TSHEG_BAR_ADORNMENT:{%}]");
shelp("MTHARo", "", "[TIBETAN_NON_PUNCTUATION:{MTHAR}, TSHEG_BAR_ADORNMENT:{o}]");
shelp("MTHARx", "", "[TIBETAN_NON_PUNCTUATION:{MTHAR}, TSHEG_BAR_ADORNMENT:{x}]");
shelp("MTHAR\n%", "", "[TIBETAN_NON_PUNCTUATION:{MTHAR}, TIBETAN_PUNCTUATION:{ }, ERROR:{The ACIP % must be glued to the end of a tsheg bar, but this one was not}]");
shelp("MTHAR x", "", "[TIBETAN_NON_PUNCTUATION:{MTHAR}, TIBETAN_PUNCTUATION:{ }, ERROR:{The ACIP x must be glued to the end of a tsheg bar, but this one was not}]");
shelp("PHYIR;", "", "[TIBETAN_NON_PUNCTUATION:{PHYIR}, TIBETAN_PUNCTUATION:{;}]");
shelp("......,DAM ",
"",
@ -7254,6 +7260,10 @@ tstHelper("ZUR");
}
public void testACIPConversion() {
uhelp("KA%\nKHA", "\u0f40\u0f35\u0f0b\u0f41");
uhelp("KA%", "\u0f40\u0f35");
uhelp("KAo", "\u0f40[#ERROR CONVERTING ACIP DOCUMENT: This converter cannot yet convert o because the converter's author is unclear what the result should be.]");
uhelp("KAx", "\u0f40[#ERROR CONVERTING ACIP DOCUMENT: This converter cannot yet convert x because the converter's author is unclear what the result should be.]");
uhelp("G+DHA", "\u0f42\u0fa2");
uhelp("P'EE", "\u0f54\u0f71\u0f7b");
@ -7284,13 +7294,11 @@ tstHelper("ZUR");
uhelp("K'A:", "\u0f40\u0f71\u0f7f");
// DLC FIXME: in ACIP RTF files, (PARENTHESES) seem to make
// text go from 24-point to 18-point. Thus, ACIP->Unicode.txt
// is fundamentally flawed, whereas ACIP->Unicode.rtf is OK.
uhelp("/NY'EE/", "\u0f3C\u0f49\u0F71\u0F7B\u0f3D");
uhelp("*#HUm: G+DHOO GRO`;.,", "\u0f04\u0f05\u0f04\u0f05\u0f05\u0f67\u0f74\u0f7e\u0f7f\u0f0b\u0f42\u0fa2\u0f7d\u0f0b\u0f42\u0fb2\u0f7c\u0f08\u0f11\u0f0c\u0f0d");
uhelp("*#HUm: K+DHA GRO`;.,", "none");
uhelp("*#HUm: G+DHOO GRO`;.,",
"\u0f04\u0f05\u0f04\u0f05\u0f05\u0f67\u0f74\u0f7e\u0f7f\u0f0b\u0f42\u0fa2\u0f7d\u0f0b\u0f42\u0fb2\u0f7c\u0f08\u0f11\u0f0c\u0f0d");
uhelp("*#HUm: K+DHA GRO`;.,",
"\u0f04\u0f05\u0f04\u0f05\u0f05\u0f67\u0f74\u0f7e\u0f7f\u0f0b[#ERROR CONVERTING ACIP DOCUMENT: THE TSHEG BAR (\"SYLLABLE\") K+DHA IS ESSENTIALLY NOTHING.]\u0f0b\u0f42\u0fb2\u0f7c\u0f08\u0f11\u0f0c\u0f0d");
}
/** Tests some more tsheg bars, these from Dr. Lacey's critical
@ -8861,6 +8869,9 @@ tstHelper("shKA");
}
/* DLC FIXME: add test cases: from R0021F.ACE: ambiguous Tibetan/Sanskrit:
DLC NOW: warn, in "All" mode, about each occurrence of BD, DB, DG,
DGR, DGY, DM, GD, GN, MN (but not B+D etc.)
BDA'
B+DA
DBANG