The ACIP {NYA%} is supported. {NYAo} and {NYAx} are confusing to me,
because I don't know which glyphs o and x correspond to. For that reason, they cause ERRORs. The proposed THDL Extended Wylie ~X and X are now used for U+0F35 and U+0F37 respectively.
This commit is contained in:
parent
f57cdda867
commit
07e360d9a8
8 changed files with 96 additions and 32 deletions
|
@ -144,7 +144,7 @@ public class TMW_RTF_TO_THDL_WYLIETest extends TestCase {
|
|||
/** Tests the --to-wylie converter mode of {@link
|
||||
* org.thdl.tib.input.TibetanConverter}. */
|
||||
public void testConverterMode() {
|
||||
helper("--to-wylie", "Conversion", 44);
|
||||
helper("--to-wylie", "Conversion", 0);
|
||||
}
|
||||
|
||||
/** Tests the --to-tibetan-machine converter mode of {@link
|
||||
|
|
|
@ -37,8 +37,8 @@ rgyal ba kun dngos mtsho skyes rdo rje bstan pa'i rtsa lag thams cad mkhyen pa z
|
|||
bka' drin gzugs can dbyig 'dzin lto 'dir shong 'gyur min na kun mkhyen srang las gang gis gzhal//\par
|
||||
\par
|
||||
li khri'i lcug phran mkhyen pa'i snang ba can//\par
|
||||
'jam mgon blo<<[[JSKAD_TMW_TO_WYLIE_ERROR_NO_SUCH_WYLIE: Cannot convert DuffCode <duffcode font=TibetanMachineWeb8 charNum=101 character=e/> to THDL Extended Wylie. Please see the documentation for the TMW font and transcribe this yourself.]]>>. yi lang tsho baza<<[[JSKAD_TMW_TO_WYLIE_ERROR_NO_SUCH_WYLIE: Cannot convert DuffCode <duffcode font=TibetanMachineWeb8 charNum=101 character=e/> to THDL Extended Wylie. Please see the documentation for the TMW font and transcribe this yourself.]]>>.nga po'i tshon//\par
|
||||
kha dog so sor bkra ba'i graga<<[[JSKAD_TMW_TO_WYLIE_ERROR_NO_SUCH_WYLIE: Cannot convert DuffCode <duffcode font=TibetanMachineWeb8 charNum=101 character=e/> to THDL Extended Wylie. Please see the documentation for the TMW font and transcribe this yourself.]]>>.sa pa<<[[JSKAD_TMW_TO_WYLIE_ERROR_NO_SUCH_WYLIE: Cannot convert DuffCode <duffcode font=TibetanMachineWeb8 charNum=101 character=e/> to THDL Extended Wylie. Please see the documentation for the TMW font and transcribe this yourself.]]>>.'i rgyan//\par
|
||||
'jam mgon bloX. yi lang tsho bazaX.nga po'i tshon//\par
|
||||
kha dog so sor bkra ba'i gragaX.sa paX.'i rgyan//\par
|
||||
phyogs bral rna lung 'god mkhas rtag tu rgyal//\f2\fs44\i0\b0\ul0\cf0\par
|
||||
\par
|
||||
\f1\fs28\i0\b0\ul0 dpal ldan chos kyi rang bzhin ngos yangs par//\par
|
||||
|
|
|
@ -5,6 +5,10 @@
|
|||
// - initial // marks a comment
|
||||
// - blank lines should be ignored
|
||||
// - <?x?> marks a command
|
||||
//
|
||||
// If you change the Wylie here, it can break the ACIP->TMW and
|
||||
// ACIP->Unicode conversion. So keep ACIPRules in sync with this, and be
|
||||
// sure to run 'ant clean check' after your change.
|
||||
|
||||
<?Input:Punctuation?>
|
||||
//_~32,1~0,32
|
||||
|
@ -34,6 +38,10 @@ $~38,5~~9,41~~~~~~~0F06
|
|||
// dbu.khang.g-yas: (If this changes, edit ACIPConverter)
|
||||
)~209,1~~9,94~~~~~~~0F3D
|
||||
H~239,1~~8,92~~~~~~~0F7F
|
||||
// mtshan.rtags:
|
||||
X~101,5~~9,101~~~~~~~0F37
|
||||
// mtshan.rtags zhes.sa:
|
||||
__TILDE__X~102,5~~9,102~~~~~~~0F35
|
||||
|
||||
// 8,91 is the small bindu. We say that this, and not 8,90 (large
|
||||
// anusvara) is the glyph that M yields. This is because [8,90] is
|
||||
|
@ -971,10 +979,6 @@ r~176,4~~8,71~~~~~~~0FB2
|
|||
// mchan rtags leading:
|
||||
\tmw8100~100,5~~9,100~~~~~~~none
|
||||
|
||||
// mtshan.rtags:
|
||||
\tmw8101~101,5~~9,101~~~~~~~0F37
|
||||
// mtshan.rtags zhes.sa:
|
||||
\tmw8102~102,5~~9,102~~~~~~~0F35
|
||||
// che.mgo:
|
||||
\tmw8103~103,5~~9,103~~~~~~~0F38
|
||||
// kuruka:
|
||||
|
|
|
@ -186,10 +186,10 @@ public class ACIPConverter {
|
|||
ByteArrayOutputStream sw = new ByteArrayOutputStream();
|
||||
ArrayList al = ACIPTshegBarScanner.scan(acip, errors, -1);
|
||||
try {
|
||||
if (null != al
|
||||
&& convertToUnicode(al, sw, errors,
|
||||
warnings, writeWarningsToResult,
|
||||
warningLevel)) {
|
||||
if (null != al) {
|
||||
convertToUnicode(al, sw, errors,
|
||||
warnings, writeWarningsToResult,
|
||||
warningLevel);
|
||||
return sw.toString("UTF-8");
|
||||
} else {
|
||||
return null;
|
||||
|
@ -282,6 +282,33 @@ public class ACIPConverter {
|
|||
String text = "[#ERROR CONVERTING ACIP DOCUMENT: Lexical error: " + s.getText() + "]";
|
||||
if (null != writer) writer.write(text);
|
||||
if (null != tdoc) tdoc.appendRoman(text, Color.RED);
|
||||
} else if (stype == ACIPString.TSHEG_BAR_ADORNMENT) {
|
||||
if (lastGuyWasNonPunct) {
|
||||
String err = "[#ERROR CONVERTING ACIP DOCUMENT: This converter cannot yet convert " + s.getText() + " because the converter's author is unclear what the result should be.]";
|
||||
if (null != writer) {
|
||||
String uni = ACIPRules.getUnicodeFor(s.getText(), false);
|
||||
if (null == uni) {
|
||||
hasErrors = true;
|
||||
uni = err;
|
||||
}
|
||||
if (null != writer) writer.write(uni);
|
||||
}
|
||||
if (null != tdoc) {
|
||||
String wylie
|
||||
= ACIPRules.getWylieForACIPOther(s.getText());
|
||||
if (null == wylie) {
|
||||
hasErrors = true;
|
||||
tdoc.appendRoman(err, Color.RED);
|
||||
} else {
|
||||
tdoc.appendDuffCodes(new DuffCode[] { TibetanMachineWeb.getGlyph(wylie) },
|
||||
Color.BLACK);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
hasErrors = true;
|
||||
}
|
||||
lastGuyWasNonPunct = true; // this stuff is not really punctuation
|
||||
lastGuy = null;
|
||||
} else if (stype == ACIPString.WARNING) {
|
||||
lastGuyWasNonPunct = false;
|
||||
lastGuy = null;
|
||||
|
@ -408,10 +435,10 @@ public class ACIPConverter {
|
|||
&& (lpl = lastGuy.get(lastGuy.size() - 1)).size() == 1
|
||||
&& lpl.get(0).getLeft().equals("G")
|
||||
&& // it's (G . anything)
|
||||
// followed by some number
|
||||
// of spaces (at least one,
|
||||
// this one) and then a
|
||||
// comma:
|
||||
// followed by some number
|
||||
// of spaces (at least one,
|
||||
// this one) and then a
|
||||
// comma:
|
||||
peekaheadFindsSpacesAndComma(scan, i+1))) {
|
||||
if (null != writer) {
|
||||
unicode = " ";
|
||||
|
|
|
@ -236,7 +236,7 @@ class ACIPRules {
|
|||
acipOther2wylie.put(";", ";");
|
||||
acipOther2wylie.put("*", "@");
|
||||
acipOther2wylie.put("#", "@#");
|
||||
acipOther2wylie.put("%", "%");
|
||||
acipOther2wylie.put("%", "~X");
|
||||
acipOther2wylie.put("&", "&");
|
||||
|
||||
acipOther2wylie.put("0", "0");
|
||||
|
|
|
@ -77,9 +77,11 @@ public class ACIPString {
|
|||
public static final int END_PAREN = 16;
|
||||
/** For things that may not be legal syntax, such as {KA . KHA} */
|
||||
public static final int WARNING = 17;
|
||||
/** For ACIP %, o, and x */
|
||||
public static final int TSHEG_BAR_ADORNMENT = 18;
|
||||
/** For things that are not legal syntax, such as a file that
|
||||
* contains just "[# HALF A COMMEN" */
|
||||
public static final int ERROR = 18;
|
||||
public static final int ERROR = 19;
|
||||
|
||||
/** Returns true if and only if this string is Latin (usually
|
||||
* English). Returns false if this string is transliteration of
|
||||
|
@ -135,6 +137,7 @@ public class ACIPString {
|
|||
if (type == START_PAREN) typeString = "START_PAREN";
|
||||
if (type == END_PAREN) typeString = "END_PAREN";
|
||||
if (type == WARNING) typeString = "WARNING";
|
||||
if (type == TSHEG_BAR_ADORNMENT) typeString = "TSHEG_BAR_ADORNMENT";
|
||||
if (type == ERROR) typeString = "ERROR";
|
||||
return typeString + ":{" + getText() + "}";
|
||||
}
|
||||
|
|
|
@ -767,9 +767,16 @@ public class ACIPTshegBarScanner {
|
|||
case ';':
|
||||
case '`':
|
||||
case '#':
|
||||
case '%':
|
||||
case 'x':
|
||||
case 'o':
|
||||
|
||||
boolean legalTshegBarAdornment = false;
|
||||
// The tsheg bar ends here; new token.
|
||||
if (startOfString < i) {
|
||||
if (currentType == ACIPString.TIBETAN_NON_PUNCTUATION
|
||||
&& isTshegBarAdornment(ch))
|
||||
legalTshegBarAdornment = true;
|
||||
al.add(new ACIPString(s.substring(startOfString, i),
|
||||
currentType));
|
||||
}
|
||||
|
@ -780,7 +787,8 @@ public class ACIPTshegBarScanner {
|
|||
if (('\r' == ch
|
||||
|| ('\n' == ch && i > 0 && s.charAt(i - 1) != '\r'))
|
||||
&& !al.isEmpty()
|
||||
&& ((ACIPString)al.get(al.size() - 1)).getType() == ACIPString.TIBETAN_NON_PUNCTUATION) {
|
||||
&& (((ACIPString)al.get(al.size() - 1)).getType() == ACIPString.TIBETAN_NON_PUNCTUATION
|
||||
|| ((ACIPString)al.get(al.size() - 1)).getType() == ACIPString.TSHEG_BAR_ADORNMENT)) {
|
||||
al.add(new ACIPString(" ", ACIPString.TIBETAN_PUNCTUATION));
|
||||
}
|
||||
|
||||
|
@ -788,7 +796,8 @@ public class ACIPTshegBarScanner {
|
|||
if (('\r' == ch
|
||||
|| ('\n' == ch && i > 0 && s.charAt(i - 1) != '\r'))
|
||||
&& !al.isEmpty()
|
||||
&& ((ACIPString)al.get(al.size() - 1)).getType() == ACIPString.TIBETAN_PUNCTUATION
|
||||
&& (((ACIPString)al.get(al.size() - 1)).getType() == ACIPString.TIBETAN_PUNCTUATION
|
||||
|| ((ACIPString)al.get(al.size() - 1)).getType() == ACIPString.TSHEG_BAR_ADORNMENT)
|
||||
&& ((ACIPString)al.get(al.size() - 1)).getText().equals(",")
|
||||
&& s.charAt(i-1) == ','
|
||||
&& (i + (('\r' == ch) ? 2 : 1) < sl
|
||||
|
@ -804,9 +813,17 @@ public class ACIPTshegBarScanner {
|
|||
|| (realNewline
|
||||
= ((rn = ('\n' == ch && i >= 3 && s.charAt(i-3) == '\r' && s.charAt(i-2) == '\n' && s.charAt(i-1) == '\r'))
|
||||
|| ('\n' == ch && i >= 1 && s.charAt(i-1) == '\n')))) {
|
||||
for (int h = 0; h < (realNewline ? 2 : 1); h++)
|
||||
al.add(new ACIPString(rn ? s.substring(i - 1, i+1) : s.substring(i, i+1),
|
||||
ACIPString.TIBETAN_PUNCTUATION));
|
||||
for (int h = 0; h < (realNewline ? 2 : 1); h++) {
|
||||
if (isTshegBarAdornment(ch) && !legalTshegBarAdornment) {
|
||||
al.add(new ACIPString("The ACIP " + ch + " must be glued to the end of a tsheg bar, but this one was not",
|
||||
ACIPString.ERROR));
|
||||
} else {
|
||||
al.add(new ACIPString(rn ? s.substring(i - 1, i+1) : s.substring(i, i+1),
|
||||
(legalTshegBarAdornment
|
||||
? ACIPString.TSHEG_BAR_ADORNMENT
|
||||
: ACIPString.TIBETAN_PUNCTUATION)));
|
||||
}
|
||||
}
|
||||
}
|
||||
startOfString = i+1;
|
||||
currentType = ACIPString.ERROR;
|
||||
|
@ -910,15 +927,17 @@ public class ACIPTshegBarScanner {
|
|||
return ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n';
|
||||
}
|
||||
|
||||
/** Returns true if and only if ch is one of the ACIP tsheg bar adornment characters: '%', 'o', or 'x'. */
|
||||
private static boolean isTshegBarAdornment(char ch) {
|
||||
return (ch == '%' || ch == 'o' || ch == 'x');
|
||||
}
|
||||
|
||||
/** See implementation. */
|
||||
private static boolean isAlpha(char ch) {
|
||||
return ch == '\'' // 23rd consonant
|
||||
|
||||
// combining punctuation, vowels:
|
||||
|| ch == '%'
|
||||
|| ch == 'o'
|
||||
|| ch == 'm'
|
||||
|| ch == 'x'
|
||||
|| ch == ':'
|
||||
|| ch == '^'
|
||||
// DLC FIXME: we must treat this guy like a vowel, a special vowel that numerals can take on. Until then, warn. || ch == '\\'
|
||||
|
|
|
@ -7171,7 +7171,13 @@ tstHelper("ZUR");
|
|||
"[TIBETAN_NON_PUNCTUATION:{K}, ERROR:{Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.}, TIBETAN_PUNCTUATION:{,}]");
|
||||
|
||||
|
||||
shelp("MTHAR%", "", "[TIBETAN_NON_PUNCTUATION:{MTHAR%}]");
|
||||
shelp("MTHAR%", "", "[TIBETAN_NON_PUNCTUATION:{MTHAR}, TSHEG_BAR_ADORNMENT:{%}]");
|
||||
shelp("MTHARo", "", "[TIBETAN_NON_PUNCTUATION:{MTHAR}, TSHEG_BAR_ADORNMENT:{o}]");
|
||||
shelp("MTHARx", "", "[TIBETAN_NON_PUNCTUATION:{MTHAR}, TSHEG_BAR_ADORNMENT:{x}]");
|
||||
|
||||
shelp("MTHAR\n%", "", "[TIBETAN_NON_PUNCTUATION:{MTHAR}, TIBETAN_PUNCTUATION:{ }, ERROR:{The ACIP % must be glued to the end of a tsheg bar, but this one was not}]");
|
||||
shelp("MTHAR x", "", "[TIBETAN_NON_PUNCTUATION:{MTHAR}, TIBETAN_PUNCTUATION:{ }, ERROR:{The ACIP x must be glued to the end of a tsheg bar, but this one was not}]");
|
||||
|
||||
shelp("PHYIR;", "", "[TIBETAN_NON_PUNCTUATION:{PHYIR}, TIBETAN_PUNCTUATION:{;}]");
|
||||
shelp("......,DAM ",
|
||||
"",
|
||||
|
@ -7254,6 +7260,10 @@ tstHelper("ZUR");
|
|||
}
|
||||
|
||||
public void testACIPConversion() {
|
||||
uhelp("KA%\nKHA", "\u0f40\u0f35\u0f0b\u0f41");
|
||||
uhelp("KA%", "\u0f40\u0f35");
|
||||
uhelp("KAo", "\u0f40[#ERROR CONVERTING ACIP DOCUMENT: This converter cannot yet convert o because the converter's author is unclear what the result should be.]");
|
||||
uhelp("KAx", "\u0f40[#ERROR CONVERTING ACIP DOCUMENT: This converter cannot yet convert x because the converter's author is unclear what the result should be.]");
|
||||
uhelp("G+DHA", "\u0f42\u0fa2");
|
||||
uhelp("P'EE", "\u0f54\u0f71\u0f7b");
|
||||
|
||||
|
@ -7284,13 +7294,11 @@ tstHelper("ZUR");
|
|||
|
||||
uhelp("K'A:", "\u0f40\u0f71\u0f7f");
|
||||
|
||||
// DLC FIXME: in ACIP RTF files, (PARENTHESES) seem to make
|
||||
// text go from 24-point to 18-point. Thus, ACIP->Unicode.txt
|
||||
// is fundamentally flawed, whereas ACIP->Unicode.rtf is OK.
|
||||
|
||||
uhelp("/NY'EE/", "\u0f3C\u0f49\u0F71\u0F7B\u0f3D");
|
||||
uhelp("*#HUm: G+DHOO GRO`;.,", "\u0f04\u0f05\u0f04\u0f05\u0f05\u0f67\u0f74\u0f7e\u0f7f\u0f0b\u0f42\u0fa2\u0f7d\u0f0b\u0f42\u0fb2\u0f7c\u0f08\u0f11\u0f0c\u0f0d");
|
||||
uhelp("*#HUm: K+DHA GRO`;.,", "none");
|
||||
uhelp("*#HUm: G+DHOO GRO`;.,",
|
||||
"\u0f04\u0f05\u0f04\u0f05\u0f05\u0f67\u0f74\u0f7e\u0f7f\u0f0b\u0f42\u0fa2\u0f7d\u0f0b\u0f42\u0fb2\u0f7c\u0f08\u0f11\u0f0c\u0f0d");
|
||||
uhelp("*#HUm: K+DHA GRO`;.,",
|
||||
"\u0f04\u0f05\u0f04\u0f05\u0f05\u0f67\u0f74\u0f7e\u0f7f\u0f0b[#ERROR CONVERTING ACIP DOCUMENT: THE TSHEG BAR (\"SYLLABLE\") K+DHA IS ESSENTIALLY NOTHING.]\u0f0b\u0f42\u0fb2\u0f7c\u0f08\u0f11\u0f0c\u0f0d");
|
||||
}
|
||||
|
||||
/** Tests some more tsheg bars, these from Dr. Lacey's critical
|
||||
|
@ -8861,6 +8869,9 @@ tstHelper("shKA");
|
|||
}
|
||||
/* DLC FIXME: add test cases: from R0021F.ACE: ambiguous Tibetan/Sanskrit:
|
||||
|
||||
DLC NOW: warn, in "All" mode, about each occurrence of BD, DB, DG,
|
||||
DGR, DGY, DM, GD, GN, MN (but not B+D etc.)
|
||||
|
||||
BDA'
|
||||
B+DA
|
||||
DBANG
|
||||
|
|
Loading…
Reference in a new issue