ACIP->TMW now supports EWTS PUA {\uF021}-style escapes. Our extended ACIP is thus TMW-complete and useful for testing.

This commit is contained in:
dchandler 2003-12-08 07:15:27 +00:00
parent 8f7322a056
commit a39c5c12b0
3 changed files with 36 additions and 7 deletions

View file

@ -84,6 +84,8 @@ public class TibetanMachineWeb implements THDLWylieConstants {
use special formatting to get those right (FIXME: warn use special formatting to get those right (FIXME: warn
whenever they're used). */ whenever they're used). */
private static DuffCode[][] UnicodeToTMW = new DuffCode[256][1]; private static DuffCode[][] UnicodeToTMW = new DuffCode[256][1];
/** For mapping codepoints U+F021..U+F0FF to TMW. */
private static DuffCode[][] NonUnicodeToTMW = new DuffCode[256][1];
private static String fileName = "tibwn.ini"; private static String fileName = "tibwn.ini";
private static final String DELIMITER = "~"; private static final String DELIMITER = "~";
/** vowels that appear over the glyph: */ /** vowels that appear over the glyph: */
@ -603,6 +605,14 @@ public class TibetanMachineWeb implements THDLWylieConstants {
// could well be null): // could well be null):
TMWtoTM[duffCodes[TMW].getFontNum()-1][duffCodes[TMW].getCharNum()-32] TMWtoTM[duffCodes[TMW].getFontNum()-1][duffCodes[TMW].getCharNum()-32]
= duffCodes[TM]; // TMW->TM mapping = duffCodes[TM]; // TMW->TM mapping
if (wylie.toLowerCase().startsWith("\\uf0")) {
int x = Integer.parseInt(wylie.substring("\\u".length()), 16);
ThdlDebug.verify((x >= 0xF000
&& x <= 0xF0FF));
NonUnicodeToTMW[x - '\uF000']
= new DuffCode[] { duffCodes[TMW] };
}
break; break;
// Vowels etc. to use with this glyph: // Vowels etc. to use with this glyph:
case 4: case 4:
@ -628,8 +638,8 @@ public class TibetanMachineWeb implements THDLWylieConstants {
String subval = uTok.nextToken(); String subval = uTok.nextToken();
ThdlDebug.verify(subval.length() == 4 || subval.length() == 3); ThdlDebug.verify(subval.length() == 4 || subval.length() == 3);
try { try {
int x; int x = Integer.parseInt(subval, 16);
ThdlDebug.verify(((x = Integer.parseInt(subval, 16)) >= 0x0F00 ThdlDebug.verify((x >= 0x0F00
&& x <= 0x0FFF) && x <= 0x0FFF)
|| x == 0x5350 || x == 0x5350
|| x == 0x534D || x == 0x534D
@ -1769,9 +1779,14 @@ private static final String Unicode_tab = "\t";
} else if ('\u0F81' == ch) { } else if ('\u0F81' == ch) {
return tmwFor0F81; return tmwFor0F81;
} else { } else {
DuffCode[] x = UnicodeToTMW[ch - '\u0F00']; if (ch >= '\u0F00' && ch <= '\u0FFF') {
if (null == x[0]) return null; DuffCode[] x = UnicodeToTMW[ch - '\u0F00'];
return x; if (null != x[0]) return x;
} else if (ch >= '\uF021' && ch <= '\uF0FF') {
DuffCode[] x = NonUnicodeToTMW[ch - '\uF000'];
if (null != x[0]) return x;
}
return null;
} }
} }

View file

@ -607,8 +607,18 @@ public class ACIPConverter {
} }
continue; continue;
} else if (stype == TString.UNICODE_CHARACTER) { } else if (stype == TString.UNICODE_CHARACTER) {
ThdlDebug.verify(1 == s.getText().length());
if (null != writer) { if (null != writer) {
unicode = s.getText(); char ch = s.getText().charAt(0);
if (ch >= '\uF021' && ch <= '\uF0FF') {
hasErrors = true;
String errorMessage = "[#ERROR CONVERTING ACIP DOCUMENT: The Unicode escape '" + ch + "' with ordinal " + (int)ch + " is in the private-use area (PUA) of Unicode and will thus not be written out into the output lest you think other tools will be able to understand this non-standard construction.]";
writer.write(errorMessage);
if (null != errors)
errors.append(errorMessage + "\n");
continue; // FIXME: dropping output if null != tdoc
} else
unicode = s.getText();
} }
if (null != tdoc) { if (null != tdoc) {
duff = TibetanMachineWeb.mapUnicodeToTMW(s.getText().charAt(0)); duff = TibetanMachineWeb.mapUnicodeToTMW(s.getText().charAt(0));

View file

@ -41,6 +41,7 @@ public class TString {
* is to be converted to something other than Tibetan text. * is to be converted to something other than Tibetan text.
* (Chinese Unicode, Latin, etc. all qualify as non-Tibetan.) */ * (Chinese Unicode, Latin, etc. all qualify as non-Tibetan.) */
public boolean isLatin() { public boolean isLatin() {
char ch;
return (type != TIBETAN_NON_PUNCTUATION return (type != TIBETAN_NON_PUNCTUATION
&& type != TIBETAN_PUNCTUATION && type != TIBETAN_PUNCTUATION
&& type != TSHEG_BAR_ADORNMENT && type != TSHEG_BAR_ADORNMENT
@ -49,7 +50,10 @@ public class TString {
&& type != START_SLASH && type != START_SLASH
&& type != END_SLASH && type != END_SLASH
&& (type != UNICODE_CHARACTER && (type != UNICODE_CHARACTER
|| !UnicodeUtils.isInTibetanRange(getText().charAt(0)))); || !(UnicodeUtils.isInTibetanRange(ch = getText().charAt(0))
// EWTS maps some TMW glyphs to this Unicode
// private-use area (PUA):
|| (ch >= '\uF021' && ch <= '\uF0FF'))));
} }
/** For ACIP [#COMMENTS] and EWTS (DLC FIXME: what are EWTS comments?) */ /** For ACIP [#COMMENTS] and EWTS (DLC FIXME: what are EWTS comments?) */