ACIP->TMW now supports EWTS PUA {\uF021}-style escapes. Our extended ACIP is thus TMW-complete and useful for testing.
This commit is contained in:
parent
8f7322a056
commit
a39c5c12b0
3 changed files with 36 additions and 7 deletions
|
@ -84,6 +84,8 @@ public class TibetanMachineWeb implements THDLWylieConstants {
|
|||
use special formatting to get those right (FIXME: warn
|
||||
whenever they're used). */
|
||||
private static DuffCode[][] UnicodeToTMW = new DuffCode[256][1];
|
||||
/** For mapping codepoints U+F021..U+F0FF to TMW. */
|
||||
private static DuffCode[][] NonUnicodeToTMW = new DuffCode[256][1];
|
||||
private static String fileName = "tibwn.ini";
|
||||
private static final String DELIMITER = "~";
|
||||
/** vowels that appear over the glyph: */
|
||||
|
@ -603,6 +605,14 @@ public class TibetanMachineWeb implements THDLWylieConstants {
|
|||
// could well be null):
|
||||
TMWtoTM[duffCodes[TMW].getFontNum()-1][duffCodes[TMW].getCharNum()-32]
|
||||
= duffCodes[TM]; // TMW->TM mapping
|
||||
|
||||
if (wylie.toLowerCase().startsWith("\\uf0")) {
|
||||
int x = Integer.parseInt(wylie.substring("\\u".length()), 16);
|
||||
ThdlDebug.verify((x >= 0xF000
|
||||
&& x <= 0xF0FF));
|
||||
NonUnicodeToTMW[x - '\uF000']
|
||||
= new DuffCode[] { duffCodes[TMW] };
|
||||
}
|
||||
break;
|
||||
// Vowels etc. to use with this glyph:
|
||||
case 4:
|
||||
|
@ -628,8 +638,8 @@ public class TibetanMachineWeb implements THDLWylieConstants {
|
|||
String subval = uTok.nextToken();
|
||||
ThdlDebug.verify(subval.length() == 4 || subval.length() == 3);
|
||||
try {
|
||||
int x;
|
||||
ThdlDebug.verify(((x = Integer.parseInt(subval, 16)) >= 0x0F00
|
||||
int x = Integer.parseInt(subval, 16);
|
||||
ThdlDebug.verify((x >= 0x0F00
|
||||
&& x <= 0x0FFF)
|
||||
|| x == 0x5350
|
||||
|| x == 0x534D
|
||||
|
@ -1769,9 +1779,14 @@ private static final String Unicode_tab = "\t";
|
|||
} else if ('\u0F81' == ch) {
|
||||
return tmwFor0F81;
|
||||
} else {
|
||||
DuffCode[] x = UnicodeToTMW[ch - '\u0F00'];
|
||||
if (null == x[0]) return null;
|
||||
return x;
|
||||
if (ch >= '\u0F00' && ch <= '\u0FFF') {
|
||||
DuffCode[] x = UnicodeToTMW[ch - '\u0F00'];
|
||||
if (null != x[0]) return x;
|
||||
} else if (ch >= '\uF021' && ch <= '\uF0FF') {
|
||||
DuffCode[] x = NonUnicodeToTMW[ch - '\uF000'];
|
||||
if (null != x[0]) return x;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -607,8 +607,18 @@ public class ACIPConverter {
|
|||
}
|
||||
continue;
|
||||
} else if (stype == TString.UNICODE_CHARACTER) {
|
||||
ThdlDebug.verify(1 == s.getText().length());
|
||||
if (null != writer) {
|
||||
unicode = s.getText();
|
||||
char ch = s.getText().charAt(0);
|
||||
if (ch >= '\uF021' && ch <= '\uF0FF') {
|
||||
hasErrors = true;
|
||||
String errorMessage = "[#ERROR CONVERTING ACIP DOCUMENT: The Unicode escape '" + ch + "' with ordinal " + (int)ch + " is in the private-use area (PUA) of Unicode and will thus not be written out into the output lest you think other tools will be able to understand this non-standard construction.]";
|
||||
writer.write(errorMessage);
|
||||
if (null != errors)
|
||||
errors.append(errorMessage + "\n");
|
||||
continue; // FIXME: dropping output if null != tdoc
|
||||
} else
|
||||
unicode = s.getText();
|
||||
}
|
||||
if (null != tdoc) {
|
||||
duff = TibetanMachineWeb.mapUnicodeToTMW(s.getText().charAt(0));
|
||||
|
|
|
@ -41,6 +41,7 @@ public class TString {
|
|||
* is to be converted to something other than Tibetan text.
|
||||
* (Chinese Unicode, Latin, etc. all qualify as non-Tibetan.) */
|
||||
public boolean isLatin() {
|
||||
char ch;
|
||||
return (type != TIBETAN_NON_PUNCTUATION
|
||||
&& type != TIBETAN_PUNCTUATION
|
||||
&& type != TSHEG_BAR_ADORNMENT
|
||||
|
@ -49,7 +50,10 @@ public class TString {
|
|||
&& type != START_SLASH
|
||||
&& type != END_SLASH
|
||||
&& (type != UNICODE_CHARACTER
|
||||
|| !UnicodeUtils.isInTibetanRange(getText().charAt(0))));
|
||||
|| !(UnicodeUtils.isInTibetanRange(ch = getText().charAt(0))
|
||||
// EWTS maps some TMW glyphs to this Unicode
|
||||
// private-use area (PUA):
|
||||
|| (ch >= '\uF021' && ch <= '\uF0FF'))));
|
||||
}
|
||||
|
||||
/** For ACIP [#COMMENTS] and EWTS (DLC FIXME: what are EWTS comments?) */
|
||||
|
|
Loading…
Reference in a new issue