ACIP->TMW now supports EWTS PUA {\uF021}-style escapes. Our extended ACIP is thus TMW-complete and useful for testing.
This commit is contained in:
parent
8f7322a056
commit
a39c5c12b0
3 changed files with 36 additions and 7 deletions
|
@ -84,6 +84,8 @@ public class TibetanMachineWeb implements THDLWylieConstants {
|
||||||
use special formatting to get those right (FIXME: warn
|
use special formatting to get those right (FIXME: warn
|
||||||
whenever they're used). */
|
whenever they're used). */
|
||||||
private static DuffCode[][] UnicodeToTMW = new DuffCode[256][1];
|
private static DuffCode[][] UnicodeToTMW = new DuffCode[256][1];
|
||||||
|
/** For mapping codepoints U+F021..U+F0FF to TMW. */
|
||||||
|
private static DuffCode[][] NonUnicodeToTMW = new DuffCode[256][1];
|
||||||
private static String fileName = "tibwn.ini";
|
private static String fileName = "tibwn.ini";
|
||||||
private static final String DELIMITER = "~";
|
private static final String DELIMITER = "~";
|
||||||
/** vowels that appear over the glyph: */
|
/** vowels that appear over the glyph: */
|
||||||
|
@ -603,6 +605,14 @@ public class TibetanMachineWeb implements THDLWylieConstants {
|
||||||
// could well be null):
|
// could well be null):
|
||||||
TMWtoTM[duffCodes[TMW].getFontNum()-1][duffCodes[TMW].getCharNum()-32]
|
TMWtoTM[duffCodes[TMW].getFontNum()-1][duffCodes[TMW].getCharNum()-32]
|
||||||
= duffCodes[TM]; // TMW->TM mapping
|
= duffCodes[TM]; // TMW->TM mapping
|
||||||
|
|
||||||
|
if (wylie.toLowerCase().startsWith("\\uf0")) {
|
||||||
|
int x = Integer.parseInt(wylie.substring("\\u".length()), 16);
|
||||||
|
ThdlDebug.verify((x >= 0xF000
|
||||||
|
&& x <= 0xF0FF));
|
||||||
|
NonUnicodeToTMW[x - '\uF000']
|
||||||
|
= new DuffCode[] { duffCodes[TMW] };
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
// Vowels etc. to use with this glyph:
|
// Vowels etc. to use with this glyph:
|
||||||
case 4:
|
case 4:
|
||||||
|
@ -628,8 +638,8 @@ public class TibetanMachineWeb implements THDLWylieConstants {
|
||||||
String subval = uTok.nextToken();
|
String subval = uTok.nextToken();
|
||||||
ThdlDebug.verify(subval.length() == 4 || subval.length() == 3);
|
ThdlDebug.verify(subval.length() == 4 || subval.length() == 3);
|
||||||
try {
|
try {
|
||||||
int x;
|
int x = Integer.parseInt(subval, 16);
|
||||||
ThdlDebug.verify(((x = Integer.parseInt(subval, 16)) >= 0x0F00
|
ThdlDebug.verify((x >= 0x0F00
|
||||||
&& x <= 0x0FFF)
|
&& x <= 0x0FFF)
|
||||||
|| x == 0x5350
|
|| x == 0x5350
|
||||||
|| x == 0x534D
|
|| x == 0x534D
|
||||||
|
@ -1769,9 +1779,14 @@ private static final String Unicode_tab = "\t";
|
||||||
} else if ('\u0F81' == ch) {
|
} else if ('\u0F81' == ch) {
|
||||||
return tmwFor0F81;
|
return tmwFor0F81;
|
||||||
} else {
|
} else {
|
||||||
DuffCode[] x = UnicodeToTMW[ch - '\u0F00'];
|
if (ch >= '\u0F00' && ch <= '\u0FFF') {
|
||||||
if (null == x[0]) return null;
|
DuffCode[] x = UnicodeToTMW[ch - '\u0F00'];
|
||||||
return x;
|
if (null != x[0]) return x;
|
||||||
|
} else if (ch >= '\uF021' && ch <= '\uF0FF') {
|
||||||
|
DuffCode[] x = NonUnicodeToTMW[ch - '\uF000'];
|
||||||
|
if (null != x[0]) return x;
|
||||||
|
}
|
||||||
|
return null;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -607,8 +607,18 @@ public class ACIPConverter {
|
||||||
}
|
}
|
||||||
continue;
|
continue;
|
||||||
} else if (stype == TString.UNICODE_CHARACTER) {
|
} else if (stype == TString.UNICODE_CHARACTER) {
|
||||||
|
ThdlDebug.verify(1 == s.getText().length());
|
||||||
if (null != writer) {
|
if (null != writer) {
|
||||||
unicode = s.getText();
|
char ch = s.getText().charAt(0);
|
||||||
|
if (ch >= '\uF021' && ch <= '\uF0FF') {
|
||||||
|
hasErrors = true;
|
||||||
|
String errorMessage = "[#ERROR CONVERTING ACIP DOCUMENT: The Unicode escape '" + ch + "' with ordinal " + (int)ch + " is in the private-use area (PUA) of Unicode and will thus not be written out into the output lest you think other tools will be able to understand this non-standard construction.]";
|
||||||
|
writer.write(errorMessage);
|
||||||
|
if (null != errors)
|
||||||
|
errors.append(errorMessage + "\n");
|
||||||
|
continue; // FIXME: dropping output if null != tdoc
|
||||||
|
} else
|
||||||
|
unicode = s.getText();
|
||||||
}
|
}
|
||||||
if (null != tdoc) {
|
if (null != tdoc) {
|
||||||
duff = TibetanMachineWeb.mapUnicodeToTMW(s.getText().charAt(0));
|
duff = TibetanMachineWeb.mapUnicodeToTMW(s.getText().charAt(0));
|
||||||
|
|
|
@ -41,6 +41,7 @@ public class TString {
|
||||||
* is to be converted to something other than Tibetan text.
|
* is to be converted to something other than Tibetan text.
|
||||||
* (Chinese Unicode, Latin, etc. all qualify as non-Tibetan.) */
|
* (Chinese Unicode, Latin, etc. all qualify as non-Tibetan.) */
|
||||||
public boolean isLatin() {
|
public boolean isLatin() {
|
||||||
|
char ch;
|
||||||
return (type != TIBETAN_NON_PUNCTUATION
|
return (type != TIBETAN_NON_PUNCTUATION
|
||||||
&& type != TIBETAN_PUNCTUATION
|
&& type != TIBETAN_PUNCTUATION
|
||||||
&& type != TSHEG_BAR_ADORNMENT
|
&& type != TSHEG_BAR_ADORNMENT
|
||||||
|
@ -49,7 +50,10 @@ public class TString {
|
||||||
&& type != START_SLASH
|
&& type != START_SLASH
|
||||||
&& type != END_SLASH
|
&& type != END_SLASH
|
||||||
&& (type != UNICODE_CHARACTER
|
&& (type != UNICODE_CHARACTER
|
||||||
|| !UnicodeUtils.isInTibetanRange(getText().charAt(0))));
|
|| !(UnicodeUtils.isInTibetanRange(ch = getText().charAt(0))
|
||||||
|
// EWTS maps some TMW glyphs to this Unicode
|
||||||
|
// private-use area (PUA):
|
||||||
|
|| (ch >= '\uF021' && ch <= '\uF0FF'))));
|
||||||
}
|
}
|
||||||
|
|
||||||
/** For ACIP [#COMMENTS] and EWTS (DLC FIXME: what are EWTS comments?) */
|
/** For ACIP [#COMMENTS] and EWTS (DLC FIXME: what are EWTS comments?) */
|
||||||
|
|
Loading…
Reference in a new issue