From a39c5c12b095cb7f519133512c9b7cc572c8a54a Mon Sep 17 00:00:00 2001 From: dchandler Date: Mon, 8 Dec 2003 07:15:27 +0000 Subject: [PATCH] ACIP->TMW now supports EWTS PUA {\uF021}-style escapes. Our extended ACIP is thus TMW-complete and useful for testing. --- .../org/thdl/tib/text/TibetanMachineWeb.java | 25 +++++++++++++++---- .../org/thdl/tib/text/ttt/ACIPConverter.java | 12 ++++++++- source/org/thdl/tib/text/ttt/TString.java | 6 ++++- 3 files changed, 36 insertions(+), 7 deletions(-) diff --git a/source/org/thdl/tib/text/TibetanMachineWeb.java b/source/org/thdl/tib/text/TibetanMachineWeb.java index 0752ef2..011f694 100644 --- a/source/org/thdl/tib/text/TibetanMachineWeb.java +++ b/source/org/thdl/tib/text/TibetanMachineWeb.java @@ -84,6 +84,8 @@ public class TibetanMachineWeb implements THDLWylieConstants { use special formatting to get those right (FIXME: warn whenever they're used). */ private static DuffCode[][] UnicodeToTMW = new DuffCode[256][1]; + /** For mapping codepoints U+F021..U+F0FF to TMW. */ + private static DuffCode[][] NonUnicodeToTMW = new DuffCode[256][1]; private static String fileName = "tibwn.ini"; private static final String DELIMITER = "~"; /** vowels that appear over the glyph: */ @@ -603,6 +605,14 @@ public class TibetanMachineWeb implements THDLWylieConstants { // could well be null): TMWtoTM[duffCodes[TMW].getFontNum()-1][duffCodes[TMW].getCharNum()-32] = duffCodes[TM]; // TMW->TM mapping + + if (wylie.toLowerCase().startsWith("\\uf0")) { + int x = Integer.parseInt(wylie.substring("\\u".length()), 16); + ThdlDebug.verify((x >= 0xF000 + && x <= 0xF0FF)); + NonUnicodeToTMW[x - '\uF000'] + = new DuffCode[] { duffCodes[TMW] }; + } break; // Vowels etc. 
to use with this glyph: case 4: @@ -628,8 +638,8 @@ public class TibetanMachineWeb implements THDLWylieConstants { String subval = uTok.nextToken(); ThdlDebug.verify(subval.length() == 4 || subval.length() == 3); try { - int x; - ThdlDebug.verify(((x = Integer.parseInt(subval, 16)) >= 0x0F00 + int x = Integer.parseInt(subval, 16); + ThdlDebug.verify((x >= 0x0F00 && x <= 0x0FFF) || x == 0x5350 || x == 0x534D @@ -1769,9 +1779,14 @@ private static final String Unicode_tab = "\t"; } else if ('\u0F81' == ch) { return tmwFor0F81; } else { - DuffCode[] x = UnicodeToTMW[ch - '\u0F00']; - if (null == x[0]) return null; - return x; + if (ch >= '\u0F00' && ch <= '\u0FFF') { + DuffCode[] x = UnicodeToTMW[ch - '\u0F00']; + if (null != x[0]) return x; + } else if (ch >= '\uF021' && ch <= '\uF0FF') { + DuffCode[] x = NonUnicodeToTMW[ch - '\uF000']; + if (null != x[0]) return x; + } + return null; } } diff --git a/source/org/thdl/tib/text/ttt/ACIPConverter.java b/source/org/thdl/tib/text/ttt/ACIPConverter.java index 0b551b9..ac27ba0 100644 --- a/source/org/thdl/tib/text/ttt/ACIPConverter.java +++ b/source/org/thdl/tib/text/ttt/ACIPConverter.java @@ -607,8 +607,18 @@ public class ACIPConverter { } continue; } else if (stype == TString.UNICODE_CHARACTER) { + ThdlDebug.verify(1 == s.getText().length()); if (null != writer) { - unicode = s.getText(); + char ch = s.getText().charAt(0); + if (ch >= '\uF021' && ch <= '\uF0FF') { + hasErrors = true; + String errorMessage = "[#ERROR CONVERTING ACIP DOCUMENT: The Unicode escape '" + ch + "' with ordinal " + (int)ch + " is in the private-use area (PUA) of Unicode and will thus not be written out into the output lest you think other tools will be able to understand this non-standard construction.]"; + writer.write(errorMessage); + if (null != errors) + errors.append(errorMessage + "\n"); + continue; // FIXME: dropping output if null != tdoc + } else + unicode = s.getText(); } if (null != tdoc) { duff = 
TibetanMachineWeb.mapUnicodeToTMW(s.getText().charAt(0)); diff --git a/source/org/thdl/tib/text/ttt/TString.java b/source/org/thdl/tib/text/ttt/TString.java index 632c531..22c4343 100644 --- a/source/org/thdl/tib/text/ttt/TString.java +++ b/source/org/thdl/tib/text/ttt/TString.java @@ -41,6 +41,7 @@ public class TString { * is to be converted to something other than Tibetan text. * (Chinese Unicode, Latin, etc. all qualify as non-Tibetan.) */ public boolean isLatin() { + char ch; return (type != TIBETAN_NON_PUNCTUATION && type != TIBETAN_PUNCTUATION && type != TSHEG_BAR_ADORNMENT @@ -49,7 +50,10 @@ public class TString { && type != START_SLASH && type != END_SLASH && (type != UNICODE_CHARACTER - || !UnicodeUtils.isInTibetanRange(getText().charAt(0)))); + || !(UnicodeUtils.isInTibetanRange(ch = getText().charAt(0)) + // EWTS maps some TMW glyphs to this Unicode + // private-use area (PUA): + || (ch >= '\uF021' && ch <= '\uF0FF')))); } /** For ACIP [#COMMENTS] and EWTS (DLC FIXME: what are EWTS comments?) */