ACIP->Unicode, without going through TMW, is now possible, so long as
\, the Sanskrit virama, is not used. Of the 1370-odd ACIP texts I've got here, about 57% make it through the gauntlet (fewer if you demand a vowel or disambiguator on every stack of a non-Tibetan tsheg bar).
This commit is contained in:
parent
245aac4911
commit
1afb3a0fdd
12 changed files with 646 additions and 40 deletions
|
@ -341,7 +341,7 @@ public final class LegalTshegBar
|
|||
EWC_ta, EWC_tha, EWC_da, EWC_na,
|
||||
EWC_pa, EWC_pha, EWC_ba, EWC_ma,
|
||||
EWC_tsa, EWC_tsha, EWC_dza, EWC_wa,
|
||||
EWC_zha, EWC_za, EWC_achung, EWC_ya,
|
||||
EWC_zha, EWC_za, EWC_achung, EWC_ya,
|
||||
EWC_ra, EWC_la, EWC_sha, EWC_sa,
|
||||
EWC_ha, EWC_a
|
||||
});
|
||||
|
@ -833,7 +833,7 @@ public final class LegalTshegBar
|
|||
return internalThrowThing(throwIfIllegal,
|
||||
errorBuf,
|
||||
"Illegal suffix -- not one of the ten legal suffixes: "
|
||||
+ UnicodeUtils.unicodeCodepointToString(suffix.charAt(0)));
|
||||
+ UnicodeUtils.unicodeCodepointToString(suffix.charAt(0), false));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -286,8 +286,41 @@ public class UnicodeUtils implements UnicodeConstants {
|
|||
}
|
||||
|
||||
/** Returns a human-readable, ASCII form of the Unicode codepoint
|
||||
cp. */
|
||||
public static String unicodeCodepointToString(char cp) {
|
||||
cp. If shortenIfPossible is true, then ASCII letters, digits,
|
||||
space, tab, and a fixed set of punctuation appear as themselves. */
|
||||
public static String unicodeCodepointToString(char cp,
|
||||
boolean shortenIfPossible) {
|
||||
if (shortenIfPossible) {
|
||||
if ((cp >= 'a' && cp <= 'z')
|
||||
|| (cp >= 'A' && cp <= 'Z')
|
||||
|| (cp >= '0' && cp <= '9')
|
||||
|| cp == '.'
|
||||
|| cp == ','
|
||||
|| cp == ' '
|
||||
|| cp == '\''
|
||||
|| cp == '"'
|
||||
|| cp == '+'
|
||||
|| cp == '-'
|
||||
|| cp == '='
|
||||
|| cp == '_'
|
||||
|| cp == '@'
|
||||
|| cp == '!'
|
||||
|| cp == '#'
|
||||
|| cp == '$'
|
||||
|| cp == '%'
|
||||
|| cp == '^'
|
||||
|| cp == '&'
|
||||
|| cp == '*'
|
||||
|| cp == '\t'
|
||||
|| cp == ':'
|
||||
|| cp == '['
|
||||
|| cp == ']'
|
||||
|| cp == '('
|
||||
|| cp == ')'
|
||||
|| cp == '{'
|
||||
|| cp == '}')
|
||||
return new String(new char[] { cp });
|
||||
}
|
||||
if (cp < '\u0010')
|
||||
return "\\u000" + Integer.toHexString((int)cp);
|
||||
else if (cp < '\u0100')
|
||||
|
@ -304,7 +337,19 @@ public class UnicodeUtils implements UnicodeConstants {
|
|||
public static String unicodeStringToString(String s) {
|
||||
StringBuffer sb = new StringBuffer(s.length() * 6);
|
||||
for (int i = 0; i < s.length(); i++) {
|
||||
sb.append(unicodeCodepointToString(s.charAt(i)));
|
||||
sb.append(unicodeCodepointToString(s.charAt(i), false));
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the most succinct possible, human-readable, ASCII form
|
||||
* of the String s of Unicode codepoints, or "null" if s is null. */
|
||||
public static String unicodeStringToPrettyString(String s) {
|
||||
if (s == null) return "null";
|
||||
StringBuffer sb = new StringBuffer(s.length() * 6);
|
||||
for (int i = 0; i < s.length(); i++) {
|
||||
sb.append(unicodeCodepointToString(s.charAt(i), true));
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
|
|
|
@ -321,15 +321,15 @@ public class UnicodeUtilsTest extends TestCase implements UnicodeConstants {
|
|||
* Tests the {@link UnicodeUtils#unicodeCodepointToString(char, boolean)}
|
||||
* method. */
|
||||
public void testUnicodeCodepointToString() {
|
||||
assertTrue(UnicodeUtils.unicodeCodepointToString('\u0000').equals("\\u0000"));
|
||||
assertTrue(UnicodeUtils.unicodeCodepointToString('\u0001').equals("\\u0001"));
|
||||
assertTrue(UnicodeUtils.unicodeCodepointToString('\u000F').equals("\\u000f"));
|
||||
assertTrue(UnicodeUtils.unicodeCodepointToString('\u001F').equals("\\u001f"));
|
||||
assertTrue(UnicodeUtils.unicodeCodepointToString('\u00fF').equals("\\u00ff"));
|
||||
assertTrue(UnicodeUtils.unicodeCodepointToString('\u01fF').equals("\\u01ff"));
|
||||
assertTrue(UnicodeUtils.unicodeCodepointToString('\u0ffF').equals("\\u0fff"));
|
||||
assertTrue(UnicodeUtils.unicodeCodepointToString('\u1ffF').equals("\\u1fff"));
|
||||
assertTrue(UnicodeUtils.unicodeCodepointToString('\ufffF').equals("\\uffff"));
|
||||
assertTrue(UnicodeUtils.unicodeCodepointToString('\u0000', false).equals("\\u0000"));
|
||||
assertTrue(UnicodeUtils.unicodeCodepointToString('\u0001', false).equals("\\u0001"));
|
||||
assertTrue(UnicodeUtils.unicodeCodepointToString('\u000F', false).equals("\\u000f"));
|
||||
assertTrue(UnicodeUtils.unicodeCodepointToString('\u001F', false).equals("\\u001f"));
|
||||
assertTrue(UnicodeUtils.unicodeCodepointToString('\u00fF', false).equals("\\u00ff"));
|
||||
assertTrue(UnicodeUtils.unicodeCodepointToString('\u01fF', false).equals("\\u01ff"));
|
||||
assertTrue(UnicodeUtils.unicodeCodepointToString('\u0ffF', false).equals("\\u0fff"));
|
||||
assertTrue(UnicodeUtils.unicodeCodepointToString('\u1ffF', false).equals("\\u1fff"));
|
||||
assertTrue(UnicodeUtils.unicodeCodepointToString('\ufffF', false).equals("\\uffff"));
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue