A reverter that converts Unicode to computer-friendly (but not yet
human-friendly) EWTS is here in alpha mode.  It probably doesn't deal
well with non-Tibetan text.
This commit is contained in:
dchandler 2005-08-01 05:54:20 +00:00
parent 00afd75362
commit 5788416629
13 changed files with 496 additions and 47 deletions

View file

@ -463,6 +463,7 @@ public final class LegalTshegBar
* concatenation like 'u'i'o. Returns false otherwise (including
* the case that suffix is the empty string). */
public static boolean isAchungBasedSuffix(String suffix) {
// TODO(dchandler): use java.util.regex
int i = 0; // so that the empty string causes false to be returned.
while (i == 0 || !suffix.equals("")) {
boolean startsWithOneOfThem = false;

View file

@ -67,11 +67,16 @@ public class UnicodeCodepointToThdlWylie {
// fail.
switch (x) {
case '\t': return "\t";
case '\n': return "\n";
case '\r': return "\r";
case ' ': return "_";
case '\u00a0': return "_";
case '\u0F00': return "oM";
case '\u0F01': return "\\u0F01";
case '\u0F02': return null; // DLC
case '\u0F03': return null; // DLC
case '\u0F02': return "\\u0F02";
case '\u0F03': return "\\u0F03";
case '\u0F04': return "@";
case '\u0F05': return "#";
case '\u0F06': return "$";
@ -314,8 +319,6 @@ public class UnicodeCodepointToThdlWylie {
case '\u0FCF': return "\\u0FCF"; // DLC i added this to the 'EWTS document misspeaks' bug report... null I think...
default: {
// DLC handle space (EW's "_")
// This codepoint is in the range 0FD0-0FFF or is not in
// the Tibetan range at all. In either case, there is no
// corresponding THDL Extended Wylie.

View file

@ -102,7 +102,10 @@ public class UnicodeUtils implements UnicodeConstants {
nor NFKD breaks down <code>U+0F00</code> into its constituent
codepoints. NFTHDL uses a maximum of codepoints, and it never
uses codepoints whose use has been {@link #isDiscouraged(char)
discouraged}.
discouraged}. NFTHDL also does not screw things up by using
the standard-but-wrong CCCVs. It sorts stretches of combining
characters wisely as per
{@link http://iris.lib.virginia.edu/tibet/xml/showEssay.php?xml=/tools/encodingTib.xml}.
<p>The Tibetan passages of the returned string are in the
chosen normalized form, but codepoints outside of the {@link
@ -136,6 +139,9 @@ public class UnicodeUtils implements UnicodeConstants {
tibetanUnicode.insert(offset, s);
}
}
if (normForm == NORM_NFTHDL) {
fixSomeOrderingErrorsInTibetanUnicode(tibetanUnicode);
}
}
/** Like {@link #toMostlyDecomposedUnicode(StringBuffer, byte)},
@ -418,7 +424,39 @@ public class UnicodeUtils implements UnicodeConstants {
* product.)
*/
private static char unicode_pairs[][]
= { { '\u0f71', '\u0f74' },
= {
/* TODO(dchandler): use regex
* "[\u0f39\u0f71-\u0f84\u0f86\u0f87]{2,}" to find patches
* that need sorting and then sort each of those. This
* cross product is ugly. */
{ '\u0f39', '\u0f71' },
{ '\u0f39', '\u0f72' },
{ '\u0f39', '\u0f74' },
{ '\u0f39', '\u0f7a' },
{ '\u0f39', '\u0f7b' },
{ '\u0f39', '\u0f7c' },
{ '\u0f39', '\u0f7d' },
{ '\u0f39', '\u0f7e' },
{ '\u0f39', '\u0f7f' },
{ '\u0f39', '\u0f80' },
{ '\u0f39', '\u0f82' },
{ '\u0f39', '\u0f83' },
{ '\u0f71', '\u0f7f' },
{ '\u0f72', '\u0f7f' },
{ '\u0f74', '\u0f7f' },
{ '\u0f7a', '\u0f7f' },
{ '\u0f7b', '\u0f7f' },
{ '\u0f7c', '\u0f7f' },
{ '\u0f7d', '\u0f7f' },
// but not { '\u0f7e', '\u0f7f' },
{ '\u0f39', '\u0f7f' },
{ '\u0f80', '\u0f7f' },
{ '\u0f82', '\u0f7f' },
{ '\u0f83', '\u0f7f' },
{ '\u0f71', '\u0f74' },
{ '\u0f71', '\u0f72' },
{ '\u0f71', '\u0f7a' },
@ -489,7 +527,9 @@ public class UnicodeUtils implements UnicodeConstants {
* the same file modulo Unicode booboos would be better. </p>
*
* @param sb the buffer to be mutated
* @return true if sb was mutated */
* @return true if sb was mutated
* @see <a href="http://iris.lib.virginia.edu/tibet/xml/showEssay.php?xml=/tools/encodingTib.xml">Tibetan Encoding Model</a>
*/
public static boolean fixSomeOrderingErrorsInTibetanUnicode(StringBuffer sb) {
boolean mutated = false;
int len = sb.length();
@ -512,25 +552,5 @@ public class UnicodeUtils implements UnicodeConstants {
} while (mutated_this_time_through);
return mutated;
}
/**
 * Tells whether the given character is a valid Tibetan codepoint as of
 * Unicode 4.0.
 *
 * @param ch the character to classify
 * @return true iff ch lies in U+0F00..U+0FCF and is not one of the
 *         codepoints excluded below
 */
public boolean isTibetanUnicodeCodepoint(char ch) {
    // NOTE: a 256-entry boolean lookup table would be faster, but the
    // explicit checks below are kept for simplicity.

    // Outside U+0F00..U+0FCF: never valid.
    if (ch < '\u0f00' || ch > '\u0fcf') {
        return false;
    }
    // Excluded run U+0F6B..U+0F70.
    if (ch >= '\u0f6b' && ch <= '\u0f70') {
        return false;
    }
    // Excluded run U+0F8C..U+0F8F.
    if (ch >= '\u0f8c' && ch <= '\u0f8f') {
        return false;
    }
    // Isolated excluded codepoints.
    switch (ch) {
        case '\u0f48':
        case '\u0f98':
        case '\u0fbd':
        case '\u0fcd':
        case '\u0fce':
            return false;
        default:
            return true;
    }
}
/**
 * Tells whether the given character falls inside the Tibetan range
 * 0F00-0FFF without being a valid Tibetan codepoint in Unicode 4.0.
 *
 * @param ch the character to classify
 * @return true iff ch is in the Tibetan range but is rejected by
 *         {@link #isTibetanUnicodeCodepoint(char)}
 */
public boolean isInvalidTibetanUnicode(char ch) {
    // Characters outside the Tibetan range are never "invalid Tibetan".
    if (!isInTibetanRange(ch)) {
        return false;
    }
    return !isTibetanUnicodeCodepoint(ch);
}
}