A reverter that converts Unicode to computer-friendly (but not yet
human-friendly) EWTS is here in alpha mode.  It probably doesn't deal
well with non-Tibetan text.
This commit is contained in:
dchandler 2005-08-01 05:54:20 +00:00
parent 00afd75362
commit 5788416629
13 changed files with 496 additions and 47 deletions

View file

@ -102,7 +102,10 @@ public class UnicodeUtils implements UnicodeConstants {
nor NFKD breaks down <code>U+0F00</code> into its constituent
codepoints. NFTHDL uses a maximum of codepoints, and it never
uses codepoints whose use has been {@link #isDiscouraged(char)
discouraged}.
discouraged}. NFTHDL also does not screw things up by using
the standard-but-wrong CCCVs. It sorts stretches of combining
characters wisely as per
<a href="http://iris.lib.virginia.edu/tibet/xml/showEssay.php?xml=/tools/encodingTib.xml">Tibetan Encoding Model</a>.
<p>The Tibetan passages of the returned string are in the
chosen normalized form, but codepoints outside of the {@link
@ -136,6 +139,9 @@ public class UnicodeUtils implements UnicodeConstants {
tibetanUnicode.insert(offset, s);
}
}
if (normForm == NORM_NFTHDL) {
fixSomeOrderingErrorsInTibetanUnicode(tibetanUnicode);
}
}
/** Like {@link #toMostlyDecomposedUnicode(StringBuffer, byte)},
@ -418,7 +424,39 @@ public class UnicodeUtils implements UnicodeConstants {
* product.)
*/
private static char unicode_pairs[][]
= { { '\u0f71', '\u0f74' },
= {
/* TODO(dchandler): use regex
* "[\u0f39\u0f71-\u0f84\u0f86\u0f87]{2,}" to find patches
* that need sorting and then sort each of those. This
* cross product is ugly. */
{ '\u0f39', '\u0f71' },
{ '\u0f39', '\u0f72' },
{ '\u0f39', '\u0f74' },
{ '\u0f39', '\u0f7a' },
{ '\u0f39', '\u0f7b' },
{ '\u0f39', '\u0f7c' },
{ '\u0f39', '\u0f7d' },
{ '\u0f39', '\u0f7e' },
{ '\u0f39', '\u0f7f' },
{ '\u0f39', '\u0f80' },
{ '\u0f39', '\u0f82' },
{ '\u0f39', '\u0f83' },
{ '\u0f71', '\u0f7f' },
{ '\u0f72', '\u0f7f' },
{ '\u0f74', '\u0f7f' },
{ '\u0f7a', '\u0f7f' },
{ '\u0f7b', '\u0f7f' },
{ '\u0f7c', '\u0f7f' },
{ '\u0f7d', '\u0f7f' },
// but not { '\u0f7e', '\u0f7f' },
{ '\u0f39', '\u0f7f' },
{ '\u0f80', '\u0f7f' },
{ '\u0f82', '\u0f7f' },
{ '\u0f83', '\u0f7f' },
{ '\u0f71', '\u0f74' },
{ '\u0f71', '\u0f72' },
{ '\u0f71', '\u0f7a' },
@ -489,7 +527,9 @@ public class UnicodeUtils implements UnicodeConstants {
* the same file modulo Unicode booboos would be better. </p>
*
* @param sb the buffer to be mutated
* @return true if sb was mutated */
* @return true if sb was mutated
* @see <a href="http://iris.lib.virginia.edu/tibet/xml/showEssay.php?xml=/tools/encodingTib.xml">Tibetan Encoding Model</a>
*/
public static boolean fixSomeOrderingErrorsInTibetanUnicode(StringBuffer sb) {
boolean mutated = false;
int len = sb.length();
@ -512,25 +552,5 @@ public class UnicodeUtils implements UnicodeConstants {
} while (mutated_this_time_through);
return mutated;
}
/** Tells whether ch is a valid Tibetan codepoint in Unicode 4.0.
 *  A codepoint qualifies when it lies in U+0F00..U+0FCF and is not
 *  one of the holes U+0F48, U+0F6B..U+0F70, U+0F8C..U+0F8F, U+0F98,
 *  U+0FBD, U+0FCD, or U+0FCE.
 *  @param ch the character to classify
 *  @return true iff ch is one of the accepted codepoints */
public boolean isTibetanUnicodeCodepoint(char ch) {
    // NOTE: a table of 256 booleans would be faster; this stays simple.
    if (ch < '\u0f00' || ch > '\u0fcf') {
        return false; // outside the span this check considers at all
    }
    if (ch >= '\u0f6b' && ch <= '\u0f70') {
        return false; // gap between U+0F6A and U+0F71
    }
    if (ch >= '\u0f8c' && ch <= '\u0f8f') {
        return false; // gap between U+0F8B and U+0F90
    }
    switch (ch) {
        case '\u0f48':
        case '\u0f98':
        case '\u0fbd':
        case '\u0fcd':
        case '\u0fce':
            return false; // isolated excluded codepoints
        default:
            return true;
    }
}
/** Tells whether ch is in 0F00-0FFF (as decided by
 *  isInTibetanRange) yet is not a valid Tibetan codepoint in
 *  Unicode 4.0 (as decided by isTibetanUnicodeCodepoint).
 *  @param ch the character to classify
 *  @return true iff ch is in range but not a valid codepoint */
public boolean isInvalidTibetanUnicode(char ch) {
    if (!isInTibetanRange(ch)) {
        return false; // out of range: the validity check is not consulted
    }
    return !isTibetanUnicodeCodepoint(ch);
}
}