7ea185fa01
UnicodeCodepointToThdlWylie.java. Added a new class, UnicodeGraphemeCluster, that can tell you the components of a grapheme cluster from top to bottom. It does not yet have good error checking; it is not yet finished. Next is to parse clean Unicode into GraphemeClusters. After that comes scanning dirty Unicode into best-guess GraphemeClusters, and scanning dirty Unicode to get nice error messages.
377 lines
15 KiB
Java
377 lines
15 KiB
Java
/*
|
|
The contents of this file are subject to the THDL Open Community License
|
|
Version 1.0 (the "License"); you may not use this file except in compliance
|
|
with the License. You may obtain a copy of the License on the THDL web site
|
|
(http://www.thdl.org/).
|
|
|
|
Software distributed under the License is distributed on an "AS IS" basis,
|
|
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
|
|
License for the specific terms governing rights and limitations under the
|
|
License.
|
|
|
|
The Initial Developer of this software is the Tibetan and Himalayan Digital
|
|
Library (THDL). Portions created by the THDL are Copyright 2001 THDL.
|
|
All Rights Reserved.
|
|
|
|
Contributor(s): ______________________________________.
|
|
*/
|
|
|
|
package org.thdl.tib.text.tshegbar;
|
|
|
|
import java.util.Vector;
|
|
|
|
import org.thdl.util.ThdlDebug;
|
|
|
|
/** A UnicodeGraphemeCluster is either a non-Tibetan codepoint (such
|
|
* as whitespace or control characters or a Latin "character"), or a
|
|
* vertically stacked set of Tibetan consonants, vowels, marks, and
|
|
* signs. The Unicode string
|
|
* <code>"\u0F40\u0F0B\u0F41\u0F0B"</code> specifies
|
|
* four UnicodeGraphemeClusters (the name of the Tibetan alphabet,
|
|
* you might notice), while the Unicode string
|
|
* <code>"\u0F66\u0FA5\u0F39\u0F90\u0FB5\u0F71\u0F80\u0F7F"</code>
|
|
* is one Tibetan stack, sa over fa over ka over Sha with an a-chung,
|
|
* a reversed gi-gu, and a visarga, plus a ngas-bzung-sgor-rtags mark
|
|
* underneath all of that. I assume the latter grapheme cluster is
|
|
* nonsense, but it is considered one grapheme cluster because all
|
|
* but the first char are combining chars. See Unicode Technical
|
|
* Report 29.
|
|
*
|
|
* <p>As the above example demonstrates, not all
|
|
* UnicodeGraphemeClusters are syntactically legal in the Tibetan
|
|
* language. Not all of them are syntactically legal in Sanskrit
|
|
* transcribed in the Tibetan alphabet, either.</p>
|
|
*
|
|
* <p>The Unicode 3.2 standard (see especially Technical Report 29)
|
|
* refers to "grapheme clusters." A UnicodeGraphemeCluster is
|
|
* precisely a grapheme cluster as described by that standard. We
|
|
* interpret the standard as saying that <code>U+0F3E</code> and
|
|
* <code>U+0F3F</code> are each grapheme clusters unto themselves,
|
|
* even though they are combining codepoints.</p>
|
|
*
|
|
* @author David Chandler */
|
|
public class UnicodeGraphemeCluster
|
|
implements UnicodeReadyThunk, UnicodeConstants
|
|
{
|
|
/** @see #getCPHeight(char) */
|
|
private static final int MIN_HEIGHT = -6;
|
|
/** @see #getCPHeight(char) */
|
|
private static final int MAX_HEIGHT = 3;
|
|
|
|
/** The Unicode codepoints that compose this grapheme cluster.
|
|
This is legal, i.e. if there is a Tibetan vowel, it is the
|
|
last codepoint. It is in Normalization Form THDL (NFTHDL). */
|
|
private String unicodeString;
|
|
|
|
/** Do not use this constructor. */
|
|
private UnicodeGraphemeCluster() { super(); }
|
|
|
|
/** Creates a new GraphemeCluster given a legal sequence of
|
|
Unicode codepoints corresponding to a single grapheme
|
|
cluster.
|
|
@exception IllegalArgumentException if unicodeString is not a
|
|
syntactically correct Unicode 3.2 sequence (if it begins with
|
|
a combining codepoint or has a Tibetan vowel before another
|
|
combining character, for example, or if it is more than one
|
|
grapheme cluster. Note that syntactical correctness for
|
|
non-Tibetan codepoints is not likely to be known by this
|
|
routine. */
|
|
public UnicodeGraphemeCluster(String unicodeString)
|
|
throws IllegalArgumentException
|
|
{
|
|
// check legality:
|
|
// DLC NOW FIXME
|
|
|
|
// convert to NFTHDL:
|
|
this.unicodeString
|
|
= UnicodeUtils.toMostlyDecomposedUnicode(unicodeString, NORM_NFTHDL);
|
|
}
|
|
|
|
/** Returns a string of codepoints in NFTHDL form. */
|
|
public String getUnicodeRepresentation() {
|
|
return unicodeString;
|
|
}
|
|
|
|
/** Returns true. */
|
|
public boolean hasUnicodeRepresentation() {
|
|
return true;
|
|
}
|
|
|
|
/** Returns true iff this stack could occur in syntactically
|
|
* correct, run-of-the-mill Tibetan (as opposed to Tibetanized
|
|
* Sanksrit, Chinese, et cetera). sga is a legal Tibetan stack,
|
|
* but g+g is not, for example. */
|
|
public boolean isLegalTibetan() {
|
|
// DLC FIXME: for those odd head marks etc., return true even
|
|
// though hasUnicodeRepresentation() will return false.
|
|
|
|
// Note that ra-btags and wa-zur both be present in legal
|
|
// Tibetan.
|
|
|
|
throw new Error("DLC FIXME: not yet implemented.");
|
|
}
|
|
|
|
/** Returns a <unicodeGraphemeCluster> element that contains the
|
|
* THDL Extended Wylie transliteration for this cluster. */
|
|
public String toConciseXML() {
|
|
throw new Error("DLC NOW unimplemented");
|
|
}
|
|
|
|
/** Returns a <unicodeGraphemeCluster> element that contains this
|
|
* cluster broken down into its constituent decomposed
|
|
* codepoints. */
|
|
public String toVerboseXML() {
|
|
throw new Error("DLC NOW unimplemented");
|
|
}
|
|
|
|
/** Returns the THDL Extended Wylie transliteration of this
|
|
grapheme cluster, or null if there is none (which happens for
|
|
a few Tibetan codepoints, if you'll recall). If needsVowel is
|
|
true, then an "a" will be appended when there is no EW_achung
|
|
or explicit simple vowel. If there is an explicit vowel or
|
|
EW_achung, it will always be present. Note that needsVowel is
|
|
provided because btags is the preferred THDL Extended Wylie
|
|
for the four contiguous grapheme clusters
|
|
<code>"\u0F56\u0F4F\u0F42\u0F66"</code>, and
|
|
needsVowel must be set to false for all but the grapheme
|
|
cluster corresponding to <code>\u0F4F</code> if you wish
|
|
to get the preferred THDL Extended Wylie. */
|
|
public String getThdlWylie(boolean needsVowel) {
|
|
throw new Error("DLC NOW unimplemented");
|
|
}
|
|
|
|
/** Given some (possibly unnormalized) Unicode 3.2 string unicode,
|
|
appends grapheme clusters to the vector of GraphemeClusters
|
|
grcls if grcls is nonnulla. Performs good error checking if
|
|
validate is true. If an error is found, grcls may have been
|
|
modified if nonnull. Setting grcls to null and setting
|
|
validate to true is sometimes useful for testing the validity
|
|
of a Unicode string.
|
|
@return the number of grapheme clusters that were or would
|
|
have been added to grcls
|
|
@exception BadTibetanUnicodeException if the unicode is not
|
|
syntactically legal
|
|
@exception IllegalArgumentException if correctErrors and
|
|
validate are both true
|
|
@exception NullPointerException if unicode is null */
|
|
public static int breakUnicodeIntoGraphemeClusters(Vector grcls,
|
|
String unicode,
|
|
boolean validate,
|
|
boolean correctErrors)
|
|
throws // DLC SOON: BadTibetanUnicodeException,
|
|
IllegalArgumentException, NullPointerException
|
|
{
|
|
if (validate && correctErrors) {
|
|
throw new IllegalArgumentException("validate and correctErrors cannot both be true.");
|
|
}
|
|
throw new Error("DLC NOW unimplemented");
|
|
/*
|
|
if (start == i) {
|
|
// special tests at the beginning of input.
|
|
if (0 != height || UnicodeUtils.combinesLeftToRight(cp)) {
|
|
throw new BadTibetanUnicodeException("A combining codepoint was found at the start of input or after a mark that ends a grapheme cluster.");
|
|
}
|
|
}
|
|
if (height == last_height) {
|
|
if ('\u0F39' == cp) {
|
|
if (!UnicodeUtils.isTibetanConsonant(last_cp)) {
|
|
throw new BadTibetanUnicodeException("U+0F39 can only occur after a (possibly subjoined) Tibetan consonant");
|
|
}
|
|
} else {
|
|
// DLC: cp BEGINS A NEW GRAPHEME CLUSTER!!!
|
|
}
|
|
}
|
|
|
|
// Test to see if this last character has ended this
|
|
// grapheme cluster:
|
|
if (UnicodeUtils.isTibetanTerminatingVowel(cp)) {
|
|
// DLC: cp ENDS A GRAPHEME CLUSTER!!!
|
|
}
|
|
*/
|
|
}
|
|
|
|
/** FIXMEDOC */
|
|
public String getTopToBottomCodepoints() {
|
|
return getTopToBottomCodepoints(new StringBuffer(unicodeString),
|
|
0, unicodeString.length()).toString();
|
|
}
|
|
|
|
/** Returns a new StringBuffer consisting of the codepoints in
|
|
NFTHDLString at indices [start, end) sorted in top-to-bottom
|
|
order, or null on some occasions when NFTHDLString is already
|
|
sorted. A top-to-bottom ordering is a useful form for
|
|
applications wishing to render the grapheme cluster. Note
|
|
that this method is only useful if NFTHDLString is part of or
|
|
an entire grapheme cluster. Does no error checking on
|
|
NFTHDLString.
|
|
@param NFTHDLString a buffer with characters at indices i,
|
|
where start <= i < end, being the Unicode codepoints for a
|
|
single grapheme cluster or part of a grapheme cluster
|
|
@param start NFTHDLString.charAt(start) is the first codepoint
|
|
dealt with
|
|
@param end NFTHDLString.charAt(end) is the first codepoint NOT
|
|
dealt with
|
|
@return null only if (but not necessarily if) NFTHDLString is
|
|
already sorted top-to-bottom, or the sorted form of
|
|
NFTHDLString */
|
|
private static StringBuffer getTopToBottomCodepoints(StringBuffer NFTHDLString, /* DLC FIXME: for efficiency, use a ThdlCharIterator. */
|
|
int start, int end)
|
|
{
|
|
if (end <= start) /* 0-length string. */
|
|
return null;
|
|
if (start + 1 == end) /* 1-length string. */
|
|
return null;
|
|
// else we have a string of length >= 2.
|
|
|
|
// We'll use the world's fastest sorting algorithm. Linear
|
|
// time, baby. Here are the ten or so mailboxes for our
|
|
// postman's sort:
|
|
StringBuffer chunksAtCommonHeights[]
|
|
= new StringBuffer[(MAX_HEIGHT + 1) - MIN_HEIGHT];
|
|
|
|
for (int i = start; i < end; i++) {
|
|
char cp = NFTHDLString.charAt(i);
|
|
int height = getCPHeight(cp);
|
|
|
|
// initialize mailbox if necessary.
|
|
if (null == chunksAtCommonHeights[height - MIN_HEIGHT]) {
|
|
chunksAtCommonHeights[height - MIN_HEIGHT]
|
|
= new StringBuffer(2);
|
|
}
|
|
|
|
// put this cp into the correct mailbox.
|
|
chunksAtCommonHeights[height - MIN_HEIGHT].append(cp);
|
|
}
|
|
|
|
// Now concatenate together the mailboxes:
|
|
StringBuffer sb = new StringBuffer(end - start);
|
|
for (int h = MAX_HEIGHT; h >= MIN_HEIGHT; h--) {
|
|
if (null != chunksAtCommonHeights[h - MIN_HEIGHT]) {
|
|
sb.append(chunksAtCommonHeights[h - MIN_HEIGHT]);
|
|
}
|
|
}
|
|
return sb;
|
|
}
|
|
|
|
|
|
/** Returns the <i>height</i> for the Tibetan Unicode codepoint x.
|
|
This relative height is 0 for a base consonant, digit,
|
|
punctuation, mark, or sign. It is -1 for a subjoined
|
|
consonant, -2 for EWSUB_wa_zur, -3 for EW_achung, +1 for
|
|
EWV_gigu, and so on according to the height these codepoints
|
|
appear relative to one another when on the same stack. If two
|
|
codepoints have equal height, they should not exist in the
|
|
same grapheme cluster unless one is <code>U+0F39</code>, which
|
|
is an integral part of a consonant when tacked on to, e.g.,
|
|
EWC_PHA.
|
|
|
|
<p>If x is not a Unicode 3.2 codepoint in the Tibetan range,
|
|
or if x is not in NFTHDL form, 0 is returned. The height code
|
|
of <code>U+0F76</code> is not valid, and it is not an accident
|
|
that <code>U+0F76</code> is not in NFTHDL form.</p> */
|
|
private static int getCPHeight(char x) {
|
|
// DLC make this an assertion:
|
|
ThdlDebug.verify(null == UnicodeUtils.toNormalizedForm(x, NORM_NFTHDL));
|
|
|
|
if (x >= '\u0F90' && x <= '\u0FAC'
|
|
|| x >= '\u0FAE' && x <= '\u0FBC') {
|
|
// subjoined consonant. Note that wa-zur is an exception.
|
|
return -1;
|
|
} else if (x >= '\u0F00' && x <= '\u0F17'
|
|
|| x >= '\u0F1A' && x <= '\u0F34'
|
|
|| x >= '\u0F3A' && x <= '\u0F3D'
|
|
|| x >= '\u0F40' && x <= '\u0F6A' // consonants
|
|
|| x >= '\u0F88' && x <= '\u0F8B'
|
|
|| x >= '\u0FBE' && x <= '\u0FCF') {
|
|
// neutral height:
|
|
return 0;
|
|
} else { // Oddballs.
|
|
switch (x) {
|
|
//
|
|
// non-combining:
|
|
//
|
|
case '\u0F36':
|
|
case '\u0F38':
|
|
case '\u0F85':
|
|
return 0;
|
|
|
|
|
|
//
|
|
// combining, but left-to-right combining:
|
|
//
|
|
case '\u0F3E':
|
|
case '\u0F3F':
|
|
case '\u0F7F':
|
|
return 0;
|
|
|
|
|
|
//
|
|
// combining by coming below:
|
|
//
|
|
case '\u0FAD':
|
|
return -2; // wa-zur
|
|
case '\u0F71':
|
|
return -3; // a-chung
|
|
case '\u0F74':
|
|
case '\u0F84':
|
|
return -4; // DLC CHECKME
|
|
case '\u0F18': // combines with digits
|
|
case '\u0F19': // combines with digits
|
|
return -5;
|
|
case '\u0F35':
|
|
case '\u0F37':
|
|
case '\u0FC6': {
|
|
ThdlDebug.verify(-6 == MIN_HEIGHT);
|
|
return -6; // min height
|
|
}
|
|
|
|
|
|
//
|
|
// combining by coming above:
|
|
//
|
|
case '\u0F72':
|
|
case '\u0F7A':
|
|
case '\u0F7B':
|
|
case '\u0F7C':
|
|
case '\u0F7D':
|
|
case '\u0F80':
|
|
return 1;
|
|
case '\u0F7E':
|
|
case '\u0F82':
|
|
case '\u0F83':
|
|
return 2; // these three come above 0F7C, right? (DLC CHECKME)
|
|
case '\u0F86':
|
|
case '\u0F87': {
|
|
ThdlDebug.verify(3 == MAX_HEIGHT);
|
|
return 3; // max height
|
|
}
|
|
|
|
|
|
//
|
|
// exceptional case:
|
|
//
|
|
// some would say +1, but then "\u0F40\u0FA5\u0F39" will
|
|
// not have a5 combine with 39. Unicode could well have
|
|
// put in a single codepoint for "\u0FA5\u0F39" IMO.
|
|
case '\u0F39': return 0;
|
|
|
|
|
|
default: {
|
|
if (x >= '\u0F00' && x <= '\u0FFF') {
|
|
// This wasn't explicitly handled? Hmmm... This
|
|
// won't ever happen for NFTHDL-formed input.
|
|
ThdlDebug.noteIffyCode();
|
|
}
|
|
|
|
// This codepoint is not in the Tibetan range.
|
|
return 0;
|
|
}
|
|
} // end switch
|
|
}
|
|
}
|
|
/** DLC SOON */
|
|
public boolean isTibetan() {
|
|
throw new Error("DLC FIXME: not yet implemented.");
|
|
}
|
|
}
|
|
|