Now uses terminology from the Unicode standard. No more talk of
characters, for example. Normalization forms NFKD and NFD are supported for the Tibetan Unicode range. I don't like either, actually. I've tested NFKD, but I've not yet committed the tests.
This commit is contained in:
parent
3199ff7926
commit
a42347b224
7 changed files with 210 additions and 136 deletions
|
@ -103,7 +103,7 @@ And also there are cases where they combine. For ex you can have
|
|||
* consonants and vowels. In some situations, you should use {@link
|
||||
* #EWSUB_wa_zur} to represent the consonant wa, while in others you
|
||||
* should use {@link #EWC_wa}, even though you mean to subscribe a
|
||||
* fixed-form wa. Basically, stick to the characters for which
|
||||
* fixed-form wa. Basically, stick to the codepoints for which
|
||||
* enumerations exist in {@link
|
||||
* org.thdl.tib.text.tshegbar.UnicodeConstants} and use your common
|
||||
* sense.</p>
|
||||
|
@ -131,7 +131,7 @@ public class LegalTshegBar
|
|||
private boolean hasWaZur;
|
||||
/** true iff EW_wa_zur is under the root syllable. */
|
||||
private boolean hasAChung;
|
||||
/** If this is a string, it is of a single character or is equal
|
||||
/** If this is a string, it is of a single codepoint or is equal
|
||||
* to {@link #getConnectiveCaseSuffix()} */
|
||||
private String suffix;
|
||||
/** EW_da, EW_sa, or EW_ABSENT */
|
||||
|
@ -237,7 +237,7 @@ public class LegalTshegBar
|
|||
|
||||
/** Returns null if there is no suffix, or a string containing the
|
||||
* one consonant or a string <code>"\u0F60\u0F72"</code>
|
||||
* containing two characters in the special case that the suffix
|
||||
* containing two codepoints in the special case that the suffix
|
||||
* is that connective case marker {@link
|
||||
* #getConnectiveCaseSuffix()}. */
|
||||
public String getSuffix() {
|
||||
|
@ -317,7 +317,7 @@ public class LegalTshegBar
|
|||
}
|
||||
|
||||
|
||||
/** Returns a string of two characters, da and sa. */
|
||||
/** Returns a string of two codepoints, da and sa. */
|
||||
public static String getPossiblePostsuffixes() {
|
||||
return new String(new char[] { EWC_da, EWC_sa });
|
||||
}
|
||||
|
@ -328,7 +328,7 @@ public class LegalTshegBar
|
|||
EWC_ra, EWC_la, EWC_sa
|
||||
});
|
||||
|
||||
/** Returns a string of ten characters, each of which can be a
|
||||
/** Returns a string of ten codepoints, each of which can be a
|
||||
* suffix in Tibetan. */
|
||||
public static String getPossibleSuffixes() {
|
||||
return possibleSuffixes;
|
||||
|
@ -345,7 +345,7 @@ public class LegalTshegBar
|
|||
EWC_achen, EWV_i
|
||||
});
|
||||
|
||||
/** Returns a two-character string consisting of the Unicode
|
||||
/** Returns a two-codepoint string consisting of the Unicode
|
||||
* representation of what Extended Wylie calls
|
||||
* <code>'i</code>. */
|
||||
public static String getConnectiveCaseSuffix() {
|
||||
|
@ -594,9 +594,9 @@ public class LegalTshegBar
|
|||
* @param rootLetter the mandatory root consonant
|
||||
* @param subjoinedLetter the optional, subscribed consonant
|
||||
* @param suffix the optional suffix, which is null, a String
|
||||
* consisting of a single consonant (i.e. a single character)
|
||||
* except in the special case that this is {@link
|
||||
* #getConnectiveCaseSuffix()}
|
||||
* consisting of a single consonant (i.e. a single,
|
||||
* nondecomposable codepoint) except in the special case that
|
||||
* this is {@link #getConnectiveCaseSuffix()}
|
||||
* @param postsuffix the optional postsuffix, which should be
|
||||
* EWC_sa or EWC_da
|
||||
* @param vowel the optional vowel */
|
||||
|
@ -748,7 +748,7 @@ public class LegalTshegBar
|
|||
if (!isNominalRepresentationOfSimpleSuffix(suffix.charAt(0))) {
|
||||
return internalThrowThing(throwIfIllegal,
|
||||
"Illegal suffix -- not one of the ten legal suffixes: "
|
||||
+ UnicodeUtils.unicodeCharToString(suffix.charAt(0)));
|
||||
+ UnicodeUtils.unicodeCPToString(suffix.charAt(0)));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -971,10 +971,11 @@ public class LegalTshegBar
|
|||
|
||||
|
||||
/** Overrides {@link org.thdl.tib.text.tshegbar.UnicodeReadyThunk}
|
||||
method to return {@link UnicodeUtils#toCanonicalForm(String)
|
||||
canonically-formed Unicode}.
|
||||
method to return {@link
|
||||
UnicodeUtils#toMostlyDecomposedUnicode(String, byte)
|
||||
NFKD-normalized Unicode}.
|
||||
@exception UnsupportedOperationException is never thrown */
|
||||
public String getEquivalentUnicode() {
|
||||
public String getUnicodeRepresentation() {
|
||||
StringBuffer sb = new StringBuffer();
|
||||
if (hasPrefix()) {
|
||||
ThdlDebug.verify(UnicodeUtils.isNonSubjoinedConsonant(getPrefix()));
|
||||
|
@ -1017,7 +1018,7 @@ public class LegalTshegBar
|
|||
|
||||
/** Overrides {@link org.thdl.tib.text.tshegbar.UnicodeReadyThunk}
|
||||
method to return true. */
|
||||
public boolean hasEquivalentUnicode() {
|
||||
public boolean hasUnicodeRepresentation() {
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
|
@ -23,26 +23,37 @@ package org.thdl.tib.text.tshegbar;
|
|||
*
|
||||
* <p> First, some terminology.</p>
|
||||
*
|
||||
* <ul> <li>When we talk about a <i>glyph</i>, we mean a picture
|
||||
* found in a font. A single glyph may have one or more
|
||||
* representations by sequences of Unicode characters, or it may not
|
||||
* be representable because it is only part of one Unicode character
|
||||
* or pictures a nonstandard character.</li> <li>When we talk about a
|
||||
* <i>stack</i>, we mean either a number (or half-number), a mark or
|
||||
* sign, a bit of punctuation, or a consonant stack.</li> <li>A
|
||||
* <i>consonant stack</i> is one or more consonants stacked
|
||||
* vertically, plus an optional vocalic modification such as an
|
||||
* anusvara (DLC what do we call a bindu?) or visarga, plus zero or
|
||||
* more signs like <code>\u0F35</code>, plus an optional a-chung
|
||||
* (<code>\u0F71</code>), plus an optional simple vowel.</li> <li>By
|
||||
* <i>simple vowel</i>, we mean any of <code>\u0F72</code>,
|
||||
* <code>\u0F74</code>, <code>\u0F7A</code>, <code>\u0F7B</code>,
|
||||
* <ul> <li>When we talk about a <i>grapheme cluster</i> (or
|
||||
* <i>grcl</i>), we mean what the Unicode standard calls a "grapheme
|
||||
* cluster". Most glyphs (i.e., pictures) found in a font are
|
||||
* grapheme clusters, but the picture corresponding to the Unicode
|
||||
* codepoint <code>\u0F74</code> is not a grapheme cluster. In
|
||||
* addition, in English, many fonts have a single glyph (a
|
||||
* "ligature") for the combination of two grapheme clusters,
|
||||
* e.g. "fi". A single grapheme cluster may have one or more
|
||||
* representations by sequences of Unicode codepoints, or it may not
|
||||
* be representable because it is only part of one Unicode codepoint
|
||||
* or pictures a nonstandard character.</li> <li>We will attempt to
|
||||
* avoid using the word "character", as it sometimes refers to a
|
||||
* codepoint and sometimes refers to a glyph in a font and yet other
|
||||
* times refers to a grapheme cluster.</li> <li>We'll try to avoid
|
||||
* using the word "stack" because it sometimes refers to a sequence
|
||||
* of stacked Tibetan consonants and sometimes refers to an entire
|
||||
* grapheme cluster.</li> <li>A <i>Tibetan stack</i> is one or
|
||||
* more consonants stacked vertically, plus an optional vocalic
|
||||
* modification such as an anusvara (DLC what do we call a bindu?) or
|
||||
* visarga, plus zero or more signs like <code>\u0F35</code>,
|
||||
* plus an optional a-chung (<code>\u0F71</code>), plus an
|
||||
* optional simple vowel.</li> <li>By <i>simple vowel</i>, we mean
|
||||
* any of <code>\u0F72</code>, <code>\u0F74</code>,
|
||||
* <code>\u0F7A</code>, <code>\u0F7B</code>,
|
||||
* <code>\u0F7C</code>, <code>\u0F7D</code>, or
|
||||
* <code>\u0F80</code>.</li> </ul>
|
||||
*
|
||||
* (Note: The string <code>"\u0F68\u0F7E\u0F7C"</code> seems to equal
|
||||
* <code>"\u0F00"</code>, though the Unicode standard does not
|
||||
* indicate that it is so. This code treats it that way.)</p>
|
||||
* <p>(Note: The string <code>"\u0F68\u0F7E\u0F7C"</code>
|
||||
* seems to equal <code>"\u0F00"</code>, though the Unicode
|
||||
* standard does not indicate that it is so. This code treats it
|
||||
* that way.)</p>
|
||||
*
|
||||
* <p> This class allows for invalid tsheg bars, like those
|
||||
* containing more than one prefix, more than two suffixes, an
|
||||
|
@ -55,10 +66,10 @@ package org.thdl.tib.text.tshegbar;
|
|||
* and for invalid tsheg bars. Note that correctness is at the tsheg
|
||||
* bar level only; it may be grammatically incorrect to concatenate
|
||||
* two valid tsheg bars. Some subclasses can be represented in
|
||||
* Unicode, but others contain nonstandard glyphs and cannot be.</p>
|
||||
* Unicode, but others contain nonstandard glyphs/characters and
|
||||
* cannot be.</p>
|
||||
*
|
||||
* @author David Chandler
|
||||
*/
|
||||
* @author David Chandler */
|
||||
public abstract class TshegBar implements UnicodeReadyThunk {
|
||||
/** Returns true, as we consider a transliteration in the Tibetan
|
||||
* alphabet of a non-Tibetan language, say Chinese, as being
|
||||
|
|
|
@ -21,10 +21,10 @@ package org.thdl.tib.text.tshegbar;
|
|||
import org.thdl.tib.text.TibetanMachineWeb;
|
||||
|
||||
/** This noninstantiable class allows for converting from Unicode
|
||||
* characters (i.e., code points) to Extended Wylie. It cannot be
|
||||
* used for long stretches of text, though, as it is unaware of
|
||||
* context, which is essential to understanding a non-trivial string
|
||||
* of Tibetan Unicode.
|
||||
* codepoints to Extended Wylie. It cannot be used for long
|
||||
* stretches of text, though, as it is unaware of context, which is
|
||||
* essential to understanding a non-trivial string of Tibetan
|
||||
* Unicode.
|
||||
*
|
||||
* <p>See the document by Nathaniel Garson and David Germano entitled
|
||||
* <i>Extended Wylie Transliteration Scheme</i>. Note that there are
|
||||
|
@ -307,7 +307,7 @@ public class UnicodeCharToExtendedWylie {
|
|||
default: {
|
||||
// DLC handle space (EW's "_")
|
||||
|
||||
// This character is in the range 0FD0-0FFF or is not in
|
||||
// This codepoint is in the range 0FD0-0FFF or is not in
|
||||
// the Tibetan range at all. In either case, there is no
|
||||
// corresponding Extended Wylie.
|
||||
return null;
|
||||
|
|
|
@ -19,7 +19,7 @@ Contributor(s): ______________________________________.
|
|||
package org.thdl.tib.text.tshegbar;
|
||||
|
||||
/** Provides handy Extended Wylie-inspired names for Unicode
|
||||
* characters commonly used to represent Tibetan. The consonant that
|
||||
* codepoints commonly used to represent Tibetan. The consonant that
|
||||
* the Extended Wylie text "ka" refers to is named EWC_ka as in "The
|
||||
* Extended Wylie Consonant ka", the vowel represented in Extended
|
||||
* Wylie by "i" is EWV_i, and so on. There is at least one exception
|
||||
|
@ -30,10 +30,26 @@ package org.thdl.tib.text.tshegbar;
|
|||
* @author David Chandler */
|
||||
public interface UnicodeConstants {
|
||||
|
||||
/** for those times when you need a char to represent a non-existent character */
|
||||
/** Refers to unnormalized Unicode: */
|
||||
static final byte NORM_UNNORMALIZED = 0;
|
||||
/** Refers to Normalization Form C: */
|
||||
static final byte NORM_NFC = 1;
|
||||
/** Refers to Normalization Form KC: */
|
||||
static final byte NORM_NFKC = 2;
|
||||
/** Refers to Normalization Form D: */
|
||||
static final byte NORM_NFD = 3;
|
||||
/** Refers to Normalization Form KD: */
|
||||
static final byte NORM_NFKD = 4;
|
||||
|
||||
|
||||
/** for those times when you need a char to represent a
|
||||
non-existent codepoint */
|
||||
static final char EW_ABSENT = '\u0000';
|
||||
|
||||
|
||||
//
|
||||
// the thirty consonants, in alphabetical order:
|
||||
//
|
||||
|
||||
/** first letter of the alphabet: */
|
||||
static final char EWC_ka = '\u0F40';
|
||||
|
@ -70,11 +86,13 @@ public interface UnicodeConstants {
|
|||
static final char EWC_ha = '\u0F67';
|
||||
static final char EWC_a = '\u0F68';
|
||||
|
||||
|
||||
/** In the word for father, "pA lags", there is an a-chung (i.e.,
|
||||
<code>\u0F71</code>). This is the constant for that little
|
||||
guy. */
|
||||
static final char EW_achung = '\u0F71';
|
||||
|
||||
|
||||
/* Four of the five vowels, some say, or, others say, "the four
|
||||
vowels": */
|
||||
/** "gi gu", the 'i' sound in the English word keep: */
|
||||
|
|
|
@ -18,14 +18,14 @@ Contributor(s): ______________________________________.
|
|||
|
||||
package org.thdl.tib.text.tshegbar;
|
||||
|
||||
/** A UnicodeReadyThunk represents a string of characters. While
|
||||
* there are ways to turn a string of Unicode characters into a list
|
||||
/** A UnicodeReadyThunk represents a string of codepoints. While
|
||||
* there are ways to turn a string of Unicode codepoints into a list
|
||||
* of UnicodeReadyThunks (DLC reference it), you cannot
|
||||
* necessarily recover the exact sequence of Unicode characters from
|
||||
* a UnicodeReadyThunk. For characters that are not Tibetan
|
||||
* Unicode and are not one of a handful of other known characters,
|
||||
* necessarily recover the exact sequence of Unicode codepoints from
|
||||
* a UnicodeReadyThunk. For codepoints that are not Tibetan
|
||||
* Unicode and are not one of a handful of other known codepoints,
|
||||
* only the most primitive operations are available. Generally in
|
||||
* this case you can recover the exact string of Unicode characters,
|
||||
* this case you can recover the exact string of Unicode codepoints,
|
||||
* but don't bank on it.
|
||||
*
|
||||
* @author David Chandler
|
||||
|
@ -33,23 +33,25 @@ package org.thdl.tib.text.tshegbar;
|
|||
public interface UnicodeReadyThunk {
|
||||
|
||||
/** Returns true iff this thunk is entirely Tibetan (regardless of
|
||||
whether or not all characters come from the Tibetan range of
|
||||
Unicode 3, i.e. <code>0x0F00</code>-<code>0x0FFF</code>). */
|
||||
whether or not all codepoints come from the Tibetan range of
|
||||
Unicode 3, i.e. <code>U+0F00</code>-<code>U+0FFF</code>, and
|
||||
regardless of whether or not this thunk is syntactically legal
|
||||
Tibetan). */
|
||||
public boolean isTibetan();
|
||||
|
||||
/** Returns a sequence of Unicode characters that is equivalent to
|
||||
/** Returns a sequence of Unicode codepoints that is equivalent to
|
||||
* this thunk if possible. It is only possible if {@link
|
||||
* #hasEquivalentUnicode()} is true. Unicode has more than one
|
||||
* #hasUnicodeRepresentation()} is true. Unicode has more than one
|
||||
* way to refer to the same language element, so this is just one
|
||||
* method. When more than one Unicode sequence exists, and when
|
||||
* the thunk {@link #isTibetan() is Tibetan}, this method returns
|
||||
* sequences that the Unicode 3.2 standard does not discourage.
|
||||
* @exception UnsupportedOperationException if {@link
|
||||
* #hasEquivalentUnicode()} is false
|
||||
* @return a String of Unicode characters */
|
||||
public String getEquivalentUnicode() throws UnsupportedOperationException;
|
||||
* #hasUnicodeRepresentation()} is false
|
||||
* @return a String of Unicode codepoints */
|
||||
public String getUnicodeRepresentation() throws UnsupportedOperationException;
|
||||
|
||||
/** Returns true iff there exists a sequence of Unicode characters
|
||||
/** Returns true iff there exists a sequence of Unicode codepoints
|
||||
* that correctly represents this thunk. This will not be the
|
||||
* case if the thunk contains Tibetan characters for which the
|
||||
* Unicode standard does not provide. See the Extended Wylie
|
||||
|
@ -58,6 +60,6 @@ public interface UnicodeReadyThunk {
|
|||
* standard section 9.13. The presence of head marks or multiple
|
||||
* vowels in the thunk would cause this to return false, for
|
||||
* example. */
|
||||
public boolean hasEquivalentUnicode();
|
||||
public boolean hasUnicodeRepresentation();
|
||||
}
|
||||
|
||||
|
|
|
@ -19,15 +19,15 @@ Contributor(s): ______________________________________.
|
|||
package org.thdl.tib.text.tshegbar;
|
||||
|
||||
/** <p>This non-instantiable class contains utility routines for
|
||||
* dealing with Tibetan Unicode characters and strings of such
|
||||
* characters.</p>
|
||||
* dealing with Tibetan Unicode codepoints and strings of such
|
||||
* codepoints.</p>
|
||||
*
|
||||
* @author David Chandler */
|
||||
public class UnicodeUtils {
|
||||
public class UnicodeUtils implements UnicodeConstants {
|
||||
/** Do not use this, as this class is not instantiable. */
|
||||
private UnicodeUtils() { super(); }
|
||||
|
||||
/** Returns true iff x is a Unicode character that represents a
|
||||
/** Returns true iff x is a Unicode codepoint that represents a
|
||||
consonant or two-consonant stack that has a Unicode code
|
||||
point. Returns true only for the usual suspects (like
|
||||
<code>\u0F40</code>) and for Sanskrit consonants (like
|
||||
|
@ -40,7 +40,7 @@ public class UnicodeUtils {
|
|||
&& (x >= '\u0F40' && x <= '\u0F6A'));
|
||||
}
|
||||
|
||||
/** Returns true iff x is a Unicode character that represents a
|
||||
/** Returns true iff x is a Unicode codepoint that represents a
|
||||
subjoined consonant or subjoined two-consonant stack that has
|
||||
a Unicode code point. Returns true only for the usual
|
||||
suspects (like <code>\u0F90</code>) and for Sanskrit
|
||||
|
@ -61,7 +61,7 @@ public class UnicodeUtils {
|
|||
'\u0F6A'. The new consonants (for transcribing Chinese, I
|
||||
believe) "\u0F55\u0F39" (which EWTS calls "fa"),
|
||||
"\u0F56\u0F39" ("va"), and "\u0F5F\u0F39" ("Dza") are
|
||||
two-character sequences, but you should be aware of them
|
||||
two-codepoint sequences, but you should be aware of them
|
||||
also. */
|
||||
public static boolean isPreferredFormOfConsonant(char x) {
|
||||
return ((x != '\u0F48' /* reserved in Unicode 3.2, but not in use */)
|
||||
|
@ -73,16 +73,16 @@ public class UnicodeUtils {
|
|||
&& (x != '\u0F5C'));
|
||||
}
|
||||
|
||||
/** Returns true iff unicodeChar is a character from the Unicode
|
||||
/** Returns true iff unicodeCP is a codepoint from the Unicode
|
||||
range U+0F00-U+0FFF.
|
||||
@see #isEntirelyTibetanUnicode(String) */
|
||||
public static boolean isInTibetanRange(char unicodeChar) {
|
||||
return (unicodeChar >= '\u0F00' && unicodeChar <= '\u0FFF');
|
||||
public static boolean isInTibetanRange(char unicodeCP) {
|
||||
return (unicodeCP >= '\u0F00' && unicodeCP <= '\u0FFF');
|
||||
}
|
||||
|
||||
/** Returns true iff unicodeString consists only of characters
|
||||
/** Returns true iff unicodeString consists only of codepoints
|
||||
from the Unicode range U+0F00-U+0FFF. (Note that these
|
||||
characters are typically not enough to represent a Tibetan
|
||||
codepoints are typically not enough to represent a Tibetan
|
||||
text; you may need ZWSP (zero-width space) and various
|
||||
whitespace from other ranges.) */
|
||||
public static boolean isEntirelyTibetanUnicode(String unicodeString) {
|
||||
|
@ -93,21 +93,40 @@ public class UnicodeUtils {
|
|||
return true;
|
||||
}
|
||||
|
||||
/** Modifies tibetanUnicode so that it is equivalent, according to
|
||||
the Unicode 3.2 standard, to the input buffer. The Tibetan
|
||||
passages of the returned string are in THDL-canonical form,
|
||||
however. This form uses a maximum of characters, in general,
|
||||
and never uses characters whose use has been {@link
|
||||
#isDiscouraged(char) discouraged}. If the input contains
|
||||
characters for which {@link #isInTibetanRange(char)} is not
|
||||
true, then they will not be modified.
|
||||
/** Puts the Tibetan codepoints in tibetanUnicode, a sequence of
|
||||
Unicode codepoints, into Normalization Form KD (NFKD) as
|
||||
specified by Unicode 3.2. The Tibetan passages of the
|
||||
returned string are in NFKD, but codepoints outside of the
|
||||
range <code>U+0F00</code>-<code>U+0FFF</code> are not
|
||||
necessarily put into NFKD. This form uses a maximum of
|
||||
codepoints, and it never uses codepoints whose use has been
|
||||
{@link #isDiscouraged(char) discouraged}. It would be David
|
||||
Chandler's very favorite form if not for the fact that
|
||||
<code>U+0F0C</code> normalizes to <code>U+0F0B</code> in NFKD.
|
||||
NFD is thus David Chandler's favorite, though it does not
|
||||
decompose <code>U+0F77</code> and <code>U+0F79</code> (for
|
||||
some reason, hopefully a well-thought-out one).
|
||||
|
||||
<p>Recall that NFKD, as it applies to Tibetan codepoints, is
|
||||
closed under string concatenation and under substringing.
|
||||
Note again that if the input contains codepoints for which
|
||||
{@link #isInTibetanRange(char)} is not true, then they will
|
||||
not be modified.</p>
|
||||
|
||||
<p>Note well that only well-formed input guarantees
|
||||
well-formed output.</p> */
|
||||
public static void toCanonicalForm(StringBuffer tibetanUnicode) {
|
||||
well-formed output.</p>
|
||||
|
||||
@param tibetanUnicode the codepoints to be decomposed
|
||||
@param normForm NORM_NFKD or NORM_NFD */
|
||||
public static void toMostlyDecomposedUnicode(StringBuffer tibetanUnicode,
|
||||
byte normForm)
|
||||
{
|
||||
if (normForm != NORM_NFD && normForm != NORM_NFKD)
|
||||
throw new IllegalArgumentException("normForm must be NORM_NFD or NORM_NFKD for decomposition to work");
|
||||
int offset = 0;
|
||||
while (offset < tibetanUnicode.length()) {
|
||||
String s = toCanonicalForm(tibetanUnicode.charAt(offset));
|
||||
String s
|
||||
= toNormalizedForm(tibetanUnicode.charAt(offset), normForm);
|
||||
if (null == s) {
|
||||
++offset;
|
||||
} else {
|
||||
|
@ -118,67 +137,88 @@ public class UnicodeUtils {
|
|||
}
|
||||
}
|
||||
|
||||
/** Like {@link #toCanonicalForm(StringBuffer)}, but does not
|
||||
modify its input. Instead, it returns the canonically-formed
|
||||
version of tibetanUnicode. */
|
||||
public static String toCanonicalForm(String tibetanUnicode) {
|
||||
/** Like {@link #toMostlyDecomposedUnicode(StringBuffer, byte)},
|
||||
but does not modify its input. Instead, it returns the NFKD-
|
||||
or NFD-normalized version of tibetanUnicode. */
|
||||
public static String toMostlyDecomposedUnicode(String tibetanUnicode,
|
||||
byte normForm)
|
||||
{
|
||||
StringBuffer sb = new StringBuffer(tibetanUnicode);
|
||||
toCanonicalForm(sb);
|
||||
toMostlyDecomposedUnicode(sb, normForm);
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
/** There are 19 characters in the Tibetan range of Unicode 3.2
|
||||
which can be decomposed into longer strings of characters in
|
||||
the Tibetan range of Unicode. These 19 are said not to be in
|
||||
THDL-canonical form. This routine returns the canonical form
|
||||
for such characters, and returns null for characters that are
|
||||
already canonical or are not in the Tibetan range of Unicode.
|
||||
@param tibetanUnicodeChar the character to canonicalize
|
||||
@return null if tibetanUnicodeChar is canonical, or a string
|
||||
of two or three characters otherwise */
|
||||
public static String toCanonicalForm(char tibetanUnicodeChar) {
|
||||
switch (tibetanUnicodeChar) {
|
||||
case '\u0F43': return new String(new char[] { '\u0F42', '\u0FB7' });
|
||||
case '\u0F4D': return new String(new char[] { '\u0F4C', '\u0FB7' });
|
||||
case '\u0F52': return new String(new char[] { '\u0F51', '\u0FB7' });
|
||||
case '\u0F57': return new String(new char[] { '\u0F56', '\u0FB7' });
|
||||
case '\u0F5C': return new String(new char[] { '\u0F5B', '\u0FB7' });
|
||||
case '\u0F69': return new String(new char[] { '\u0F40', '\u0FB5' });
|
||||
case '\u0F73': return new String(new char[] { '\u0F71', '\u0F72' });
|
||||
case '\u0F75': return new String(new char[] { '\u0F71', '\u0F74' });
|
||||
case '\u0F76': return new String(new char[] { '\u0FB2', '\u0F80' });
|
||||
case '\u0F77': return new String(new char[] { '\u0FB2', '\u0F71', '\u0F80' });
|
||||
case '\u0F78': return new String(new char[] { '\u0FB3', '\u0F80' });
|
||||
case '\u0F79': return new String(new char[] { '\u0FB3', '\u0F71', '\u0F80' });
|
||||
case '\u0F81': return new String(new char[] { '\u0F71', '\u0F80' });
|
||||
case '\u0F93': return new String(new char[] { '\u0F92', '\u0FB7' });
|
||||
case '\u0F9D': return new String(new char[] { '\u0F9C', '\u0FB7' });
|
||||
case '\u0FA2': return new String(new char[] { '\u0FA1', '\u0FB7' });
|
||||
case '\u0FA7': return new String(new char[] { '\u0FA6', '\u0FB7' });
|
||||
case '\u0FAC': return new String(new char[] { '\u0FAB', '\u0FB7' });
|
||||
case '\u0FB9': return new String(new char[] { '\u0F90', '\u0FB5' });
|
||||
/** There are 19 codepoints in the Tibetan range of Unicode 3.2
|
||||
which can be decomposed into longer strings of codepoints in
|
||||
the Tibetan range of Unicode. Often one wants to manipulate
|
||||
decomposed codepoint strings. Also, HTML and XML are W3C
|
||||
standards that require certain normalization forms. This
|
||||
routine returns a chosen normalized form for such codepoints,
|
||||
and returns null for codepoints that are already normalized or
|
||||
are not in the Tibetan range of Unicode.
|
||||
@param tibetanUnicodeCP the codepoint to normalize
|
||||
@param normalizationForm NORM_NFKD or NORM_NFD if you expect
|
||||
something nontrivial to happen
|
||||
@return null if tibetanUnicodeCP is already in the chosen
|
||||
normalized form, or a string of two or three codepoints
|
||||
otherwise */
|
||||
public static String toNormalizedForm(char tibetanUnicodeCP, byte normalizationForm) {
|
||||
if (normalizationForm == NORM_NFKD
|
||||
|| normalizationForm == NORM_NFD) {
|
||||
// Where not specified, the NFKD form is also the NFD form.
|
||||
switch (tibetanUnicodeCP) {
|
||||
case '\u0F0C': return ((normalizationForm == NORM_NFKD)
|
||||
? "\u0F0B" : null);
|
||||
case '\u0F43': return "\u0F42\u0FB7";
|
||||
case '\u0F4D': return "\u0F4C\u0FB7";
|
||||
case '\u0F52': return "\u0F51\u0FB7";
|
||||
case '\u0F57': return "\u0F56\u0FB7";
|
||||
case '\u0F5C': return "\u0F5B\u0FB7";
|
||||
case '\u0F69': return "\u0F40\u0FB5";
|
||||
case '\u0F73': return "\u0F71\u0F72";
|
||||
case '\u0F75': return "\u0F71\u0F74";
|
||||
case '\u0F76': return "\u0FB2\u0F80";
|
||||
// I do not understand why NFD does not decompose this codepoint:
|
||||
case '\u0F77': return ((normalizationForm == NORM_NFKD)
|
||||
? "\u0FB2\u0F71\u0F80" : null);
|
||||
case '\u0F78': return "\u0FB3\u0F80";
|
||||
// I do not understand why NFD does not decompose this codepoint:
|
||||
case '\u0F79': return ((normalizationForm == NORM_NFKD)
|
||||
? "\u0FB3\u0F71\u0F80" : null);
|
||||
|
||||
default:
|
||||
return null;
|
||||
case '\u0F81': return "\u0F71\u0F80";
|
||||
case '\u0F93': return "\u0F92\u0FB7";
|
||||
case '\u0F9D': return "\u0F9C\u0FB7";
|
||||
case '\u0FA2': return "\u0FA1\u0FB7";
|
||||
case '\u0FA7': return "\u0FA6\u0FB7";
|
||||
case '\u0FAC': return "\u0FAB\u0FB7";
|
||||
case '\u0FB9': return "\u0F90\u0FB5";
|
||||
|
||||
default:
|
||||
return null;
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/** Returns true iff tibetanUnicodeChar {@link
|
||||
#isInTibetanRange(char)} and if the Unicode 3.2 standard
|
||||
discourages the use of tibetanUnicodeChar. */
|
||||
public static boolean isDiscouraged(char tibetanUnicodeChar) {
|
||||
return ('\u0F73' == tibetanUnicodeChar
|
||||
|| '\u0F75' == tibetanUnicodeChar
|
||||
|| '\u0F77' == tibetanUnicodeChar
|
||||
|| '\u0F81' == tibetanUnicodeChar);
|
||||
/** Returns true iff tibetanUnicodeCP {@link
|
||||
#isInTibetanRange(char) is a Tibetan codepoint} and if the
|
||||
Unicode 3.2 standard discourages the use of
|
||||
tibetanUnicodeCP. */
|
||||
public static boolean isDiscouraged(char tibetanUnicodeCP) {
|
||||
return ('\u0F73' == tibetanUnicodeCP
|
||||
|| '\u0F75' == tibetanUnicodeCP
|
||||
|| '\u0F77' == tibetanUnicodeCP
|
||||
|| '\u0F79' == tibetanUnicodeCP
|
||||
|| '\u0F81' == tibetanUnicodeCP);
|
||||
/* DLC FIXME -- I was using 3.0 p.437-440, check 3.2. */
|
||||
}
|
||||
|
||||
/** Returns true iff ch corresponds to the Tibetan letter ra.
|
||||
Several Unicode characters correspond to the Tibetan letter ra
|
||||
Several Unicode codepoints correspond to the Tibetan letter ra
|
||||
(in its subscribed form or otherwise). Oftentimes,
|
||||
<code>\u0F62</code> is thought of as the nominal
|
||||
representation. Returns false for some characters that
|
||||
representation. Returns false for some codepoints that
|
||||
contain ra but are not merely ra, such as <code>\u0F77</code> */
|
||||
public static boolean isRa(char ch) {
|
||||
return ('\u0F62' == ch
|
||||
|
@ -188,7 +228,7 @@ public class UnicodeUtils {
|
|||
}
|
||||
|
||||
/** Returns true iff ch corresponds to the Tibetan letter wa.
|
||||
Several Unicode characters correspond to the Tibetan letter
|
||||
Several Unicode codepoints correspond to the Tibetan letter
|
||||
wa. Oftentimes, <code>\u0F5D</code> is thought of as the
|
||||
nominal representation. */
|
||||
public static boolean isWa(char ch) {
|
||||
|
@ -198,7 +238,7 @@ public class UnicodeUtils {
|
|||
}
|
||||
|
||||
/** Returns true iff ch corresponds to the Tibetan letter ya.
|
||||
Several Unicode characters correspond to the Tibetan letter
|
||||
Several Unicode codepoints correspond to the Tibetan letter
|
||||
ya. Oftentimes, <code>\u0F61</code> is thought of as the
|
||||
nominal representation. */
|
||||
public static boolean isYa(char ch) {
|
||||
|
@ -207,14 +247,14 @@ public class UnicodeUtils {
|
|||
|| '\u0FBB' == ch);
|
||||
}
|
||||
|
||||
/** Returns true iff there exists at least one character ch in
|
||||
unicodeString such that ch {@link #isRa(char) is ra} or contains
|
||||
/** Returns true iff there exists at least one codepoint cp in
|
||||
unicodeString such that cp {@link #isRa(char) is ra} or contains
|
||||
ra (like <code>\u0F77</code>). This method is not implemented
|
||||
as fast as it could be. It calls on the canonicalization code
|
||||
in order to maximize reuse and minimize the possibility of
|
||||
coder error. */
|
||||
public static boolean containsRa(String unicodeString) {
|
||||
String canonForm = toCanonicalForm(unicodeString);
|
||||
String canonForm = toMostlyDecomposedUnicode(unicodeString, NORM_NFKD);
|
||||
for (int i = 0; i < canonForm.length(); i++) {
|
||||
if (isRa(canonForm.charAt(i)))
|
||||
return true;
|
||||
|
@ -223,11 +263,13 @@ public class UnicodeUtils {
|
|||
}
|
||||
/** Inefficient shortcut.
|
||||
@see #containsRa(String) */
|
||||
public static boolean containsRa(char unicodeChar) {
|
||||
return containsRa(new String(new char[] { unicodeChar }));
|
||||
public static boolean containsRa(char unicodeCP) {
|
||||
return containsRa(new String(new char[] { unicodeCP }));
|
||||
}
|
||||
|
||||
public static String unicodeCharToString(char ch) {
|
||||
/** Returns a human-readable, ASCII form of the Unicode codepoint
|
||||
ch. */
|
||||
public static String unicodeCPToString(char ch) {
|
||||
return "U+" + Integer.toHexString((int)ch);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -21,9 +21,9 @@
|
|||
syllable.
|
||||
|
||||
<p>
|
||||
This package allows for turning a string of Unicode characters into
|
||||
This package allows for turning a string of Unicode codepoints into
|
||||
our <i>TTBIR</i>, our Tibetan Tsheg Bar Internal Representation.
|
||||
Said Unicode document may contain non-Tibetan characters also.
|
||||
Said Unicode document may contain non-Tibetan codepoints also.
|
||||
</p>
|
||||
|
||||
</body>
|
||||
|
|
Loading…
Reference in a new issue