Renamed UnicodeCharToExtendedWylie to
UnicodeCodepointToThdlWylie.java. Added a new class, UnicodeGraphemeCluster, that can tell you the components of a grapheme cluster from top to bottom. It does not yet have good error checking; it is not yet finished. Next is to parse clean Unicode into GraphemeClusters. After that comes scanning dirty Unicode into best-guess GraphemeClusters, and scanning dirty Unicode to get nice error messages.
This commit is contained in:
parent
8e8a23c6a6
commit
7ea185fa01
4 changed files with 481 additions and 69 deletions
|
@ -748,7 +748,7 @@ public class LegalTshegBar
|
|||
if (!isNominalRepresentationOfSimpleSuffix(suffix.charAt(0))) {
|
||||
return internalThrowThing(throwIfIllegal,
|
||||
"Illegal suffix -- not one of the ten legal suffixes: "
|
||||
+ UnicodeUtils.unicodeCPToString(suffix.charAt(0)));
|
||||
+ UnicodeUtils.unicodeCodepointToString(suffix.charAt(0)));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -837,7 +837,7 @@ public class LegalTshegBar
|
|||
|
||||
boolean disambiguatorNeeded = false;
|
||||
char prefix = getPrefix();
|
||||
sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(prefix));
|
||||
sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(prefix));
|
||||
if (!hasHeadLetter()) {
|
||||
if (EWC_ya == rootLetter) {
|
||||
if (isConsonantThatTakesYaBtags(prefix))
|
||||
|
@ -857,55 +857,55 @@ public class LegalTshegBar
|
|||
sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY);
|
||||
}
|
||||
if (hasHeadLetter())
|
||||
sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getHeadLetter()));
|
||||
sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(rootLetter));
|
||||
sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getHeadLetter()));
|
||||
sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(rootLetter));
|
||||
if (hasSubjoinedLetter())
|
||||
sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getSubjoinedLetter()));
|
||||
sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getSubjoinedLetter()));
|
||||
if (hasWaZurSubjoinedToRootLetter())
|
||||
sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(EWSUB_wa_zur));
|
||||
sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(EWSUB_wa_zur));
|
||||
|
||||
// a-chung is treated, in THDL Extended Wylie, like a vowel.
|
||||
// I.e., you don't have 'pAa', you have 'pA'.
|
||||
if (hasAChungOnRootLetter()) {
|
||||
if (hasExplicitVowel()) {
|
||||
if (EWV_i == getVowel()) {
|
||||
sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar('\u0F73'));
|
||||
sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint('\u0F73'));
|
||||
} else if (EWV_u == getVowel()) {
|
||||
sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar('\u0F75'));
|
||||
sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint('\u0F75'));
|
||||
} else if (EWV_e == getVowel() || EWV_o == getVowel()) {
|
||||
// The exception to the rule for a-chung and vowels...
|
||||
|
||||
// DLC FIXME: are these allowed in legal Tibetan?
|
||||
// EWTS would have special cases for them if so,
|
||||
// I'd wager...
|
||||
sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(EW_achung));
|
||||
sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getVowel()));
|
||||
sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(EW_achung));
|
||||
sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getVowel()));
|
||||
} else {
|
||||
ThdlDebug.abort("only simple vowels occur in this class, how did this get past internalLegalityTest(..)?");
|
||||
}
|
||||
} else {
|
||||
sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(EW_achung));
|
||||
sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(EW_achung));
|
||||
}
|
||||
} else {
|
||||
if (hasExplicitVowel())
|
||||
sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getVowel()));
|
||||
sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getVowel()));
|
||||
else
|
||||
sb.append("a");
|
||||
}
|
||||
|
||||
if (hasSuffix()) {
|
||||
String suf = getSuffix();
|
||||
sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(suf.charAt(0)));
|
||||
sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(suf.charAt(0)));
|
||||
if (suf.length() > 1) {
|
||||
// DLC assert, don't verify, that the length is two.
|
||||
// This could change if I learn of more suffix
|
||||
// particles.
|
||||
ThdlDebug.verify(2 == suf.length());
|
||||
sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(suf.charAt(1)));
|
||||
sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(suf.charAt(1)));
|
||||
}
|
||||
}
|
||||
if (hasPostsuffix())
|
||||
sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getPostsuffix()));
|
||||
sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getPostsuffix()));
|
||||
return sb;
|
||||
}
|
||||
|
||||
|
@ -929,18 +929,18 @@ public class LegalTshegBar
|
|||
+ "transliterationType=\"THDL Extended Wylie 0.5\" "
|
||||
+ (hasPrefix()
|
||||
? ("prefix=\""
|
||||
+ UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getPrefix()) + "\" ")
|
||||
+ UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getPrefix()) + "\" ")
|
||||
: "")
|
||||
+ (hasHeadLetter()
|
||||
? ("headLetter=\""
|
||||
+ UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getHeadLetter())
|
||||
+ UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getHeadLetter())
|
||||
+ "\" ")
|
||||
: "")
|
||||
+ ("rootLetter=\""
|
||||
+ UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getRootLetter()) + "\" ")
|
||||
+ UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getRootLetter()) + "\" ")
|
||||
+ (hasSubjoinedLetter()
|
||||
? ("subjoinedLetter=\""
|
||||
+ UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getSubjoinedLetter())
|
||||
+ UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getSubjoinedLetter())
|
||||
+ "\" ")
|
||||
: "")
|
||||
+ (hasWaZurSubjoinedToRootLetter()
|
||||
|
@ -953,17 +953,17 @@ public class LegalTshegBar
|
|||
// DLC NOW: what about the root letter a, i.e. \u0F68 ? do we want the EWTS to be 'aa' ?
|
||||
+ ("vowel=\""
|
||||
+ (hasExplicitVowel()
|
||||
? UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getVowel())
|
||||
? UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getVowel())
|
||||
: "a")
|
||||
+ "\" ")
|
||||
+ (hasSuffix()
|
||||
? ("suffix=\""
|
||||
+ UnicodeCharToThdlWylie.getThdlWylieForUnicodeString(getSuffix())
|
||||
+ UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeString(getSuffix())
|
||||
+ "\" ")
|
||||
: "")
|
||||
+ (hasPostsuffix()
|
||||
? ("postsuffix=\""
|
||||
+ UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getPostsuffix())
|
||||
+ UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getPostsuffix())
|
||||
+ "\" ")
|
||||
: "")
|
||||
+ "/>");
|
||||
|
|
|
@ -21,7 +21,7 @@ package org.thdl.tib.text.tshegbar;
|
|||
import org.thdl.tib.text.TibetanMachineWeb;
|
||||
|
||||
/** This noninstantiable class allows for converting from Unicode
|
||||
* codepoints to Extended Wylie. It cannot be used for long
|
||||
* codepoints to THDL Extended Wylie. It cannot be used for long
|
||||
* stretches of text, though, as it is unaware of context, which is
|
||||
* essential to understanding a non-trivial string of Tibetan
|
||||
* Unicode.
|
||||
|
@ -29,21 +29,22 @@ import org.thdl.tib.text.TibetanMachineWeb;
|
|||
* <p>See the document by Nathaniel Garson and David Germano entitled
|
||||
* <i>Extended Wylie Transliteration Scheme</i>. Note that there are
|
||||
* a couple of issues with the November 18, 2001 revision of that
|
||||
* document; these issues are in the Bugs tracker at our SourceForge site.</p>
|
||||
* document; these issues are in the Bugs tracker at our SourceForge
|
||||
* site.</p>
|
||||
*
|
||||
* @see <a href="http://sourceforge.net/projects/thdltools">SourceForge site</a>
|
||||
*
|
||||
* @author David Chandler */
|
||||
public class UnicodeCharToExtendedWylie {
|
||||
public class UnicodeCodepointToThdlWylie {
|
||||
|
||||
/** Returns the extended Wylie for the very simple sequence x.
|
||||
* Returns null iff some (Unicode) char in s has no extended
|
||||
* Wylie representation. This is unaware of context, so use it
|
||||
* sparingly. */
|
||||
public static StringBuffer getExtendedWylieForUnicodeString(String x) {
|
||||
/** Returns the THDL extended Wylie for the very simple sequence
|
||||
* x. Returns null iff some (Unicode) char in x has no THDL
|
||||
* extended Wylie representation. This is unaware of context, so
|
||||
* use it sparingly. */
|
||||
public static StringBuffer getThdlWylieForUnicodeString(String x) {
|
||||
StringBuffer sb = new StringBuffer();
|
||||
for (int i = 0; i < x.length(); i++) {
|
||||
String ew = getExtendedWylieForUnicodeChar(x.charAt(i));
|
||||
String ew = getThdlWylieForUnicodeCodepoint(x.charAt(i));
|
||||
if (null == ew)
|
||||
return null;
|
||||
sb.append(ew);
|
||||
|
@ -51,12 +52,14 @@ public class UnicodeCharToExtendedWylie {
|
|||
return sb;
|
||||
}
|
||||
|
||||
/** Returns the extended Wylie for x, or null if there is none.
|
||||
* Understand that multiple Unicode code points (chars) map to
|
||||
* the same Extended Wylie representation. Understand also that
|
||||
* the scrap of Extended Wylie returned is only valid in certain
|
||||
* contexts. For example, not all consonants take ra-btags. DLC NOW what about canonicalization? */
|
||||
public static String getExtendedWylieForUnicodeChar(char x) {
|
||||
/** Returns the THDL extended Wylie for x, or null if there is
|
||||
* none. Understand that multiple Unicode code points (chars)
|
||||
* map to the same THDL Extended Wylie representation.
|
||||
* Understand also that the scrap of THDL Extended Wylie returned
|
||||
* is only valid in certain contexts. For example, not all
|
||||
* consonants take ra-btags. DLC NOW what about
|
||||
* canonicalization? */
|
||||
public static String getThdlWylieForUnicodeCodepoint(char x) {
|
||||
switch (x) {
|
||||
|
||||
case '\u0F00': return "oM";
|
||||
|
@ -130,9 +133,9 @@ public class UnicodeCharToExtendedWylie {
|
|||
case '\u0F40': return "k";
|
||||
case '\u0F41': return "kh";
|
||||
case '\u0F42': return "g";
|
||||
case '\u0F43': return (getExtendedWylieForUnicodeChar('\u0F42')
|
||||
case '\u0F43': return (getThdlWylieForUnicodeCodepoint('\u0F42')
|
||||
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
|
||||
+ getExtendedWylieForUnicodeChar('\u0FB7'));
|
||||
+ getThdlWylieForUnicodeCodepoint('\u0FB7'));
|
||||
case '\u0F44': return "ng";
|
||||
case '\u0F45': return "c";
|
||||
case '\u0F46': return "ch";
|
||||
|
@ -142,31 +145,31 @@ public class UnicodeCharToExtendedWylie {
|
|||
case '\u0F4A': return "T";
|
||||
case '\u0F4B': return "Th";
|
||||
case '\u0F4C': return "D";
|
||||
case '\u0F4D': return (getExtendedWylieForUnicodeChar('\u0F4C')
|
||||
case '\u0F4D': return (getThdlWylieForUnicodeCodepoint('\u0F4C')
|
||||
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
|
||||
+ getExtendedWylieForUnicodeChar('\u0FB7'));
|
||||
+ getThdlWylieForUnicodeCodepoint('\u0FB7'));
|
||||
case '\u0F4E': return "N";
|
||||
case '\u0F4F': return "t";
|
||||
|
||||
case '\u0F50': return "th";
|
||||
case '\u0F51': return "d";
|
||||
case '\u0F52': return (getExtendedWylieForUnicodeChar('\u0F51')
|
||||
case '\u0F52': return (getThdlWylieForUnicodeCodepoint('\u0F51')
|
||||
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
|
||||
+ getExtendedWylieForUnicodeChar('\u0FB7'));
|
||||
+ getThdlWylieForUnicodeCodepoint('\u0FB7'));
|
||||
case '\u0F53': return "n";
|
||||
case '\u0F54': return "p";
|
||||
case '\u0F55': return "ph";
|
||||
case '\u0F56': return "b";
|
||||
case '\u0F57': return (getExtendedWylieForUnicodeChar('\u0F56')
|
||||
case '\u0F57': return (getThdlWylieForUnicodeCodepoint('\u0F56')
|
||||
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
|
||||
+ getExtendedWylieForUnicodeChar('\u0FB7'));
|
||||
+ getThdlWylieForUnicodeCodepoint('\u0FB7'));
|
||||
case '\u0F58': return "m";
|
||||
case '\u0F59': return "ts";
|
||||
case '\u0F5A': return "tsh";
|
||||
case '\u0F5B': return "dz";
|
||||
case '\u0F5C': return (getExtendedWylieForUnicodeChar('\u0F5B')
|
||||
case '\u0F5C': return (getThdlWylieForUnicodeCodepoint('\u0F5B')
|
||||
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
|
||||
+ getExtendedWylieForUnicodeChar('\u0FB7'));
|
||||
+ getThdlWylieForUnicodeCodepoint('\u0FB7'));
|
||||
case '\u0F5D': return "w";
|
||||
case '\u0F5E': return "zh";
|
||||
case '\u0F5F': return "z";
|
||||
|
@ -180,9 +183,9 @@ public class UnicodeCharToExtendedWylie {
|
|||
case '\u0F66': return "s";
|
||||
case '\u0F67': return "h";
|
||||
case '\u0F68': return "a"; // DLC: maybe the empty string is OK here because typing just 'i' into Jskad causes root letter \u0F68 to appear... yuck...
|
||||
case '\u0F69': return (getExtendedWylieForUnicodeChar('\u0F40')
|
||||
case '\u0F69': return (getThdlWylieForUnicodeCodepoint('\u0F40')
|
||||
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
|
||||
+ getExtendedWylieForUnicodeChar('\u0FB5'));
|
||||
+ getThdlWylieForUnicodeCodepoint('\u0FB5'));
|
||||
case '\u0F6A': return "r";
|
||||
case '\u0F6B': return null;
|
||||
case '\u0F6C': return null;
|
||||
|
@ -227,9 +230,9 @@ public class UnicodeCharToExtendedWylie {
|
|||
case '\u0F90': return "k";
|
||||
case '\u0F91': return "kh";
|
||||
case '\u0F92': return "g";
|
||||
case '\u0F93': return (getExtendedWylieForUnicodeChar('\u0F92')
|
||||
case '\u0F93': return (getThdlWylieForUnicodeCodepoint('\u0F92')
|
||||
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
|
||||
+ getExtendedWylieForUnicodeChar('\u0FB7'));
|
||||
+ getThdlWylieForUnicodeCodepoint('\u0FB7'));
|
||||
case '\u0F94': return "ng";
|
||||
case '\u0F95': return "c";
|
||||
case '\u0F96': return "ch";
|
||||
|
@ -239,31 +242,31 @@ public class UnicodeCharToExtendedWylie {
|
|||
case '\u0F9A': return "T";
|
||||
case '\u0F9B': return "Th";
|
||||
case '\u0F9C': return "D";
|
||||
case '\u0F9D': return (getExtendedWylieForUnicodeChar('\u0F92')
|
||||
case '\u0F9D': return (getThdlWylieForUnicodeCodepoint('\u0F92')
|
||||
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
|
||||
+ getExtendedWylieForUnicodeChar('\u0FB7'));
|
||||
+ getThdlWylieForUnicodeCodepoint('\u0FB7'));
|
||||
case '\u0F9E': return "N";
|
||||
case '\u0F9F': return "t";
|
||||
|
||||
case '\u0FA0': return "th";
|
||||
case '\u0FA1': return "d";
|
||||
case '\u0FA2': return (getExtendedWylieForUnicodeChar('\u0FA1')
|
||||
case '\u0FA2': return (getThdlWylieForUnicodeCodepoint('\u0FA1')
|
||||
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
|
||||
+ getExtendedWylieForUnicodeChar('\u0FB7'));
|
||||
+ getThdlWylieForUnicodeCodepoint('\u0FB7'));
|
||||
case '\u0FA3': return "n";
|
||||
case '\u0FA4': return "p";
|
||||
case '\u0FA5': return "ph";
|
||||
case '\u0FA6': return "b";
|
||||
case '\u0FA7': return (getExtendedWylieForUnicodeChar('\u0FA6')
|
||||
case '\u0FA7': return (getThdlWylieForUnicodeCodepoint('\u0FA6')
|
||||
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
|
||||
+ getExtendedWylieForUnicodeChar('\u0FB7'));
|
||||
+ getThdlWylieForUnicodeCodepoint('\u0FB7'));
|
||||
case '\u0FA8': return "m";
|
||||
case '\u0FA9': return "ts";
|
||||
case '\u0FAA': return "tsh";
|
||||
case '\u0FAB': return "dz";
|
||||
case '\u0FAC': return (getExtendedWylieForUnicodeChar('\u0FAB')
|
||||
case '\u0FAC': return (getThdlWylieForUnicodeCodepoint('\u0FAB')
|
||||
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
|
||||
+ getExtendedWylieForUnicodeChar('\u0FB7'));
|
||||
+ getThdlWylieForUnicodeCodepoint('\u0FB7'));
|
||||
case '\u0FAD': return "w";
|
||||
case '\u0FAE': return "zh";
|
||||
case '\u0FAF': return "z";
|
||||
|
@ -277,9 +280,9 @@ public class UnicodeCharToExtendedWylie {
|
|||
case '\u0FB6': return "s";
|
||||
case '\u0FB7': return "h";
|
||||
case '\u0FB8': return "a"; // DLC see note on \u0F68 ...
|
||||
case '\u0FB9': return (getExtendedWylieForUnicodeChar('\u0F90')
|
||||
case '\u0FB9': return (getThdlWylieForUnicodeCodepoint('\u0F90')
|
||||
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
|
||||
+ getExtendedWylieForUnicodeChar('\u0FB5'));
|
||||
+ getThdlWylieForUnicodeCodepoint('\u0FB5'));
|
||||
case '\u0FBA': return "w";
|
||||
case '\u0FBB': return "y";
|
||||
case '\u0FBC': return "r";
|
||||
|
@ -309,7 +312,7 @@ public class UnicodeCharToExtendedWylie {
|
|||
|
||||
// This codepoint is in the range 0FD0-0FFF or is not in
|
||||
// the Tibetan range at all. In either case, there is no
|
||||
// corresponding Extended Wylie.
|
||||
// corresponding THDL Extended Wylie.
|
||||
return null;
|
||||
}
|
||||
} // end switch
|
377
source/org/thdl/tib/text/tshegbar/UnicodeGraphemeCluster.java
Normal file
377
source/org/thdl/tib/text/tshegbar/UnicodeGraphemeCluster.java
Normal file
|
@ -0,0 +1,377 @@
|
|||
/*
|
||||
The contents of this file are subject to the THDL Open Community License
|
||||
Version 1.0 (the "License"); you may not use this file except in compliance
|
||||
with the License. You may obtain a copy of the License on the THDL web site
|
||||
(http://www.thdl.org/).
|
||||
|
||||
Software distributed under the License is distributed on an "AS IS" basis,
|
||||
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
|
||||
License for the specific terms governing rights and limitations under the
|
||||
License.
|
||||
|
||||
The Initial Developer of this software is the Tibetan and Himalayan Digital
|
||||
Library (THDL). Portions created by the THDL are Copyright 2001 THDL.
|
||||
All Rights Reserved.
|
||||
|
||||
Contributor(s): ______________________________________.
|
||||
*/
|
||||
|
||||
package org.thdl.tib.text.tshegbar;
|
||||
|
||||
import java.util.Vector;
|
||||
|
||||
import org.thdl.util.ThdlDebug;
|
||||
|
||||
/** A UnicodeGraphemeCluster is either a non-Tibetan codepoint (such
|
||||
* as whitespace or control characters or a Latin "character"), or a
|
||||
* vertically stacked set of Tibetan consonants, vowels, marks, and
|
||||
* signs. The Unicode string
|
||||
* <code>"\u0F40\u0F0B\u0F41\u0F0B"</code> specifies
|
||||
* four UnicodeGraphemeClusters (the name of the Tibetan alphabet,
|
||||
* you might notice), while the Unicode string
|
||||
* <code>"\u0F66\u0FA5\u0F39\u0F90\u0FB5\u0F71\u0F80\u0F7F"</code>
|
||||
* is one Tibetan stack, sa over fa over ka over Sha with an a-chung,
|
||||
* a reversed gi-gu, and a visarga, plus a ngas-bzung-sgor-rtags mark
|
||||
* underneath all of that. I assume the latter grapheme cluster is
|
||||
* nonsense, but it is considered one grapheme cluster because all
|
||||
* but the first char are combining chars. See Unicode Technical
|
||||
* Report 29.
|
||||
*
|
||||
* <p>As the above example demonstrates, not all
|
||||
* UnicodeGraphemeClusters are syntactically legal in the Tibetan
|
||||
* language. Not all of them are syntactically legal in Sanskrit
|
||||
* transcribed in the Tibetan alphabet, either.</p>
|
||||
*
|
||||
* <p>The Unicode 3.2 standard (see especially Technical Report 29)
|
||||
* refers to "grapheme clusters." A UnicodeGraphemeCluster is
|
||||
* precisely a grapheme cluster as described by that standard. We
|
||||
* interpret the standard as saying that <code>U+0F3E</code> and
|
||||
* <code>U+0F3F</code> are each grapheme clusters unto themselves,
|
||||
* even though they are combining codepoints.</p>
|
||||
*
|
||||
* @author David Chandler */
|
||||
public class UnicodeGraphemeCluster
|
||||
implements UnicodeReadyThunk, UnicodeConstants
|
||||
{
|
||||
/** @see #getCPHeight(char) */
|
||||
private static final int MIN_HEIGHT = -6;
|
||||
/** @see #getCPHeight(char) */
|
||||
private static final int MAX_HEIGHT = 3;
|
||||
|
||||
/** The Unicode codepoints that compose this grapheme cluster.
|
||||
This is legal, i.e. if there is a Tibetan vowel, it is the
|
||||
last codepoint. It is in Normalization Form THDL (NFTHDL). */
|
||||
private String unicodeString;
|
||||
|
||||
/** Do not use this constructor. */
|
||||
private UnicodeGraphemeCluster() { super(); }
|
||||
|
||||
/** Creates a new GraphemeCluster given a legal sequence of
|
||||
Unicode codepoints corresponding to a single grapheme
|
||||
cluster.
|
||||
@exception IllegalArgumentException if unicodeString is not a
|
||||
syntactically correct Unicode 3.2 sequence (if it begins with
|
||||
a combining codepoint or has a Tibetan vowel before another
|
||||
combining character, for example, or if it is more than one
|
||||
grapheme cluster. Note that syntactical correctness for
|
||||
non-Tibetan codepoints is not likely to be known by this
|
||||
routine. */
|
||||
public UnicodeGraphemeCluster(String unicodeString)
|
||||
throws IllegalArgumentException
|
||||
{
|
||||
// check legality:
|
||||
// DLC NOW FIXME
|
||||
|
||||
// convert to NFTHDL:
|
||||
this.unicodeString
|
||||
= UnicodeUtils.toMostlyDecomposedUnicode(unicodeString, NORM_NFTHDL);
|
||||
}
|
||||
|
||||
/** Returns a string of codepoints in NFTHDL form. */
|
||||
public String getUnicodeRepresentation() {
|
||||
return unicodeString;
|
||||
}
|
||||
|
||||
/** Returns true. */
|
||||
public boolean hasUnicodeRepresentation() {
|
||||
return true;
|
||||
}
|
||||
|
||||
/** Returns true iff this stack could occur in syntactically
|
||||
* correct, run-of-the-mill Tibetan (as opposed to Tibetanized
|
||||
* Sanksrit, Chinese, et cetera). sga is a legal Tibetan stack,
|
||||
* but g+g is not, for example. */
|
||||
public boolean isLegalTibetan() {
|
||||
// DLC FIXME: for those odd head marks etc., return true even
|
||||
// though hasUnicodeRepresentation() will return false.
|
||||
|
||||
// Note that ra-btags and wa-zur both be present in legal
|
||||
// Tibetan.
|
||||
|
||||
throw new Error("DLC FIXME: not yet implemented.");
|
||||
}
|
||||
|
||||
/** Returns a <unicodeGraphemeCluster> element that contains the
|
||||
* THDL Extended Wylie transliteration for this cluster. */
|
||||
public String toConciseXML() {
|
||||
throw new Error("DLC NOW unimplemented");
|
||||
}
|
||||
|
||||
/** Returns a <unicodeGraphemeCluster> element that contains this
|
||||
* cluster broken down into its constituent decomposed
|
||||
* codepoints. */
|
||||
public String toVerboseXML() {
|
||||
throw new Error("DLC NOW unimplemented");
|
||||
}
|
||||
|
||||
/** Returns the THDL Extended Wylie transliteration of this
|
||||
grapheme cluster, or null if there is none (which happens for
|
||||
a few Tibetan codepoints, if you'll recall). If needsVowel is
|
||||
true, then an "a" will be appended when there is no EW_achung
|
||||
or explicit simple vowel. If there is an explicit vowel or
|
||||
EW_achung, it will always be present. Note that needsVowel is
|
||||
provided because btags is the preferred THDL Extended Wylie
|
||||
for the four contiguous grapheme clusters
|
||||
<code>"\u0F56\u0F4F\u0F42\u0F66"</code>, and
|
||||
needsVowel must be set to false for all but the grapheme
|
||||
cluster corresponding to <code>\u0F4F</code> if you wish
|
||||
to get the preferred THDL Extended Wylie. */
|
||||
public String getThdlWylie(boolean needsVowel) {
|
||||
throw new Error("DLC NOW unimplemented");
|
||||
}
|
||||
|
||||
/** Given some (possibly unnormalized) Unicode 3.2 string unicode,
|
||||
appends grapheme clusters to the vector of GraphemeClusters
|
||||
grcls if grcls is nonnulla. Performs good error checking if
|
||||
validate is true. If an error is found, grcls may have been
|
||||
modified if nonnull. Setting grcls to null and setting
|
||||
validate to true is sometimes useful for testing the validity
|
||||
of a Unicode string.
|
||||
@return the number of grapheme clusters that were or would
|
||||
have been added to grcls
|
||||
@exception BadTibetanUnicodeException if the unicode is not
|
||||
syntactically legal
|
||||
@exception IllegalArgumentException if correctErrors and
|
||||
validate are both true
|
||||
@exception NullPointerException if unicode is null */
|
||||
public static int breakUnicodeIntoGraphemeClusters(Vector grcls,
|
||||
String unicode,
|
||||
boolean validate,
|
||||
boolean correctErrors)
|
||||
throws // DLC SOON: BadTibetanUnicodeException,
|
||||
IllegalArgumentException, NullPointerException
|
||||
{
|
||||
if (validate && correctErrors) {
|
||||
throw new IllegalArgumentException("validate and correctErrors cannot both be true.");
|
||||
}
|
||||
throw new Error("DLC NOW unimplemented");
|
||||
/*
|
||||
if (start == i) {
|
||||
// special tests at the beginning of input.
|
||||
if (0 != height || UnicodeUtils.combinesLeftToRight(cp)) {
|
||||
throw new BadTibetanUnicodeException("A combining codepoint was found at the start of input or after a mark that ends a grapheme cluster.");
|
||||
}
|
||||
}
|
||||
if (height == last_height) {
|
||||
if ('\u0F39' == cp) {
|
||||
if (!UnicodeUtils.isTibetanConsonant(last_cp)) {
|
||||
throw new BadTibetanUnicodeException("U+0F39 can only occur after a (possibly subjoined) Tibetan consonant");
|
||||
}
|
||||
} else {
|
||||
// DLC: cp BEGINS A NEW GRAPHEME CLUSTER!!!
|
||||
}
|
||||
}
|
||||
|
||||
// Test to see if this last character has ended this
|
||||
// grapheme cluster:
|
||||
if (UnicodeUtils.isTibetanTerminatingVowel(cp)) {
|
||||
// DLC: cp ENDS A GRAPHEME CLUSTER!!!
|
||||
}
|
||||
*/
|
||||
}
|
||||
|
||||
/** FIXMEDOC */
|
||||
public String getTopToBottomCodepoints() {
|
||||
return getTopToBottomCodepoints(new StringBuffer(unicodeString),
|
||||
0, unicodeString.length()).toString();
|
||||
}
|
||||
|
||||
/** Returns a new StringBuffer consisting of the codepoints in
|
||||
NFTHDLString at indices [start, end) sorted in top-to-bottom
|
||||
order, or null on some occasions when NFTHDLString is already
|
||||
sorted. A top-to-bottom ordering is a useful form for
|
||||
applications wishing to render the grapheme cluster. Note
|
||||
that this method is only useful if NFTHDLString is part of or
|
||||
an entire grapheme cluster. Does no error checking on
|
||||
NFTHDLString.
|
||||
@param NFTHDLString a buffer with characters at indices i,
|
||||
where start <= i < end, being the Unicode codepoints for a
|
||||
single grapheme cluster or part of a grapheme cluster
|
||||
@param start NFTHDLString.charAt(start) is the first codepoint
|
||||
dealt with
|
||||
@param end NFTHDLString.charAt(end) is the first codepoint NOT
|
||||
dealt with
|
||||
@return null only if (but not necessarily if) NFTHDLString is
|
||||
already sorted top-to-bottom, or the sorted form of
|
||||
NFTHDLString */
|
||||
private static StringBuffer getTopToBottomCodepoints(StringBuffer NFTHDLString, /* DLC FIXME: for efficiency, use a ThdlCharIterator. */
|
||||
int start, int end)
|
||||
{
|
||||
if (end <= start) /* 0-length string. */
|
||||
return null;
|
||||
if (start + 1 == end) /* 1-length string. */
|
||||
return null;
|
||||
// else we have a string of length >= 2.
|
||||
|
||||
// We'll use the world's fastest sorting algorithm. Linear
|
||||
// time, baby. Here are the ten or so mailboxes for our
|
||||
// postman's sort:
|
||||
StringBuffer chunksAtCommonHeights[]
|
||||
= new StringBuffer[(MAX_HEIGHT + 1) - MIN_HEIGHT];
|
||||
|
||||
for (int i = start; i < end; i++) {
|
||||
char cp = NFTHDLString.charAt(i);
|
||||
int height = getCPHeight(cp);
|
||||
|
||||
// initialize mailbox if necessary.
|
||||
if (null == chunksAtCommonHeights[height - MIN_HEIGHT]) {
|
||||
chunksAtCommonHeights[height - MIN_HEIGHT]
|
||||
= new StringBuffer(2);
|
||||
}
|
||||
|
||||
// put this cp into the correct mailbox.
|
||||
chunksAtCommonHeights[height - MIN_HEIGHT].append(cp);
|
||||
}
|
||||
|
||||
// Now concatenate together the mailboxes:
|
||||
StringBuffer sb = new StringBuffer(end - start);
|
||||
for (int h = MAX_HEIGHT; h >= MIN_HEIGHT; h--) {
|
||||
if (null != chunksAtCommonHeights[h - MIN_HEIGHT]) {
|
||||
sb.append(chunksAtCommonHeights[h - MIN_HEIGHT]);
|
||||
}
|
||||
}
|
||||
return sb;
|
||||
}
|
||||
|
||||
|
||||
/** Returns the <i>height</i> for the Tibetan Unicode codepoint x.
|
||||
This relative height is 0 for a base consonant, digit,
|
||||
punctuation, mark, or sign. It is -1 for a subjoined
|
||||
consonant, -2 for EWSUB_wa_zur, -3 for EW_achung, +1 for
|
||||
EWV_gigu, and so on according to the height these codepoints
|
||||
appear relative to one another when on the same stack. If two
|
||||
codepoints have equal height, they should not exist in the
|
||||
same grapheme cluster unless one is <code>U+0F39</code>, which
|
||||
is an integral part of a consonant when tacked on to, e.g.,
|
||||
EWC_PHA.
|
||||
|
||||
<p>If x is not a Unicode 3.2 codepoint in the Tibetan range,
|
||||
or if x is not in NFTHDL form, 0 is returned. The height code
|
||||
of <code>U+0F76</code> is not valid, and it is not an accident
|
||||
that <code>U+0F76</code> is not in NFTHDL form.</p> */
|
||||
private static int getCPHeight(char x) {
|
||||
// DLC make this an assertion:
|
||||
ThdlDebug.verify(null == UnicodeUtils.toNormalizedForm(x, NORM_NFTHDL));
|
||||
|
||||
if (x >= '\u0F90' && x <= '\u0FAC'
|
||||
|| x >= '\u0FAE' && x <= '\u0FBC') {
|
||||
// subjoined consonant. Note that wa-zur is an exception.
|
||||
return -1;
|
||||
} else if (x >= '\u0F00' && x <= '\u0F17'
|
||||
|| x >= '\u0F1A' && x <= '\u0F34'
|
||||
|| x >= '\u0F3A' && x <= '\u0F3D'
|
||||
|| x >= '\u0F40' && x <= '\u0F6A' // consonants
|
||||
|| x >= '\u0F88' && x <= '\u0F8B'
|
||||
|| x >= '\u0FBE' && x <= '\u0FCF') {
|
||||
// neutral height:
|
||||
return 0;
|
||||
} else { // Oddballs.
|
||||
switch (x) {
|
||||
//
|
||||
// non-combining:
|
||||
//
|
||||
case '\u0F36':
|
||||
case '\u0F38':
|
||||
case '\u0F85':
|
||||
return 0;
|
||||
|
||||
|
||||
//
|
||||
// combining, but left-to-right combining:
|
||||
//
|
||||
case '\u0F3E':
|
||||
case '\u0F3F':
|
||||
case '\u0F7F':
|
||||
return 0;
|
||||
|
||||
|
||||
//
|
||||
// combining by coming below:
|
||||
//
|
||||
case '\u0FAD':
|
||||
return -2; // wa-zur
|
||||
case '\u0F71':
|
||||
return -3; // a-chung
|
||||
case '\u0F74':
|
||||
case '\u0F84':
|
||||
return -4; // DLC CHECKME
|
||||
case '\u0F18': // combines with digits
|
||||
case '\u0F19': // combines with digits
|
||||
return -5;
|
||||
case '\u0F35':
|
||||
case '\u0F37':
|
||||
case '\u0FC6': {
|
||||
ThdlDebug.verify(-6 == MIN_HEIGHT);
|
||||
return -6; // min height
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
// combining by coming above:
|
||||
//
|
||||
case '\u0F72':
|
||||
case '\u0F7A':
|
||||
case '\u0F7B':
|
||||
case '\u0F7C':
|
||||
case '\u0F7D':
|
||||
case '\u0F80':
|
||||
return 1;
|
||||
case '\u0F7E':
|
||||
case '\u0F82':
|
||||
case '\u0F83':
|
||||
return 2; // these three come above 0F7C, right? (DLC CHECKME)
|
||||
case '\u0F86':
|
||||
case '\u0F87': {
|
||||
ThdlDebug.verify(3 == MAX_HEIGHT);
|
||||
return 3; // max height
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
// exceptional case:
|
||||
//
|
||||
// some would say +1, but then "\u0F40\u0FA5\u0F39" will
|
||||
// not have a5 combine with 39. Unicode could well have
|
||||
// put in a single codepoint for "\u0FA5\u0F39" IMO.
|
||||
case '\u0F39': return 0;
|
||||
|
||||
|
||||
default: {
|
||||
if (x >= '\u0F00' && x <= '\u0FFF') {
|
||||
// This wasn't explicitly handled? Hmmm... This
|
||||
// won't ever happen for NFTHDL-formed input.
|
||||
ThdlDebug.noteIffyCode();
|
||||
}
|
||||
|
||||
// This codepoint is not in the Tibetan range.
|
||||
return 0;
|
||||
}
|
||||
} // end switch
|
||||
}
|
||||
}
|
||||
/** DLC SOON */
|
||||
public boolean isTibetan() {
|
||||
throw new Error("DLC FIXME: not yet implemented.");
|
||||
}
|
||||
}
|
||||
|
|
@ -97,10 +97,12 @@ public class UnicodeUtils implements UnicodeConstants {
|
|||
Unicode codepoints, into either Normalization Form KD (NFKD),
|
||||
D (NFD), or THDL (NFTHDL), depending on the value of normForm.
|
||||
NFD and NFKD are specified by Unicode 3.2; NFTHDL is needed
|
||||
for {@link org.thdl.tib.text.tshegbar#GraphemeCluster} because
|
||||
NFKD normalizes <code>U+0F0C</code>. NFTHDL uses a maximum of
|
||||
codepoints, and it never uses codepoints whose use has been
|
||||
{@link #isDiscouraged(char) discouraged}.
|
||||
for {@link org.thdl.tib.text.tshegbar.UnicodeGraphemeCluster}
|
||||
because NFKD normalizes <code>U+0F0C</code> and neither NFD
|
||||
nor NFKD breaks down <code>U+0F00</code> into its constituent
|
||||
codepoints. NFTHDL uses a maximum of codepoints, and it never
|
||||
uses codepoints whose use has been {@link #isDiscouraged(char)
|
||||
discouraged}.
|
||||
|
||||
<p>The Tibetan passages of the returned string are in the
|
||||
chosen normalized form, but codepoints outside of the {@link
|
||||
|
@ -170,6 +172,8 @@ public class UnicodeUtils implements UnicodeConstants {
|
|||
// Where not specified, the NFKD and NFTHDL forms are
|
||||
// identical to the NFD form.
|
||||
switch (tibetanUnicodeCP) {
|
||||
case '\u0F00': return ((normalizationForm == NORM_NFTHDL)
|
||||
? "\u0F68\u0F7C\u0F7E" : null);
|
||||
case '\u0F0C': return ((normalizationForm == NORM_NFKD)
|
||||
? "\u0F0B" : null);
|
||||
case '\u0F43': return "\u0F42\u0FB7";
|
||||
|
@ -282,9 +286,37 @@ public class UnicodeUtils implements UnicodeConstants {
|
|||
}
|
||||
|
||||
/** Returns a human-readable, ASCII form of the Unicode codepoint
|
||||
ch. */
|
||||
public static String unicodeCPToString(char ch) {
|
||||
return "U+" + Integer.toHexString((int)ch);
|
||||
cp. */
|
||||
public static String unicodeCodepointToString(char cp) {
|
||||
if (cp < '\u0010')
|
||||
return "\\u000" + Integer.toHexString((int)cp);
|
||||
else if (cp < '\u0100')
|
||||
return "\\u00" + Integer.toHexString((int)cp);
|
||||
else if (cp < '\u1000')
|
||||
return "\\u0" + Integer.toHexString((int)cp);
|
||||
else
|
||||
return "\\u" + Integer.toHexString((int)cp);
|
||||
}
|
||||
|
||||
public static String unicodeStringToString(String s) {
|
||||
StringBuffer sb = new StringBuffer(s.length() * 6);
|
||||
for (int i = 0; i < s.length(); i++) {
|
||||
sb.append(unicodeCodepointToString(s.charAt(i)));
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
/** Returns true iff cp is a Unicode 3.2 Tibetan consonant,
|
||||
subjoined or not. This counts precomposed consonant stacks
|
||||
like <code>U+0FA7</code> as consonants. If you don't wish to
|
||||
treat such as consonants, then put the input into NORM_NFD,
|
||||
NORM_NFKD, or NORM_NFTHDL first. If it changes under such a
|
||||
normalization, it is a precomposed consonant. */
|
||||
public static boolean isTibetanConsonant(char cp) {
|
||||
return (((cp >= '\u0F40' && cp <= '\u0F6A')
|
||||
|| (cp >= '\u0F90' && cp <= '\u0FBC'))
|
||||
&& '\u0F48' != cp
|
||||
&& '\u0F98' != cp);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in a new issue