7ea185fa01
UnicodeCodepointToThdlWylie.java. Added a new class, UnicodeGraphemeCluster, that can tell you the components of a grapheme cluster from top to bottom. It does not yet have good error checking; it is not yet finished. Next is to parse clean Unicode into GraphemeClusters. After that comes scanning dirty Unicode into best-guess GraphemeClusters, and scanning dirty Unicode to get nice error messages.
321 lines
13 KiB
Java
321 lines
13 KiB
Java
/*
|
|
The contents of this file are subject to the THDL Open Community License
|
|
Version 1.0 (the "License"); you may not use this file except in compliance
|
|
with the License. You may obtain a copy of the License on the THDL web site
|
|
(http://www.thdl.org/).
|
|
|
|
Software distributed under the License is distributed on an "AS IS" basis,
|
|
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
|
|
License for the specific terms governing rights and limitations under the
|
|
License.
|
|
|
|
The Initial Developer of this software is the Tibetan and Himalayan Digital
|
|
Library (THDL). Portions created by the THDL are Copyright 2001 THDL.
|
|
All Rights Reserved.
|
|
|
|
Contributor(s): ______________________________________.
|
|
*/
|
|
|
|
package org.thdl.tib.text.tshegbar;
|
|
|
|
import org.thdl.tib.text.TibetanMachineWeb;
|
|
|
|
/** This noninstantiable class allows for converting from Unicode
 *  codepoints to THDL Extended Wylie.  It cannot be used for long
 *  stretches of text, though, as it is unaware of context, which is
 *  essential to understanding a non-trivial string of Tibetan
 *  Unicode.
 *
 *  <p>See the document by Nathaniel Garson and David Germano entitled
 *  <i>Extended Wylie Transliteration Scheme</i>.  Note that there are
 *  a couple of issues with the November 18, 2001 revision of that
 *  document; these issues are in the Bugs tracker at our SourceForge
 *  site.</p>
 *
 *  @see <a href="http://sourceforge.net/projects/thdltools">SourceForge site</a>
 *
 *  @author David Chandler */
|
|
public class UnicodeCodepointToThdlWylie {
|
|
|
|
/** Returns the THDL extended Wylie for the very simple sequence
|
|
* x. Returns null iff some (Unicode) char in s has no THDL
|
|
* extended Wylie representation. This is unaware of context, so
|
|
* use it sparingly. */
|
|
public static StringBuffer getThdlWylieForUnicodeString(String x) {
|
|
StringBuffer sb = new StringBuffer();
|
|
for (int i = 0; i < x.length(); i++) {
|
|
String ew = getThdlWylieForUnicodeCodepoint(x.charAt(i));
|
|
if (null == ew)
|
|
return null;
|
|
sb.append(ew);
|
|
}
|
|
return sb;
|
|
}
|
|
|
|
/** Returns the THDL extended Wylie for x, or null if there is
|
|
* none. Understand that multiple Unicode code points (chars)
|
|
* map to the same THDL Extended Wylie representation.
|
|
* Understand also that the scrap of THDL Extended Wylie returned
|
|
* is only valid in certain contexts. For example, not all
|
|
* consonants take ra-btags. DLC NOW what about
|
|
* canonicalization? */
|
|
public static String getThdlWylieForUnicodeCodepoint(char x) {
|
|
switch (x) {
|
|
|
|
case '\u0F00': return "oM";
|
|
case '\u0F01': return null;
|
|
case '\u0F02': return null;
|
|
case '\u0F03': return null;
|
|
case '\u0F04': return "@";
|
|
case '\u0F05': return "#";
|
|
case '\u0F06': return "$";
|
|
case '\u0F07': return "%";
|
|
case '\u0F08': return "!";
|
|
case '\u0F09': return null;
|
|
case '\u0F0A': return null;
|
|
case '\u0F0B': return " ";
|
|
case '\u0F0C': return "*"; // DLC NOW: Jskad does not support this!
|
|
case '\u0F0D': return "/";
|
|
case '\u0F0E': return "//"; // DLC FIXME: this is kind of a hack-- the Unicode standard says the spacing for this construct is different than the spacing for "\u0F0D\u0F0D"
|
|
case '\u0F0F': return ";";
|
|
|
|
case '\u0F10': return "[";
|
|
case '\u0F11': return "|";
|
|
case '\u0F12': return "]";
|
|
case '\u0F13': return "`";
|
|
case '\u0F14': return ":";
|
|
case '\u0F15': return null;
|
|
case '\u0F16': return null;
|
|
case '\u0F17': return null;
|
|
case '\u0F18': return null;
|
|
case '\u0F19': return null;
|
|
case '\u0F1A': return null;
|
|
case '\u0F1B': return null;
|
|
case '\u0F1C': return null;
|
|
case '\u0F1D': return null;
|
|
case '\u0F1E': return null;
|
|
case '\u0F1F': return null;
|
|
|
|
case '\u0F20': return "0";
|
|
case '\u0F21': return "1";
|
|
case '\u0F22': return "2";
|
|
case '\u0F23': return "3";
|
|
case '\u0F24': return "4";
|
|
case '\u0F25': return "5";
|
|
case '\u0F26': return "6";
|
|
case '\u0F27': return "7";
|
|
case '\u0F28': return "8";
|
|
case '\u0F29': return "9";
|
|
case '\u0F2A': return null;
|
|
case '\u0F2B': return null;
|
|
case '\u0F2C': return null;
|
|
case '\u0F2D': return null;
|
|
case '\u0F2E': return null;
|
|
case '\u0F2F': return null;
|
|
|
|
case '\u0F30': return null;
|
|
case '\u0F31': return null;
|
|
case '\u0F32': return null;
|
|
case '\u0F33': return null;
|
|
case '\u0F34': return "=";
|
|
case '\u0F35': return null;
|
|
case '\u0F36': return null;
|
|
case '\u0F37': return null;
|
|
case '\u0F38': return null;
|
|
case '\u0F39': return null;
|
|
case '\u0F3A': return "<";
|
|
case '\u0F3B': return ">";
|
|
case '\u0F3C': return "(";
|
|
case '\u0F3D': return ")";
|
|
case '\u0F3E': return "{";
|
|
case '\u0F3F': return "}";
|
|
|
|
case '\u0F40': return "k";
|
|
case '\u0F41': return "kh";
|
|
case '\u0F42': return "g";
|
|
case '\u0F43': return (getThdlWylieForUnicodeCodepoint('\u0F42')
|
|
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
|
|
+ getThdlWylieForUnicodeCodepoint('\u0FB7'));
|
|
case '\u0F44': return "ng";
|
|
case '\u0F45': return "c";
|
|
case '\u0F46': return "ch";
|
|
case '\u0F47': return "j";
|
|
case '\u0F48': return null;
|
|
case '\u0F49': return "ny";
|
|
case '\u0F4A': return "T";
|
|
case '\u0F4B': return "Th";
|
|
case '\u0F4C': return "D";
|
|
case '\u0F4D': return (getThdlWylieForUnicodeCodepoint('\u0F4C')
|
|
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
|
|
+ getThdlWylieForUnicodeCodepoint('\u0FB7'));
|
|
case '\u0F4E': return "N";
|
|
case '\u0F4F': return "t";
|
|
|
|
case '\u0F50': return "th";
|
|
case '\u0F51': return "d";
|
|
case '\u0F52': return (getThdlWylieForUnicodeCodepoint('\u0F51')
|
|
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
|
|
+ getThdlWylieForUnicodeCodepoint('\u0FB7'));
|
|
case '\u0F53': return "n";
|
|
case '\u0F54': return "p";
|
|
case '\u0F55': return "ph";
|
|
case '\u0F56': return "b";
|
|
case '\u0F57': return (getThdlWylieForUnicodeCodepoint('\u0F56')
|
|
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
|
|
+ getThdlWylieForUnicodeCodepoint('\u0FB7'));
|
|
case '\u0F58': return "m";
|
|
case '\u0F59': return "ts";
|
|
case '\u0F5A': return "tsh";
|
|
case '\u0F5B': return "dz";
|
|
case '\u0F5C': return (getThdlWylieForUnicodeCodepoint('\u0F5B')
|
|
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
|
|
+ getThdlWylieForUnicodeCodepoint('\u0FB7'));
|
|
case '\u0F5D': return "w";
|
|
case '\u0F5E': return "zh";
|
|
case '\u0F5F': return "z";
|
|
|
|
case '\u0F60': return "'";
|
|
case '\u0F61': return "y";
|
|
case '\u0F62': return "r";
|
|
case '\u0F63': return "l";
|
|
case '\u0F64': return "sh";
|
|
case '\u0F65': return "Sh";
|
|
case '\u0F66': return "s";
|
|
case '\u0F67': return "h";
|
|
case '\u0F68': return "a"; // DLC: maybe the empty string is OK here because typing just 'i' into Jskad causes root letter \u0F68 to appear... yuck...
|
|
case '\u0F69': return (getThdlWylieForUnicodeCodepoint('\u0F40')
|
|
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
|
|
+ getThdlWylieForUnicodeCodepoint('\u0FB5'));
|
|
case '\u0F6A': return "r";
|
|
case '\u0F6B': return null;
|
|
case '\u0F6C': return null;
|
|
case '\u0F6D': return null;
|
|
case '\u0F6E': return null;
|
|
case '\u0F6F': return null;
|
|
|
|
case '\u0F70': return null;
|
|
case '\u0F71': return "A";
|
|
case '\u0F72': return "i";
|
|
case '\u0F73': return "I";
|
|
case '\u0F74': return "u";
|
|
case '\u0F75': return "U";
|
|
case '\u0F76': return "r-i"; // DLC Ri or r-i? I put in a bug report.
|
|
case '\u0F77': return "r-I"; // DLC or RI?
|
|
case '\u0F78': return "l-i";
|
|
case '\u0F79': return "l-I";
|
|
case '\u0F7A': return "e";
|
|
case '\u0F7B': return "ai";
|
|
case '\u0F7C': return "o";
|
|
case '\u0F7D': return "au";
|
|
case '\u0F7E': return "M";
|
|
case '\u0F7F': return "H";
|
|
|
|
case '\u0F80': return "-i";
|
|
case '\u0F81': return "-I";
|
|
case '\u0F82': return "~^";// DLC unsupported in Jskad
|
|
case '\u0F83': return "~"; // DLC unsupported in Jskad
|
|
case '\u0F84': return "?";
|
|
case '\u0F85': return "&";
|
|
case '\u0F86': return null;
|
|
case '\u0F87': return null;
|
|
case '\u0F88': return null;
|
|
case '\u0F89': return null;
|
|
case '\u0F8A': return null;
|
|
case '\u0F8B': return null;
|
|
case '\u0F8C': return null;
|
|
case '\u0F8D': return null;
|
|
case '\u0F8E': return null;
|
|
case '\u0F8F': return null;
|
|
|
|
case '\u0F90': return "k";
|
|
case '\u0F91': return "kh";
|
|
case '\u0F92': return "g";
|
|
case '\u0F93': return (getThdlWylieForUnicodeCodepoint('\u0F92')
|
|
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
|
|
+ getThdlWylieForUnicodeCodepoint('\u0FB7'));
|
|
case '\u0F94': return "ng";
|
|
case '\u0F95': return "c";
|
|
case '\u0F96': return "ch";
|
|
case '\u0F97': return "j";
|
|
case '\u0F98': return null;
|
|
case '\u0F99': return "ny";
|
|
case '\u0F9A': return "T";
|
|
case '\u0F9B': return "Th";
|
|
case '\u0F9C': return "D";
|
|
case '\u0F9D': return (getThdlWylieForUnicodeCodepoint('\u0F92')
|
|
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
|
|
+ getThdlWylieForUnicodeCodepoint('\u0FB7'));
|
|
case '\u0F9E': return "N";
|
|
case '\u0F9F': return "t";
|
|
|
|
case '\u0FA0': return "th";
|
|
case '\u0FA1': return "d";
|
|
case '\u0FA2': return (getThdlWylieForUnicodeCodepoint('\u0FA1')
|
|
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
|
|
+ getThdlWylieForUnicodeCodepoint('\u0FB7'));
|
|
case '\u0FA3': return "n";
|
|
case '\u0FA4': return "p";
|
|
case '\u0FA5': return "ph";
|
|
case '\u0FA6': return "b";
|
|
case '\u0FA7': return (getThdlWylieForUnicodeCodepoint('\u0FA6')
|
|
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
|
|
+ getThdlWylieForUnicodeCodepoint('\u0FB7'));
|
|
case '\u0FA8': return "m";
|
|
case '\u0FA9': return "ts";
|
|
case '\u0FAA': return "tsh";
|
|
case '\u0FAB': return "dz";
|
|
case '\u0FAC': return (getThdlWylieForUnicodeCodepoint('\u0FAB')
|
|
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
|
|
+ getThdlWylieForUnicodeCodepoint('\u0FB7'));
|
|
case '\u0FAD': return "w";
|
|
case '\u0FAE': return "zh";
|
|
case '\u0FAF': return "z";
|
|
|
|
case '\u0FB0': return "'";
|
|
case '\u0FB1': return "y";
|
|
case '\u0FB2': return "r";
|
|
case '\u0FB3': return "l";
|
|
case '\u0FB4': return "sh";
|
|
case '\u0FB5': return "Sh";
|
|
case '\u0FB6': return "s";
|
|
case '\u0FB7': return "h";
|
|
case '\u0FB8': return "a"; // DLC see note on \u0F68 ...
|
|
case '\u0FB9': return (getThdlWylieForUnicodeCodepoint('\u0F90')
|
|
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
|
|
+ getThdlWylieForUnicodeCodepoint('\u0FB5'));
|
|
case '\u0FBA': return "w";
|
|
case '\u0FBB': return "y";
|
|
case '\u0FBC': return "r";
|
|
case '\u0FBD': return null;
|
|
case '\u0FBE': return null;
|
|
case '\u0FBF': return null;
|
|
|
|
case '\u0FC0': return null;
|
|
case '\u0FC1': return null;
|
|
case '\u0FC2': return null;
|
|
case '\u0FC3': return null;
|
|
case '\u0FC4': return null;
|
|
case '\u0FC5': return null;
|
|
case '\u0FC6': return null;
|
|
case '\u0FC7': return null;
|
|
case '\u0FC8': return null;
|
|
case '\u0FC9': return null;
|
|
case '\u0FCA': return null;
|
|
case '\u0FCB': return null;
|
|
case '\u0FCC': return null;
|
|
case '\u0FCD': return null;
|
|
case '\u0FCE': return null;
|
|
case '\u0FCF': return ""; // DLC i added this to the 'EWTS document misspeaks' bug report... null I think...
|
|
|
|
default: {
|
|
// DLC handle space (EW's "_")
|
|
|
|
// This codepoint is in the range 0FD0-0FFF or is not in
|
|
// the Tibetan range at all. In either case, there is no
|
|
// corresponding THDL Extended Wylie.
|
|
return null;
|
|
}
|
|
} // end switch
|
|
}
|
|
}
|
|
|