Jskad/source/org/thdl/tib/text/tshegbar/UnicodeCharToExtendedWylie.java
dchandler a42347b224 Now uses terminology from the Unicode standard. No more talk of
characters, for example.

Normalization forms NFKD and NFD are supported for the Tibetan Unicode
range.  I don't like either, actually.  I've tested NFKD, but I've not yet
committed the tests.
2002-12-15 03:35:24 +00:00

318 lines
13 KiB
Java

/*
The contents of this file are subject to the THDL Open Community License
Version 1.0 (the "License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License on the THDL web site
(http://www.thdl.org/).
Software distributed under the License is distributed on an "AS IS" basis,
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
License for the specific terms governing rights and limitations under the
License.
The Initial Developer of this software is the Tibetan and Himalayan Digital
Library (THDL). Portions created by the THDL are Copyright 2001 THDL.
All Rights Reserved.
Contributor(s): ______________________________________.
*/
package org.thdl.tib.text.tshegbar;
import org.thdl.tib.text.TibetanMachineWeb;
/** This noninstantiable class allows for converting from Unicode
* codepoints to Extended Wylie. It cannot be used for long
* stretches of text, though, as it is unaware of context, which is
* essential to understanding a non-trivial string of Tibetan
* Unicode.
*
* <p>See the document by Nathaniel Garson and David Germano entitled
* <i>Extended Wylie Transliteration Scheme</i>. Note that there are
* a couple of issues with the November 18, 2001 revision of that
* document; these issues are in the Bugs tracker at our SourceForge site.</p>
*
* @see <a href="http://sourceforge.net/projects/thdltools">SourceForge site</a>
*
* @author David Chandler */
public class UnicodeCharToExtendedWylie {
/** Returns the extended Wylie for the very simple sequence x.
* Returns null iff some (Unicode) char in s has no extended
* Wylie representation. This is unaware of context, so use it
* sparingly. */
public static StringBuffer getExtendedWylieForUnicodeString(String x) {
StringBuffer sb = new StringBuffer();
for (int i = 0; i < x.length(); i++) {
String ew = getExtendedWylieForUnicodeChar(x.charAt(i));
if (null == ew)
return null;
sb.append(ew);
}
return sb;
}
/** Returns the extended Wylie for x, or null if there is none.
* Understand that multiple Unicode code points (chars) map to
* the same Extended Wylie representation. Understand also that
* the scrap of Extended Wylie returned is only valid in certain
* contexts. For example, not all consonants take ra-btags. DLC NOW what about canonicalization? */
public static String getExtendedWylieForUnicodeChar(char x) {
switch (x) {
case '\u0F00': return "oM";
case '\u0F01': return null;
case '\u0F02': return null;
case '\u0F03': return null;
case '\u0F04': return "@";
case '\u0F05': return "#";
case '\u0F06': return "$";
case '\u0F07': return "%";
case '\u0F08': return "!";
case '\u0F09': return null;
case '\u0F0A': return null;
case '\u0F0B': return " ";
case '\u0F0C': return "*"; // DLC NOW: Jskad does not support this!
case '\u0F0D': return "/";
case '\u0F0E': return "//"; // DLC FIXME: this is kind of a hack-- the Unicode standard says the spacing for this construct is different than the spacing for "\u0F0D\u0F0D"
case '\u0F0F': return ";";
case '\u0F10': return "[";
case '\u0F11': return "|";
case '\u0F12': return "]";
case '\u0F13': return "`";
case '\u0F14': return ":";
case '\u0F15': return null;
case '\u0F16': return null;
case '\u0F17': return null;
case '\u0F18': return null;
case '\u0F19': return null;
case '\u0F1A': return null;
case '\u0F1B': return null;
case '\u0F1C': return null;
case '\u0F1D': return null;
case '\u0F1E': return null;
case '\u0F1F': return null;
case '\u0F20': return "0";
case '\u0F21': return "1";
case '\u0F22': return "2";
case '\u0F23': return "3";
case '\u0F24': return "4";
case '\u0F25': return "5";
case '\u0F26': return "6";
case '\u0F27': return "7";
case '\u0F28': return "8";
case '\u0F29': return "9";
case '\u0F2A': return null;
case '\u0F2B': return null;
case '\u0F2C': return null;
case '\u0F2D': return null;
case '\u0F2E': return null;
case '\u0F2F': return null;
case '\u0F30': return null;
case '\u0F31': return null;
case '\u0F32': return null;
case '\u0F33': return null;
case '\u0F34': return "=";
case '\u0F35': return null;
case '\u0F36': return null;
case '\u0F37': return null;
case '\u0F38': return null;
case '\u0F39': return null;
case '\u0F3A': return "<";
case '\u0F3B': return ">";
case '\u0F3C': return "(";
case '\u0F3D': return ")";
case '\u0F3E': return "{";
case '\u0F3F': return "}";
case '\u0F40': return "k";
case '\u0F41': return "kh";
case '\u0F42': return "g";
case '\u0F43': return (getExtendedWylieForUnicodeChar('\u0F42')
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
+ getExtendedWylieForUnicodeChar('\u0FB7'));
case '\u0F44': return "ng";
case '\u0F45': return "c";
case '\u0F46': return "ch";
case '\u0F47': return "j";
case '\u0F48': return null;
case '\u0F49': return "ny";
case '\u0F4A': return "T";
case '\u0F4B': return "Th";
case '\u0F4C': return "D";
case '\u0F4D': return (getExtendedWylieForUnicodeChar('\u0F4C')
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
+ getExtendedWylieForUnicodeChar('\u0FB7'));
case '\u0F4E': return "N";
case '\u0F4F': return "t";
case '\u0F50': return "th";
case '\u0F51': return "d";
case '\u0F52': return (getExtendedWylieForUnicodeChar('\u0F51')
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
+ getExtendedWylieForUnicodeChar('\u0FB7'));
case '\u0F53': return "n";
case '\u0F54': return "p";
case '\u0F55': return "ph";
case '\u0F56': return "b";
case '\u0F57': return (getExtendedWylieForUnicodeChar('\u0F56')
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
+ getExtendedWylieForUnicodeChar('\u0FB7'));
case '\u0F58': return "m";
case '\u0F59': return "ts";
case '\u0F5A': return "tsh";
case '\u0F5B': return "dz";
case '\u0F5C': return (getExtendedWylieForUnicodeChar('\u0F5B')
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
+ getExtendedWylieForUnicodeChar('\u0FB7'));
case '\u0F5D': return "w";
case '\u0F5E': return "zh";
case '\u0F5F': return "z";
case '\u0F60': return "'";
case '\u0F61': return "y";
case '\u0F62': return "r";
case '\u0F63': return "l";
case '\u0F64': return "sh";
case '\u0F65': return "Sh";
case '\u0F66': return "s";
case '\u0F67': return "h";
case '\u0F68': return "a"; // DLC: maybe the empty string is OK here because typing just 'i' into Jskad causes root letter \u0F68 to appear... yuck...
case '\u0F69': return (getExtendedWylieForUnicodeChar('\u0F40')
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
+ getExtendedWylieForUnicodeChar('\u0FB5'));
case '\u0F6A': return "r";
case '\u0F6B': return null;
case '\u0F6C': return null;
case '\u0F6D': return null;
case '\u0F6E': return null;
case '\u0F6F': return null;
case '\u0F70': return null;
case '\u0F71': return "A";
case '\u0F72': return "i";
case '\u0F73': return "I";
case '\u0F74': return "u";
case '\u0F75': return "U";
case '\u0F76': return "r-i"; // DLC Ri or r-i? I put in a bug report.
case '\u0F77': return "r-I"; // DLC or RI?
case '\u0F78': return "l-i";
case '\u0F79': return "l-I";
case '\u0F7A': return "e";
case '\u0F7B': return "ai";
case '\u0F7C': return "o";
case '\u0F7D': return "au";
case '\u0F7E': return "M";
case '\u0F7F': return "H";
case '\u0F80': return "-i";
case '\u0F81': return "-I";
case '\u0F82': return "~^";// DLC unsupported in Jskad
case '\u0F83': return "~"; // DLC unsupported in Jskad
case '\u0F84': return "?";
case '\u0F85': return "&";
case '\u0F86': return null;
case '\u0F87': return null;
case '\u0F88': return null;
case '\u0F89': return null;
case '\u0F8A': return null;
case '\u0F8B': return null;
case '\u0F8C': return null;
case '\u0F8D': return null;
case '\u0F8E': return null;
case '\u0F8F': return null;
case '\u0F90': return "k";
case '\u0F91': return "kh";
case '\u0F92': return "g";
case '\u0F93': return (getExtendedWylieForUnicodeChar('\u0F92')
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
+ getExtendedWylieForUnicodeChar('\u0FB7'));
case '\u0F94': return "ng";
case '\u0F95': return "c";
case '\u0F96': return "ch";
case '\u0F97': return "j";
case '\u0F98': return null;
case '\u0F99': return "ny";
case '\u0F9A': return "T";
case '\u0F9B': return "Th";
case '\u0F9C': return "D";
case '\u0F9D': return (getExtendedWylieForUnicodeChar('\u0F92')
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
+ getExtendedWylieForUnicodeChar('\u0FB7'));
case '\u0F9E': return "N";
case '\u0F9F': return "t";
case '\u0FA0': return "th";
case '\u0FA1': return "d";
case '\u0FA2': return (getExtendedWylieForUnicodeChar('\u0FA1')
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
+ getExtendedWylieForUnicodeChar('\u0FB7'));
case '\u0FA3': return "n";
case '\u0FA4': return "p";
case '\u0FA5': return "ph";
case '\u0FA6': return "b";
case '\u0FA7': return (getExtendedWylieForUnicodeChar('\u0FA6')
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
+ getExtendedWylieForUnicodeChar('\u0FB7'));
case '\u0FA8': return "m";
case '\u0FA9': return "ts";
case '\u0FAA': return "tsh";
case '\u0FAB': return "dz";
case '\u0FAC': return (getExtendedWylieForUnicodeChar('\u0FAB')
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
+ getExtendedWylieForUnicodeChar('\u0FB7'));
case '\u0FAD': return "w";
case '\u0FAE': return "zh";
case '\u0FAF': return "z";
case '\u0FB0': return "'";
case '\u0FB1': return "y";
case '\u0FB2': return "r";
case '\u0FB3': return "l";
case '\u0FB4': return "sh";
case '\u0FB5': return "Sh";
case '\u0FB6': return "s";
case '\u0FB7': return "h";
case '\u0FB8': return "a"; // DLC see note on \u0F68 ...
case '\u0FB9': return (getExtendedWylieForUnicodeChar('\u0F90')
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
+ getExtendedWylieForUnicodeChar('\u0FB5'));
case '\u0FBA': return "w";
case '\u0FBB': return "y";
case '\u0FBC': return "r";
case '\u0FBD': return null;
case '\u0FBE': return null;
case '\u0FBF': return null;
case '\u0FC0': return null;
case '\u0FC1': return null;
case '\u0FC2': return null;
case '\u0FC3': return null;
case '\u0FC4': return null;
case '\u0FC5': return null;
case '\u0FC6': return null;
case '\u0FC7': return null;
case '\u0FC8': return null;
case '\u0FC9': return null;
case '\u0FCA': return null;
case '\u0FCB': return null;
case '\u0FCC': return null;
case '\u0FCD': return null;
case '\u0FCE': return null;
case '\u0FCF': return ""; // DLC i added this to the 'EWTS document misspeaks' bug report... null I think...
default: {
// DLC handle space (EW's "_")
// This codepoint is in the range 0FD0-0FFF or is not in
// the Tibetan range at all. In either case, there is no
// corresponding Extended Wylie.
return null;
}
} // end switch
}
}