Jskad/source/org/thdl/tib/text/tshegbar/UnicodeCharToExtendedWylie.java

318 lines
13 KiB
Java
Raw Normal View History

This commit is for my benefit only; these classes are not ready for prime time, and the build system is not yet aware of them. I'm adding some classes for representing legal tsheg-bars (syllables, for the most part) in Unicode. These classes were designed bottom-up (OK, OK -- they weren't designed designed, but I had to write down everything I knew about Tibetan syntax somewhere). The classes are aware of extended wylie. I doubt the Javadocs work yet, and I'm still testing (and am not committing my testing code with these as it is not yet ready). Next on my list--fix these up to reflect my new awareness of suffix particles (like le'u'i'o) add classes to support syntactically incorrect Unicode sequences. Then add a UnicodeReader, and we've got the back end of a Tibetan Unicode shaping system (like half of MS's Uniscribe or Apple's Worldscript or FreeType Layout or Omega's OTPs). A top-down design would not have included LegalTshegBar. But now that my itch has been scratched, potential uses are lingering about. For example, it would be nice to scan some input and break it into LegalTshegBars, punctuation/marks/signs, and illegal stacks. Then we could alert the client of the illegality, its precise form, and its precise location. The real system for turning a Unicode stream into an internal representation suitable for conversion to EWTS/ACIP/XHTML/what-have-you need not be aware of Tibetan syntax. But to make the very best conversion from Unicode to, e.g., EWTS, it is necessary to konw that gaskad is better represented as gskad, but that jaskad is not the same as jskad.
2002-12-09 01:02:23 +00:00
/*
The contents of this file are subject to the THDL Open Community License
Version 1.0 (the "License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License on the THDL web site
(http://www.thdl.org/).
Software distributed under the License is distributed on an "AS IS" basis,
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
License for the specific terms governing rights and limitations under the
License.
The Initial Developer of this software is the Tibetan and Himalayan Digital
Library (THDL). Portions created by the THDL are Copyright 2001 THDL.
All Rights Reserved.
Contributor(s): ______________________________________.
*/
package org.thdl.tib.text.tshegbar;
import org.thdl.tib.text.TibetanMachineWeb;
/** This noninstantiable class allows for converting from Unicode
* characters (i.e., code points) to Extended Wylie. It cannot be
* used for long stretches of text, though, as it is unaware of
* context, which is essential to understanding a non-trivial string
* of Tibetan Unicode.
*
* <p>See the document by Nathaniel Garson and David Germano entitled
* <i>Extended Wylie Transliteration Scheme</i>. Note that there are
* a couple of issues with the November 18, 2001 revision of that
* document; these issues are in the Bugs tracker at {@see
* http://sourceforge.net/projects/thdltools}.</p>
*
* @author David Chandler */
public class UnicodeCharToExtendedWylie {
/** Returns the extended Wylie for the very simple sequence x.
* Returns null iff some (Unicode) char in s has no extended
* Wylie representation. This is unaware of context, so use it
* sparingly. */
public static StringBuffer getExtendedWylieForUnicodeString(String x) {
StringBuffer sb = new StringBuffer();
for (int i = 0; i < x.length(); i++) {
String ew = getExtendedWylieForUnicodeChar(x.charAt(i));
if (null == ew)
return null;
sb.append(ew);
}
return sb;
}
/** Returns the extended Wylie for x, or null if there is none.
* Understand that multiple Unicode code points (chars) map to
* the same Extended Wylie representation. Understand also that
* the scrap of Extended Wylie returned is only valid in certain
* contexts. For example, not all consonants take ra-btags. DLC NOW what about canonicalization? */
public static String getExtendedWylieForUnicodeChar(char x) {
switch (x) {
case '\u0F00': return "oM";
case '\u0F01': return null;
case '\u0F02': return null;
case '\u0F03': return null;
case '\u0F04': return "@";
case '\u0F05': return "#";
case '\u0F06': return "$";
case '\u0F07': return "%";
case '\u0F08': return "!";
case '\u0F09': return null;
case '\u0F0A': return null;
case '\u0F0B': return " ";
case '\u0F0C': return "*"; // DLC NOW: Jskad does not support this!
case '\u0F0D': return "/";
case '\u0F0E': return "//"; // DLC FIXME: this is kind of a hack-- the Unicode standard says the spacing for this construct is different than the spacing for "\u0F0D\u0F0D"
case '\u0F0F': return ";";
case '\u0F10': return "[";
case '\u0F11': return "|";
case '\u0F12': return "]";
case '\u0F13': return "`";
case '\u0F14': return ":";
case '\u0F15': return null;
case '\u0F16': return null;
case '\u0F17': return null;
case '\u0F18': return null;
case '\u0F19': return null;
case '\u0F1A': return null;
case '\u0F1B': return null;
case '\u0F1C': return null;
case '\u0F1D': return null;
case '\u0F1E': return null;
case '\u0F1F': return null;
case '\u0F20': return "0";
case '\u0F21': return "1";
case '\u0F22': return "2";
case '\u0F23': return "3";
case '\u0F24': return "4";
case '\u0F25': return "5";
case '\u0F26': return "6";
case '\u0F27': return "7";
case '\u0F28': return "8";
case '\u0F29': return "9";
case '\u0F2A': return null;
case '\u0F2B': return null;
case '\u0F2C': return null;
case '\u0F2D': return null;
case '\u0F2E': return null;
case '\u0F2F': return null;
case '\u0F30': return null;
case '\u0F31': return null;
case '\u0F32': return null;
case '\u0F33': return null;
case '\u0F34': return "=";
case '\u0F35': return null;
case '\u0F36': return null;
case '\u0F37': return null;
case '\u0F38': return null;
case '\u0F39': return null;
case '\u0F3A': return "<";
case '\u0F3B': return ">";
case '\u0F3C': return "(";
case '\u0F3D': return ")";
case '\u0F3E': return "{";
case '\u0F3F': return "}";
case '\u0F40': return "k";
case '\u0F41': return "kh";
case '\u0F42': return "g";
case '\u0F43': return (getExtendedWylieForUnicodeChar('\u0F42')
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
+ getExtendedWylieForUnicodeChar('\u0FB7'));
case '\u0F44': return "ng";
case '\u0F45': return "c";
case '\u0F46': return "ch";
case '\u0F47': return "j";
case '\u0F48': return null;
case '\u0F49': return "ny";
case '\u0F4A': return "T";
case '\u0F4B': return "Th";
case '\u0F4C': return "D";
case '\u0F4D': return (getExtendedWylieForUnicodeChar('\u0F4C')
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
+ getExtendedWylieForUnicodeChar('\u0FB7'));
case '\u0F4E': return "N";
case '\u0F4F': return "t";
case '\u0F50': return "th";
case '\u0F51': return "d";
case '\u0F52': return (getExtendedWylieForUnicodeChar('\u0F51')
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
+ getExtendedWylieForUnicodeChar('\u0FB7'));
case '\u0F53': return "n";
case '\u0F54': return "p";
case '\u0F55': return "ph";
case '\u0F56': return "b";
case '\u0F57': return (getExtendedWylieForUnicodeChar('\u0F56')
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
+ getExtendedWylieForUnicodeChar('\u0FB7'));
case '\u0F58': return "m";
case '\u0F59': return "ts";
case '\u0F5A': return "tsh";
case '\u0F5B': return "dz";
case '\u0F5C': return (getExtendedWylieForUnicodeChar('\u0F5B')
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
+ getExtendedWylieForUnicodeChar('\u0FB7'));
case '\u0F5D': return "w";
case '\u0F5E': return "zh";
case '\u0F5F': return "z";
case '\u0F60': return "'";
case '\u0F61': return "y";
case '\u0F62': return "r";
case '\u0F63': return "l";
case '\u0F64': return "sh";
case '\u0F65': return "Sh";
case '\u0F66': return "s";
case '\u0F67': return "h";
case '\u0F68': return "a"; // DLC: maybe the empty string is OK here because typing just 'i' into Jskad causes root letter \u0F68 to appear... yuck...
case '\u0F69': return (getExtendedWylieForUnicodeChar('\u0F40')
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
+ getExtendedWylieForUnicodeChar('\u0FB5'));
case '\u0F6A': return "r";
case '\u0F6B': return null;
case '\u0F6C': return null;
case '\u0F6D': return null;
case '\u0F6E': return null;
case '\u0F6F': return null;
case '\u0F70': return null;
case '\u0F71': return "A";
case '\u0F72': return "i";
case '\u0F73': return "I";
case '\u0F74': return "u";
case '\u0F75': return "U";
case '\u0F76': return "r-i"; // DLC Ri or r-i? I put in a bug report.
case '\u0F77': return "r-I"; // DLC or RI?
case '\u0F78': return "l-i";
case '\u0F79': return "l-I";
case '\u0F7A': return "e";
case '\u0F7B': return "ai";
case '\u0F7C': return "o";
case '\u0F7D': return "au";
case '\u0F7E': return "M";
case '\u0F7F': return "H";
case '\u0F80': return "-i";
case '\u0F81': return "-I";
case '\u0F82': return "~^";// DLC unsupported in Jskad
case '\u0F83': return "~"; // DLC unsupported in Jskad
case '\u0F84': return "?";
case '\u0F85': return "&";
case '\u0F86': return null;
case '\u0F87': return null;
case '\u0F88': return null;
case '\u0F89': return null;
case '\u0F8A': return null;
case '\u0F8B': return null;
case '\u0F8C': return null;
case '\u0F8D': return null;
case '\u0F8E': return null;
case '\u0F8F': return null;
case '\u0F90': return "k";
case '\u0F91': return "kh";
case '\u0F92': return "g";
case '\u0F93': return (getExtendedWylieForUnicodeChar('\u0F92')
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
+ getExtendedWylieForUnicodeChar('\u0FB7'));
case '\u0F94': return "ng";
case '\u0F95': return "c";
case '\u0F96': return "ch";
case '\u0F97': return "j";
case '\u0F98': return null;
case '\u0F99': return "ny";
case '\u0F9A': return "T";
case '\u0F9B': return "Th";
case '\u0F9C': return "D";
case '\u0F9D': return (getExtendedWylieForUnicodeChar('\u0F92')
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
+ getExtendedWylieForUnicodeChar('\u0FB7'));
case '\u0F9E': return "N";
case '\u0F9F': return "t";
case '\u0FA0': return "th";
case '\u0FA1': return "d";
case '\u0FA2': return (getExtendedWylieForUnicodeChar('\u0FA1')
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
+ getExtendedWylieForUnicodeChar('\u0FB7'));
case '\u0FA3': return "n";
case '\u0FA4': return "p";
case '\u0FA5': return "ph";
case '\u0FA6': return "b";
case '\u0FA7': return (getExtendedWylieForUnicodeChar('\u0FA6')
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
+ getExtendedWylieForUnicodeChar('\u0FB7'));
case '\u0FA8': return "m";
case '\u0FA9': return "ts";
case '\u0FAA': return "tsh";
case '\u0FAB': return "dz";
case '\u0FAC': return (getExtendedWylieForUnicodeChar('\u0FAB')
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
+ getExtendedWylieForUnicodeChar('\u0FB7'));
case '\u0FAD': return "w";
case '\u0FAE': return "zh";
case '\u0FAF': return "z";
case '\u0FB0': return "'";
case '\u0FB1': return "y";
case '\u0FB2': return "r";
case '\u0FB3': return "l";
case '\u0FB4': return "sh";
case '\u0FB5': return "Sh";
case '\u0FB6': return "s";
case '\u0FB7': return "h";
case '\u0FB8': return "a"; // DLC see note on \u0F68 ...
case '\u0FB9': return (getExtendedWylieForUnicodeChar('\u0F90')
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
+ getExtendedWylieForUnicodeChar('\u0FB5'));
case '\u0FBA': return "w";
case '\u0FBB': return "y";
case '\u0FBC': return "r";
case '\u0FBD': return null;
case '\u0FBE': return null;
case '\u0FBF': return null;
case '\u0FC0': return null;
case '\u0FC1': return null;
case '\u0FC2': return null;
case '\u0FC3': return null;
case '\u0FC4': return null;
case '\u0FC5': return null;
case '\u0FC6': return null;
case '\u0FC7': return null;
case '\u0FC8': return null;
case '\u0FC9': return null;
case '\u0FCA': return null;
case '\u0FCB': return null;
case '\u0FCC': return null;
case '\u0FCD': return null;
case '\u0FCE': return null;
case '\u0FCF': return ""; // DLC i added this to the 'EWTS document misspeaks' bug report... null I think...
default: {
// DLC handle space (EW's "_")
// This character is in the range 0FD0-0FFF or is not in
// the Tibetan range at all. In either case, there is no
// corresponding Extended Wylie.
return null;
}
} // end switch
}
}