Jskad/source/org/thdl/tib/text/TibTextUtils.java
2005-07-07 02:54:36 +00:00

1934 lines
89 KiB
Java

/*
The contents of this file are subject to the THDL Open Community License
Version 1.0 (the "License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License on the THDL web site
(http://www.thdl.org/).
Software distributed under the License is distributed on an "AS IS" basis,
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
License for the specific terms governing rights and limitations under the
License.
The Initial Developer of this software is the Tibetan and Himalayan Digital
Library (THDL). Portions created by the THDL are Copyright 2001-2004 THDL.
All Rights Reserved.
Contributor(s): ______________________________________.
*/
package org.thdl.tib.text;
import java.util.*;
import javax.swing.*;
import javax.swing.text.*;
import javax.swing.text.rtf.RTFEditorKit;
import java.io.*;
import org.thdl.util.ThdlDebug;
import org.thdl.tib.text.ttt.TTraits;
import org.thdl.tib.text.ttt.ACIPTraits;
import org.thdl.tib.text.ttt.EWTSTraits;
import org.thdl.tib.text.ttt.TConverter;
import org.thdl.tib.text.tshegbar.LegalTshegBar;
import org.thdl.tib.text.tshegbar.UnicodeConstants;
import org.thdl.tib.text.tshegbar.UnicodeUtils;
/**
* Provides methods for converting back and forth between Extended
* Wylie/ACIP and Tibetan represented in TibetanMachineWeb glyphs.
* This class is not instantiable.
*
* <p> The class provides a variety of static methods for converting
* back and forth between Extended Wylie or ACIP and TibetanMachineWeb.
* The Wylie/ACIP can be accessed as a String, while the
* TibetanMachineWeb can be exported as Rich Text Format.
*
* @author Edward Garrett, Tibetan and Himalayan Digital Library */
public class TibTextUtils implements THDLWylieConstants {
/** Compile-time switch for diagnostic output. Change to true to see
various things on System.out and System.err. */
private static final boolean debug = false;
/** Do not use this contructor. */
private TibTextUtils() { super(); }
/**
 * Converts a list of glyphs into an array of {@link DuffData DuffData},
 * one element per run of consecutive glyphs that share a font.
 * Most consumers - for example code emitting TibetanMachineWeb as
 * HTML - only need to know what text to output and when to switch
 * fonts, not the font of every individual glyph.
 * @param glyphs the list of TibetanMachineWeb glyphs ({@link DuffCode
 * DuffCode} elements) you want to convert
 * @return an array of DuffData corresponding to this list of glyphs,
 * in order, or null if the list is empty
 */
public static DuffData[] convertGlyphs(List glyphs) {
    if (glyphs.isEmpty())
        return null;

    List runs = new ArrayList();
    Iterator remaining = glyphs.iterator();

    // The first glyph opens the first run.
    DuffCode glyph = (DuffCode)remaining.next();
    int runFont = glyph.getFontNum();
    StringBuffer runText = new StringBuffer();
    runText.append(glyph.getCharacter());

    while (remaining.hasNext()) {
        glyph = (DuffCode)remaining.next();
        int font = glyph.getFontNum();
        if (font != runFont) {
            // Font change: close the current run and start a new one.
            runs.add(new DuffData(runText.toString(), runFont));
            runText = new StringBuffer();
            runFont = font;
        }
        runText.append(glyph.getCharacter());
    }

    // Close the final run.
    runs.add(new DuffData(runText.toString(), runFont));
    return (DuffData[])runs.toArray(new DuffData[0]);
}
/**
 * Figures out how to arrange a list of characters into glyphs. For
 * example, if the user types 'bsgr' using the Extended Wylie keyboard,
 * this method figures out that this should be represented as a 'b'
 * glyph followed by a 's-g-r' glyph. Candidate stacks are looked up
 * by joining the characters with "-" (Tibetan) or "+" (Sanskrit); the
 * Tibetan spelling is tried before the Sanskrit one. If you know that
 * the characters do not contain Sanskrit stacks, or do not contain
 * Tibetan stacks, then you can specify this to skip one lookup.
 * @param chars the list of Tibetan characters (Strings) you want to
 * find glyphs for
 * @param areStacksOnRight whether stacking should try to maximize from
 * right to left (true) or from left to right (false). In the Extended
 * Wylie keyboard, you try to stack from right to left. Thus, the
 * character sequence r-g-r would be stacked as r followed by gr,
 * rather than rg followed by r. In the Sambhota and TCC keyboards, the
 * stack direction is reversed.
 * @param definitelyTibetan should be true if the characters are known
 * to be Tibetan and not Sanskrit
 * @param definitelySanskrit should be true if the characters are known
 * to be Sanskrit and not Tibetan
 * @return the glyphs for the longest stack found plus one glyph for
 * each character left outside that stack, in input order; or null if
 * no stack could be found (including when chars is empty)
 */
public static List getGlyphs(List chars, boolean areStacksOnRight, boolean definitelyTibetan, boolean definitelySanskrit) {
    // With no hint either way, both kinds of stack are tried.
    final boolean noHint = !definitelyTibetan && !definitelySanskrit;
    final boolean tryTibetan = noHint || definitelyTibetan;
    final boolean trySanskrit = noHint || definitelySanskrit;

    final int total = chars.size();
    final List result = new ArrayList();

    if (areStacksOnRight) {
        // Candidate stack is chars[first..total-1]; longest candidate first.
        for (int first = 0; first < total; first++) {
            StringBuffer tibetan = new StringBuffer();
            StringBuffer sanskrit = new StringBuffer();
            for (int k = first; k < total; k++) {
                String piece = (String)chars.get(k);
                boolean moreToCome = (k != total - 1);
                if (tryTibetan) {
                    tibetan.append(piece);
                    if (moreToCome)
                        tibetan.append("-");
                }
                if (trySanskrit) {
                    sanskrit.append(piece);
                    if (moreToCome)
                        sanskrit.append("+");
                }
            }

            String stack = null;
            if (tryTibetan && TibetanMachineWeb.hasGlyph(tibetan.toString()))
                stack = tibetan.toString();
            else if (trySanskrit && TibetanMachineWeb.hasGlyph(sanskrit.toString()))
                stack = sanskrit.toString();

            if (stack != null) {
                // The characters left of the stack become one glyph each.
                //should really check here to make sure glyphs exist FIXME
                for (int k = 0; k < first; k++)
                    result.add(TibetanMachineWeb.getGlyph((String)chars.get(k)));
                result.add(TibetanMachineWeb.getGlyph(stack));
                return result;
            }
        }
    }
    else {
        // Candidate stack is chars[0..last]; longest candidate first.
        for (int last = total - 1; last > -1; last--) {
            StringBuffer tibetan = new StringBuffer();
            StringBuffer sanskrit = new StringBuffer();
            for (int k = 0; k <= last; k++) {
                String piece = (String)chars.get(k);
                boolean moreToCome = (k != last);
                if (tryTibetan) {
                    tibetan.append(piece);
                    if (moreToCome)
                        tibetan.append("-");
                }
                if (trySanskrit) {
                    sanskrit.append(piece);
                    if (moreToCome)
                        sanskrit.append("+");
                }
            }

            String stack = null;
            if (tryTibetan && TibetanMachineWeb.hasGlyph(tibetan.toString()))
                stack = tibetan.toString();
            else if (trySanskrit && TibetanMachineWeb.hasGlyph(sanskrit.toString()))
                stack = sanskrit.toString();

            if (stack != null) {
                result.add(TibetanMachineWeb.getGlyph(stack));
                // The characters right of the stack become one glyph each.
                for (int k = last + 1; k < total; k++)
                    result.add(TibetanMachineWeb.getGlyph((String)chars.get(k)));
                return result;
            }
        }
    }
    return null;
}
/**
 * Finds the first meaningful element to occur within a string of
 * Extended Wylie. This could be a character, a vowel, punctuation, or
 * formatting. For example, passed the string 'tshapo', this method
 * will return 'tsh'.
 *
 * <p>A leading control character (code below 32), the disambiguating
 * key, or the Sanskrit stacking key is returned on its own. Otherwise
 * the scan finds the shortest prefix that is Wylie punctuation, a
 * Wylie vowel, or a Wylie character, and then lengthens it one
 * character at a time for as long as the prefix stays valid.
 * @param wylie the String of wylie you want to scan
 * @return the next meaningful subpart of this string, or null if
 * no meaningful subpart can be found (for example 'x' has no equivalent
 * in Extended Wylie, and a null or empty string has no subparts at all)
 */
public static String getNext(String wylie) {
    // Nothing to scan; previously an empty string made charAt(0) throw.
    if (wylie == null || wylie.length() == 0)
        return null;

    char first = wylie.charAt(0);
    if (first < 32) //formatting (control) characters are returned as-is
        return String.valueOf(first);
    if (first == WYLIE_DISAMBIGUATING_KEY)
        return String.valueOf(WYLIE_DISAMBIGUATING_KEY);
    if (first == WYLIE_SANSKRIT_STACKING_KEY)
        return String.valueOf(WYLIE_SANSKRIT_STACKING_KEY);

    final int length = wylie.length();
    int validEnd = 0; // length of the valid prefix found so far; 0 means none yet
    for (int end = 1; end <= length; end++) {
        String candidate = wylie.substring(0, end);
        boolean valid = TibetanMachineWeb.isWyliePunc(candidate)
            || TibetanMachineWeb.isWylieVowel(candidate)
            || TibetanMachineWeb.isWylieChar(candidate);
        if (valid)
            validEnd = end;
        else if (validEnd > 0)
            break; // the valid prefix cannot be extended any further
    }
    return (validEnd == 0) ? null : wylie.substring(0, validEnd);
}
/** An array containing one boolean value. Pass this to
TibetanMachineWeb.getWylieForGlyph(..) if you don't care if a
certain glyph has corresponding Wylie or not. The array is shared
and mutable, so presumably the callee may overwrite element 0;
never rely on the value stored in it (NOTE(review): confirm against
getWylieForGlyph). */
public static final boolean[] weDoNotCareIfThereIsCorrespondingWylieOrNot
= new boolean[] { false };
/**
 * Converts a string of transliteration into TibetanMachineWeb and
 * inserts that into tdoc at offset loc.
 * @param EWTSNotACIP true if you want THDL Extended Wylie, false if
 * you want ACIP
 * @param translit the transliteration you want to convert
 * @param tdoc the document in which to insert the TMW
 * @param loc the offset inside the document at which to insert the TMW
 * @param withWarnings true if and only if you want warnings to appear
 * in the output, such as "this could be a mistranscription of blah..."
 * @throws InvalidTransliterationException if the transliteration is
 * deemed invalid, i.e. if it does not conform to the transcription
 * rules (those in the official document and the subtler rules pieced
 * together by David Chandler through study and private correspondence
 * with Robert Chilton (for ACIP), Than Garson, David Germano, Chris
 * Fynn, and others)
 * @return the number of characters inserted into tdoc */
public static int insertTibetanMachineWebForTranslit(boolean EWTSNotACIP,
                                                     String translit,
                                                     TibetanDocument tdoc,
                                                     int loc,
                                                     boolean withWarnings)
    throws InvalidTransliterationException
{
    StringBuffer errors = new StringBuffer();
    String warningLevel = withWarnings ? "All" : "None";
    TTraits traits = (EWTSNotACIP
                      ? (TTraits)EWTSTraits.instance()
                      : (TTraits)ACIPTraits.instance());
    ArrayList al = traits.scanner().scan(translit, errors, 500,
                                         false, warningLevel);

    // Scanner-reported errors take precedence over a bare null result.
    if (errors.length() > 0)
        throw new InvalidTransliterationException(errors.toString());
    if (null == al)
        throw new InvalidTransliterationException("Fatal error converting "
                                                  + traits.shortTranslitName()
                                                  + " to TMW.");

    boolean colors = withWarnings;
    // Compare by value; the old reference comparison only worked
    // because both operands were the same interned literal.
    boolean putWarningsInOutput = !"None".equals(warningLevel);
    try {
        // One-element array so the converter can report the final offset.
        int[] tloc = new int[] { loc };
        TConverter.convertToTMW(traits, al, tdoc, null, null,
                                null, putWarningsInOutput, warningLevel,
                                false, colors, tloc);
        return tloc[0] - loc;
    } catch (IOException e) {
        // Keep the original exception as the cause for diagnosis.
        throw new Error("Can't happen: " + e, e);
    }
}
/**
* Converts a string of Extended Wylie into {@link DuffData DuffData}.
* The input is consumed one token at a time via {@link #getNext
* getNext}. Consonants are collected in a pending list and turned
* into glyphs with {@link #getGlyphs getGlyphs} whenever a vowel,
* punctuation, formatting character, the disambiguating key, or an
* unrecognized token ends the run.
* @param wylie the Wylie you want to convert
* @return an array of TibetanMachineWeb data
* corresponding to the Wylie text (null when no glyphs were
* produced, because that is what convertGlyphs returns for an empty
* list)
* @throws InvalidWylieException if the Wylie is deemed invalid,
* i.e. if it does not conform to the Extended Wylie standard
* @deprecated by insertTibetanMachineWebForTranslit
*/
public static DuffData[] getTibetanMachineWebForEWTS(String wylie) throws InvalidWylieException {
ThdlDebug.noteIffyCode(); // deprecated method!
// TODO(dchandler): remove it and
// hopefully a ton of code that
// only it uses.
// Wylie characters seen since the last flush, not yet converted to glyphs.
List chars = new ArrayList();
DuffCode dc;
// Offset in wylie of the next token to scan.
int start = 0;
// True while the pending chars form a '+'-joined (Sanskrit) stack.
boolean isSanskrit = false;
// True right after a stacking key, i.e. the next char extends the stack.
boolean wasLastSanskritStackingKey = false;
LinkedList glyphs = new LinkedList();
// NOTE(review): getGlyphs returns null when it finds no stack, and
// addAll(null) throws NullPointerException; none of the calls below
// guards against that.
while (start < wylie.length()) {
String next = getNext(wylie.substring(start));
if (next == null) {
// Unrecognized input. If chars are pending, flush them first;
// start is not advanced (next is null), so the same position is
// scanned again with chars empty and the exception below is thrown.
if (!chars.isEmpty()) {
glyphs.addAll(getGlyphs(chars, true, !isSanskrit, isSanskrit));
chars.clear();
isSanskrit = false;
}
else { //could not convert - throw exception
// FIXME: we're printing to stdout!
if (start+5 <= wylie.length()) {
System.out.println("Bad wylie: "
+ wylie.substring(start,
start + 5));
} else {
System.out.println("Bad wylie: "+wylie.substring(start));
}
throw new InvalidWylieException(wylie, start);
}
}
else if (TibetanMachineWeb.isWyliePunc(next)) {
// Punctuation ends the current run of characters.
if (!chars.isEmpty())
glyphs.addAll(getGlyphs(chars, true, !isSanskrit, isSanskrit));
chars.clear();
if (next.equals(String.valueOf(BINDU))) {
// The bindu attaches to the previous glyph, if there is one.
if (glyphs.isEmpty())
dc = null;
else
dc = (DuffCode)glyphs.removeLast(); //LinkedList implementation
getBindu(glyphs, dc);
}
else {
dc = TibetanMachineWeb.getGlyph(next);
glyphs.add(dc);
}
isSanskrit = false;
}
else if (TibetanMachineWeb.isWylieVowel(next)) {
if (!chars.isEmpty()) {
// The vowel attaches to the last glyph of the pending characters.
glyphs.addAll(getGlyphs(chars, true, !isSanskrit, isSanskrit));
dc = (DuffCode)glyphs.removeLast(); //LinkedList implementation
getVowel(glyphs, dc, next);
chars.clear();
}
else { //if previous is punctuation or null, then achen plus vowel - otherwise, previous could be vowel
int size = glyphs.size();
vowel_block: {
if (size > 1) {
dc = (DuffCode)glyphs.get(glyphs.size()-1);
if (!TibetanMachineWeb.isWyliePunc(TibetanMachineWeb.getWylieForGlyph(dc, weDoNotCareIfThereIsCorrespondingWylieOrNot))) {
// Last glyph is not punctuation: pass the last two glyphs as
// context for the new vowel.
DuffCode dc_2 = (DuffCode)glyphs.removeLast();
DuffCode dc_1 = (DuffCode)glyphs.removeLast();
getVowel(glyphs, dc_1, dc_2, next);
break vowel_block;
}
}
// Otherwise the vowel is carried by an achen glyph.
DuffCode[] dc_array = (DuffCode[])TibetanMachineWeb.getTibHash().get(ACHEN);
dc = dc_array[TibetanMachineWeb.TMW];
getVowel(glyphs, dc, next);
}
chars.clear();
}
isSanskrit = false;
}
else if (TibetanMachineWeb.isWylieChar(next)) {
if (!isSanskrit) //add char to list - it is not sanskrit
chars.add(next);
else if (wasLastSanskritStackingKey) { //add char to list - it is still part of sanskrit stack
chars.add(next);
wasLastSanskritStackingKey = false;
}
else { //char is no longer part of sanskrit stack, therefore compute and add previous stack
glyphs.addAll(getGlyphs(chars, true, !isSanskrit, isSanskrit));
chars.clear();
chars.add(next);
isSanskrit = false;
wasLastSanskritStackingKey = false;
}
}
else if (next.equals(String.valueOf(WYLIE_DISAMBIGUATING_KEY))) {
// The disambiguating key only ends the current run; it adds no glyph.
if (!chars.isEmpty())
glyphs.addAll(getGlyphs(chars, true, !isSanskrit, isSanskrit));
chars.clear();
isSanskrit = false;
}
else if (next.equals(String.valueOf(WYLIE_SANSKRIT_STACKING_KEY))) {
// A stacking key seen while already in a Sanskrit stack changes nothing here.
if (!isSanskrit) { //begin sanskrit stack
switch (chars.size()) {
case 0:
break; //'+' is not "pre-stacking" key
case 1:
isSanskrit = true;
wasLastSanskritStackingKey = true;
break;
default:
// Only the last pending char joins the new stack; the chars
// before it are converted to glyphs now.
String top_char = (String)chars.get(chars.size()-1);
chars.remove(chars.size()-1);
// DLC PERFORMANCE FIXME: make glyphs a parameter
glyphs.addAll(getGlyphs(chars, true, !isSanskrit, isSanskrit));
chars.clear();
chars.add(top_char);
isSanskrit = true;
wasLastSanskritStackingKey = true;
break;
}
}
}
else if (TibetanMachineWeb.isFormatting(next.charAt(0))) {
// Formatting characters are emitted as a glyph with font number 1.
if (!chars.isEmpty())
glyphs.addAll(getGlyphs(chars, true, !isSanskrit, isSanskrit));
dc = new DuffCode(1,next.charAt(0));
glyphs.add(dc);
chars.clear();
isSanskrit = false;
}
// Advance past the token just handled (nothing to skip when it was null).
if (next != null)
start += next.length();
}
// Flush whatever characters remain at the end of the input.
if (!chars.isEmpty()) {
glyphs.addAll(getGlyphs(chars, true, !isSanskrit, isSanskrit));
chars.clear();
}
DuffData[] dd = convertGlyphs(glyphs);
return dd;
}
/**
 * Appends the bindu sequence for a given context to a list. In the
 * TibetanMachineWeb fonts, bindu (anusvara) is realized differently
 * depending on which vowel it attaches to. Although the default bindu
 * glyph is affixed to consonants and subscript vowels, for superscript
 * vowels (i, e, o, etc), there is a single glyph which merges the
 * bindu and that vowel together. This method appends either the
 * original glyph followed by the default bindu glyph, or the single
 * composite vowel+bindu glyph found in the bindu map. Note that there
 * is only one glyph in the context. This means that bindus will not
 * affix properly if superscript vowels are allowed to directly precede
 * subscript vowels (e.g. pou).
 * @param list a List of DuffCode glyphs to which will be appended the
 * original dc (if non-null) as well as a bindu, or the one glyph that
 * represents both
 * @param dc the DuffCode of the glyph you want to attach a bindu to,
 * or null */
public static void getBindu(List list, DuffCode dc) {
    // A glyph with a merged glyph+bindu form is replaced by that one glyph.
    if (null != dc && TibetanMachineWeb.getBinduMap().containsKey(dc)) {
        list.add((DuffCode)TibetanMachineWeb.getBinduMap().get(dc));
        return;
    }
    // Otherwise: the glyph itself (when there is one), then the default bindu.
    if (null != dc)
        list.add(dc);
    list.add(TibetanMachineWeb.getGlyph(String.valueOf(BINDU)));
}
/**
 * Appends to l the vowel sequence for a given vowel in a one-glyph
 * context. The context glyph is appended too, unless
 * context_added[0] says that has already been done. This is the
 * five-argument getVowel with no glyph given before context.
 * @param l the list of glyphs to append to
 * @param context the glyph preceding the vowel you want to affix
 * @param vowel the vowel you want to affix, in Wylie
 * @param context_added an array of one boolean, an input/output
 * parameter that, if true, means that only the vowel will be added to
 * l, not the context, and if false, means that the context and the
 * vowel will be added and that context_added[0] will be updated to be
 * true
 * @throws IllegalArgumentException if the given combination is not
 * supported */
public static void getVowel(List l, DuffCode context, String vowel, boolean[] context_added) {
    final DuffCode noEarlierContext = null;
    getVowel(l, noEarlierContext, context, vowel, context_added);
}
/** Wrapper for the common one-glyph case: appends both context
    and the glyph(s) for vowel to l. */
public static void getVowel(List l, DuffCode context, String vowel) {
    // false: the context has not been added to l yet, so add it.
    boolean[] contextNotYetAdded = new boolean[] { false };
    getVowel(l, null, context, vowel, contextNotYetAdded);
}
/** Wrapper for the two-glyph case: appends context_1, context_2,
    and the glyph(s) for vowel to l. */
public static void getVowel(List l, DuffCode context_1, DuffCode context_2, String vowel) {
    // false: the context has not been added to l yet, so add it.
    boolean[] contextNotYetAdded = new boolean[] { false };
    getVowel(l, context_1, context_2, vowel, contextNotYetAdded);
}
/**
 * Gets the vowel sequence for a given vowel in a given context and
 * appends it to l. Given a context, this method affixes a vowel and
 * appends the context (iff context_added[0] is false) plus the vowel
 * to l. Since the choice of vowel glyph depends on the consonant to
 * which it is attached, generally it is enough to provide just the
 * immediately preceding context. However, in some cases, double vowels
 * are allowed - for example 'buo'. To find the correct glyph for 'o',
 * we need 'b' in this case, not 'u'. Note also that some Extended
 * Wylie vowels correspond to multiple glyphs in TibetanMachineWeb. For
 * example, the vowel I consists of both an achung and a reverse
 * gigu. All required glyphs are appended to l.
 *
 * <p>When the vowel glyph cannot be found, the context may already
 * have been appended to l by the time the exception is thrown.
 * @param l the list of glyphs to append to
 * @param context_1 the glyph occurring two glyphs before the vowel you
 * want to affix, or null
 * @param context_2 the glyph immediately before the vowel you want to
 * affix
 * @param vowel the vowel you want to affix, in Wylie
 * @param context_added an array of one boolean, an input/output
 * parameter that, if true, means that only the vowel will be added to
 * l, not the context, and if false, means that the context and the
 * vowel will be added and that context_added[0] will be updated to be
 * true
 * @throws IllegalArgumentException if the given combination is not
 * supported */
public static void getVowel(List l, DuffCode context_1, DuffCode context_2,
                            String vowel, boolean context_added[])
    throws IllegalArgumentException
{
    // Evaluated before the top-vowel test, as before, so that a null
    // vowel fails in the same place.
    final boolean isPlainA = vowel.equals(WYLIE_aVOWEL);

    // Nothing can be stacked on a context that already is a top vowel.
    if (TibetanMachineWeb.isTopVowel(context_2))
        throw new IllegalArgumentException("dropping vowels is bad1:" + vowel);

    //this vowel doesn't correspond to a glyph -
    //so you just return the original context
    if (isPlainA) {
        addVowelContextOnce(l, context_1, context_2, context_added);
        return;
    }

    //first, the three easiest cases: ai, au, and <i
    //these vowels have one invariant form - therefore,
    //the context is just added along with that form
    String invariantKey = null;
    if (vowel.equals(ai_VOWEL))
        invariantKey = ai_VOWEL;
    else if (vowel.equals(au_VOWEL))
        invariantKey = au_VOWEL;
    else if (vowel.equals(reverse_i_VOWEL))
        invariantKey = reverse_i_VOWEL;
    if (invariantKey != null) {
        addVowelContextOnce(l, context_1, context_2, context_added);
        DuffCode[] dc_v = (DuffCode[])TibetanMachineWeb.getTibHash().get(invariantKey);
        l.add(dc_v[TibetanMachineWeb.TMW]);
        return;
    }

    //second, the vowels i, e, and o
    //these vowels have many different glyphs each,
    //whose correct selection depends on the
    //preceding context; context_2 is consulted first and
    //context_1 is the fallback
    if (vowel.equals(i_VOWEL)) {
        DuffCode dc_v = TibetanMachineWeb.getVowel(
            TibetanMachineWeb.getHashKeyForGlyph(context_2), TibetanMachineWeb.VOWEL_i);
        if (null == dc_v && null != context_1)
            dc_v = TibetanMachineWeb.getVowel(
                TibetanMachineWeb.getHashKeyForGlyph(context_1), TibetanMachineWeb.VOWEL_i);
        addVowelContextOnce(l, context_1, context_2, context_added);
        addVowelGlyphOrFail(l, dc_v, "dropping vowels is bad3:", vowel);
        return;
    }
    // DLC perfect TMW->Wylie wouldn't produce o'i for an input file containing merely TMW9.61 -- it would produce \u0f7c,\u0f60,\u0f72 -- round-trip shows why.
    if (vowel.equals(e_VOWEL)) {
        DuffCode dc_v = TibetanMachineWeb.getVowel(
            TibetanMachineWeb.getHashKeyForGlyph(context_2), TibetanMachineWeb.VOWEL_e);
        if (null == dc_v && null != context_1)
            dc_v = TibetanMachineWeb.getVowel(
                TibetanMachineWeb.getHashKeyForGlyph(context_1), TibetanMachineWeb.VOWEL_e);
        addVowelContextOnce(l, context_1, context_2, context_added);
        addVowelGlyphOrFail(l, dc_v, "dropping vowels is bad4:", vowel);
        return;
    }
    if (vowel.equals(o_VOWEL)) {
        DuffCode dc_v = TibetanMachineWeb.getVowel(
            TibetanMachineWeb.getHashKeyForGlyph(context_2), TibetanMachineWeb.VOWEL_o);
        if (null == dc_v && null != context_1)
            dc_v = TibetanMachineWeb.getVowel(
                TibetanMachineWeb.getHashKeyForGlyph(context_1), TibetanMachineWeb.VOWEL_o);
        addVowelContextOnce(l, context_1, context_2, context_added);
        addVowelGlyphOrFail(l, dc_v, "dropping vowels is bad5:", vowel);
        return;
    }

    //the remaining vowels (u, A, U, I, <I) are grouped together
    //because they all can cause the preceding context to change: when
    //a half-height form of context_2 exists, it is added in place of
    //context_2
    final String hashKey_context = TibetanMachineWeb.getHashKeyForGlyph(context_2);

    if (vowel.equals(u_VOWEL)) {
        DuffCode halfHeight = TibetanMachineWeb.getHalfHeightGlyph(hashKey_context);
        DuffCode dc_v = TibetanMachineWeb.getVowel(hashKey_context, TibetanMachineWeb.VOWEL_u);
        addVowelContextOnce(l, context_1,
                            (null == halfHeight) ? context_2 : halfHeight,
                            context_added);
        addVowelGlyphOrFail(l, dc_v, "dropping vowels is bad6:", vowel);
        return;
    }
    if (vowel.equals(A_VOWEL)) {
        DuffCode halfHeight = TibetanMachineWeb.getHalfHeightGlyph(hashKey_context);
        DuffCode dc_v = TibetanMachineWeb.getVowel(hashKey_context, TibetanMachineWeb.VOWEL_A);
        addVowelContextOnce(l, context_1,
                            (null == halfHeight) ? context_2 : halfHeight,
                            context_added);
        addVowelGlyphOrFail(l, dc_v, "dropping vowels is bad7:", vowel);
        return;
    }
    if (vowel.equals(U_VOWEL)) {
        DuffCode halfHeight = TibetanMachineWeb.getHalfHeightGlyph(hashKey_context);
        DuffCode dc_v = TibetanMachineWeb.getVowel(hashKey_context, TibetanMachineWeb.VOWEL_U);
        addVowelContextOnce(l, context_1,
                            (null == halfHeight) ? context_2 : halfHeight,
                            context_added);
        addVowelGlyphOrFail(l, dc_v, "dropping vowels is bad8:", vowel);
        return;
    }

    //finally, the vowels I and <I
    //these vowels are unique in that they both
    //require a change from the previous character,
    //and consist of two glyphs themselves
    if (vowel.equals(I_VOWEL)) {
        DuffCode halfHeight = TibetanMachineWeb.getHalfHeightGlyph(hashKey_context);
        DuffCode dc_v_sub = TibetanMachineWeb.getVowel(hashKey_context, TibetanMachineWeb.VOWEL_A);
        DuffCode dc_v_sup = TibetanMachineWeb.getVowel(hashKey_context, TibetanMachineWeb.VOWEL_i);
        addVowelContextOnce(l, context_1,
                            (null == halfHeight) ? context_2 : halfHeight,
                            context_added);
        if (null == dc_v_sub || null == dc_v_sup)
            throw new IllegalArgumentException("dropping vowels is bad9:" + vowel);
        l.add(dc_v_sub);
        l.add(dc_v_sup);
        return;
    }
    if (vowel.equals(reverse_I_VOWEL)) {
        DuffCode halfHeight = TibetanMachineWeb.getHalfHeightGlyph(hashKey_context);
        DuffCode dc_v_sub = TibetanMachineWeb.getVowel(hashKey_context, TibetanMachineWeb.VOWEL_A);
        DuffCode[] tv_array = (DuffCode[])TibetanMachineWeb.getTibHash().get(reverse_i_VOWEL);
        DuffCode dc_v_sup = tv_array[TibetanMachineWeb.TMW];
        addVowelContextOnce(l, context_1,
                            (null == halfHeight) ? context_2 : halfHeight,
                            context_added);
        if (null == dc_v_sub || null == dc_v_sup)
            throw new IllegalArgumentException("dropping vowels is bad10:" + vowel);
        l.add(dc_v_sub);
        l.add(dc_v_sup);
        return;
    }
    throw new IllegalArgumentException("bad vowel " + vowel);
}

/** Appends context_1 (when non-null) and then context_2 to l, unless
    context_added[0] says that has been done already; afterwards
    context_added[0] is true. */
private static void addVowelContextOnce(List l, DuffCode context_1,
                                        DuffCode context_2,
                                        boolean[] context_added) {
    if (context_added[0])
        return;
    context_added[0] = true;
    if (null != context_1)
        l.add(context_1);
    l.add(context_2);
}

/** Appends vowelGlyph to l, or throws an IllegalArgumentException whose
    message is failurePrefix followed by vowel if vowelGlyph is null. */
private static void addVowelGlyphOrFail(List l, DuffCode vowelGlyph,
                                        String failurePrefix, String vowel) {
    if (null == vowelGlyph)
        throw new IllegalArgumentException(failurePrefix + vowel);
    l.add(vowelGlyph);
}
/**
* True if you want TibetanMachineWeb-to-Extended-Wylie conversion
* to produce Wylie that, if typed, will produce the same sequence
* of TibetanMachineWeb glyphs. Without it, converting the glyphs
* you get from typing jskad, skaska, skaskaska, skaskaskaska,
* etc. will not give you Wylie that, if typed in again, will
* produce the original glyphs. Hence, if this is true, then you
* get working, end-to-end Wylie for syntactically illegal
* sequences of glyphs. */
private static final boolean makeIllegalTibetanGoEndToEnd = true;
/** Returns the implicit vowel to write after wylie: "A" for ACIP,
    and for EWTS the a-vowel, except that it is the empty string
    when wylie (which really is EWTS, not ACIP) is achen itself. */
private static String aVowelToUseAfter(boolean EWTSNotACIP, String wylie) {
    final boolean isAchen = wylie.equals(ACHEN);
    if (!EWTSNotACIP) {
        return "A" /* hard-coded ACIP constant */;
    }
    if (isAchen) {
        /* it's EWTS{a}, not EWTS{aa}, for achen alone.  But it's
           ACIP{AA}. */
        return "";
    }
    return WYLIE_aVOWEL;
}
/** Joins two pieces of transliteration (the EWTS pair if EWTSNotACIP
    is true, the ACIP pair otherwise), putting a disambiguator
    between them when wylie1 is a Wylie top letter and wylie2 is
    "d". */
private static String unambiguousPostAVowelTranslit(boolean EWTSNotACIP,
                                                    String wylie1,
                                                    String wylie2,
                                                    String acip1,
                                                    String acip2) {
    // type "lard" vs. "lar.d", and you'll see the need for this
    // disambiguation of suffix and postsuffix.  sa doesn't take
    // any head letters, so only da needs to be considered.
    final boolean needsDisambiguator
        = TibetanMachineWeb.isWylieTop(wylie1)
          && wylie2.equals(/* FIXME: hard-coded */ "d");

    if (EWTSNotACIP) {
        String separator = needsDisambiguator ? WYLIE_DISAMBIGUATING_KEY_STRING : "";
        return wylie1 + separator + wylie2;
    }
    String separator = needsDisambiguator ? "-" : "";
    return acip1 + separator + acip2;
}
/**
 * Gets the Extended Wylie for the given sequence of glyphs if
 * EWTSNotACIP is true, or the ACIP otherwise. The work is done by
 * getTranslitImplementation; warnings are collected, and printed to
 * System.out, only when the debug flag is on.
 * @param EWTSNotACIP true if you want THDL Extended Wylie, false if
 * you want ACIP
 * @param dcs an array of TMW glyphs
 * @param noSuch an array which will not be touched if this is
 * successful; however, if there is no THDL Extended Wylie/ACIP
 * corresponding to these glyphs, then noSuch[0] will be set to true
 * @return the Extended Wylie/ACIP corresponding to these glyphs (with
 * font size info), or null */
public static TranslitList getTranslit(boolean EWTSNotACIP,
                                       SizedDuffCode[] dcs,
                                       boolean noSuch[]) {
    StringBuffer warnings = null;
    if (debug)
        warnings = new StringBuffer();

    TranslitList translit
        = getTranslitImplementation(EWTSNotACIP, dcs, noSuch, warnings);

    if (debug) {
        if (warnings.length() > 0)
            System.out.println("DEBUG: warnings in TMW->Wylie: " + warnings);
    }
    return translit;
}
/** True for and only for ma, nga, sa, and ra, the consonants
    tested here because they can end an appendage ('am, 'ang,
    'os, 'ur, and so on). */
private static final boolean isAppendageNonVowelWylie(String wylie) {
    if (MA.equals(wylie)) /* 'AM */
        return true;
    if (NGA.equals(wylie)) /* 'ANG, 'UNG */
        return true;
    if (SA.equals(wylie)) /* 'OS, 'US, maybe 'IS */
        return true;
    return RA.equals(wylie); /* 'UR */
}
// DLC FIXME: {H}, U+0F7F, is part of a grapheme cluster!
// David Chapman and I both need a comprehensive list of these
// guys. Get it from Unicode 4.0 spec?
/** Scans the TMW glyphs in glyphList and creates the returned
    list of grapheme clusters based on them.  A grapheme cluster
    is a consonant or consonant stack with optional adornment or a
    number (possibly super- or subscribed) or some other glyph
    alone.  glyphList must be non-empty and hold SizedDuffCode
    elements; noSuchWylie is handed to
    TibetanMachineWeb.getWylieForGlyph for every glyph. */
private static TGCList breakTshegBarIntoGraphemeClusters(java.util.List glyphList,
                                                         boolean noSuchWylie[]) {
    // Definition: adornment means vowels and achungs and bindus.
    // It should be this, though (FIXME): any combining
    // characters.
    final int glyphCount = glyphList.size();
    ThdlDebug.verify(glyphCount > 0);

    // A list of grapheme clusters (see UnicodeGraphemeCluster).
    // glyphCount is an overestimate (speeds us up, wastes some memory).
    TMWGCList clusters = new TMWGCList(glyphCount);

    StringBuffer adornment = new StringBuffer(); // for {cui}, we append to this guy twice.
    String base = null; // for the "c" in {cui}
    int clusterType = TGCPair.TYPE_OTHER;

    for (int idx = 0; idx < glyphCount; idx++) {
        DuffCode glyph = ((SizedDuffCode)glyphList.get(idx)).getDuffCode();
        String wylie = TibetanMachineWeb.getWylieForGlyph(glyph, noSuchWylie);

        // Is a cluster under construction that a new base or a
        // stand-alone glyph would have to close first?
        boolean clusterPending = (adornment.length() > 0 || null != base);

        boolean startsSanskritStack
            = TibetanMachineWeb.isWylieSanskritConsonantStack(wylie)
              ||
              /* U+0FAD, which should become ACIP "V", not "W",
                 though the EWTS is "w" just as it is for
                 TMW(fontNum==1).53: */
              (8 == glyph.getFontNum() && 69 == glyph.getCharNum());

        if (startsSanskritStack
            || TibetanMachineWeb.isWylieTibetanConsonantOrConsonantStack(wylie)) {
            // A new consonant (stack) closes the previous cluster
            // and becomes the base of the next one.
            if (clusterPending) {
                clusters.add(new TGCPair(base, adornment.toString(), clusterType));
                adornment.setLength(0);
            }
            // We want {p-y}, not {py}.
            base = TibetanMachineWeb.getHashKeyForGlyph(glyph.getFontNum(),
                                                        glyph.getCharNum());
            if (startsSanskritStack)
                clusterType = TGCPair.TYPE_SANSKRIT;
            else
                clusterType = TGCPair.TYPE_TIBETAN;
            continue;
        }

        if (TibetanMachineWeb.isWylieAdornmentAndContainsVowel(wylie)
            || TibetanMachineWeb.isWylieAdornment(wylie)) {
            adornment.append(wylie);
            // DLC FIXME: I bet three or four vowels together
            // breaks TMW->ACIP and TMW->EWTS.  Test it.  When it
            // does, revamp TGCPair to have a set of vowels.  The
            // output order should be consistent with the
            // Unicode-imposed order on vowels.  (Maybe modulo the
            // CCV bug in Unicode w.r.t. above- and below-base
            // vowels?)
            continue;
        }

        // number or weird thing: it closes the previous cluster and
        // then stands alone.
        if (clusterPending) {
            clusters.add(new TGCPair(base, adornment.toString(), clusterType));
            adornment.setLength(0);
            base = null;
        }
        clusters.add(new TGCPair(wylie, null, TGCPair.TYPE_OTHER));
        clusterType = TGCPair.TYPE_OTHER;
    }

    // Close the cluster still under construction, if any.
    if (adornment.length() > 0 || null != base) {
        clusters.add(new TGCPair(base, adornment.toString(), clusterType));
    }
    return clusters;
}
/** Returns a string that classifies gcs as a legal Tibetan tsheg
* bar, a single Sanskrit grapheme cluster
* ("single-sanskrit-gc"), or invalid ("invalid"). If
* noPrefixTests is true, then ggyi will be seen as a
* "prefix-root", even though gya doesn't take a ga prefix.
*
* <p>The clusters are scanned left to right. candidateType records
* what the clusters seen so far could be, and each further cluster
* either refines it or makes it "invalid". Every value
* candidateType can take is a string literal or the result of
* String.intern(), which is why this method compares
* classifications with == and != rather than equals().
*
* @param gcs the grapheme clusters of one tsheg bar
* @param warnings either null, or a buffer to which an explanation
* is appended for most (not all) of the ways a tsheg bar can be
* found invalid
* @param noPrefixTests if true, isLegalPrefixRootCombo is never
* consulted; the prefix-root legality check is treated as passing
* @return the classification; "invalid" if gcs is empty or matches
* no legal pattern */
public static String getClassificationOfTshegBar(TGCList gcs,
// DLC the warnings are Wylie-specific
StringBuffer warnings,
boolean noPrefixTests) {
String candidateType = null;
// Now that we have grapheme clusters, see if they match any
// of the "legal tsheg bars":
int sz = gcs.size();
// A lone Sanskrit cluster gets its own classification; a
// Sanskrit cluster among several clusters is invalid (see the
// loop below).
if (sz == 1) {
TGCPair tp = gcs.get(0);
int cls = tp.classification;
if (TGCPair.SANSKRIT_WITHOUT_VOWEL == cls
|| TGCPair.SANSKRIT_WITH_VOWEL == cls)
return "single-sanskrit-gc";
}
// The cluster handled by the previous iteration. It is non-null
// whenever candidateType is non-null inside the loop, because
// every iteration that sets candidateType without breaking falls
// through to the assignment at the bottom of the loop.
TGCPair lastPair = null;
for (int i = 0; i < sz; i++) {
TGCPair tp = gcs.get(i);
int cls = tp.classification;
String wylie = tp.getWylie();
if (TGCPair.OTHER == cls) {
// Only an unbroken run of numbers is legal here.
if (TibetanMachineWeb.isWylieNumber(wylie)) {
if (null == candidateType) {
candidateType = "number";
} else {
if ("number" != candidateType) {
if (null != warnings)
warnings.append("Found something odd; the wylie is " + wylie + "\n");
candidateType = "invalid";
break;
}
}
} else {
if (null != warnings)
warnings.append("Found something odd; the wylie is " + wylie + "\n");
candidateType = "invalid";
break;
}
} else if (TGCPair.SANSKRIT_WITHOUT_VOWEL == cls
|| TGCPair.SANSKRIT_WITH_VOWEL == cls) {
// No warning is appended for this case.
candidateType = "invalid";
break;
} else if (TGCPair.CONSONANTAL_WITHOUT_VOWEL == cls
|| TGCPair.CONSONANTAL_WITH_VOWEL == cls) {
if (null == candidateType) {
// First cluster: a consonant for which isWylieLeft holds
// might be a prefix or might be the root.
if (TibetanMachineWeb.isWylieLeft(wylie)) {
candidateType = "prefix/root";
} else {
candidateType = "root";
}
} else {
if ("prefix/root" == candidateType) {
if (ACHUNG.equals(wylie)) {
// peek ahead to distinguish between ba's,
// ba'ala and ba'am:
TGCPair nexttp = (i+1 < sz) ? gcs.get(i+1) : null;
String nextwylie = (nexttp == null) ? "" : nexttp.getWylie();
if (isAppendageNonVowelWylie(nextwylie)) {
candidateType = "maybe-appendaged-prefix/root";
} else {
if (noPrefixTests
|| isLegalPrefixRootCombo(lastPair.getConsonantWylie(),
tp.getConsonantWylie()))
candidateType = "prefix/root-root/suffix";
else
candidateType = "root-suffix";
}
} else if (TibetanMachineWeb.isWylieRight(wylie)) {
if (noPrefixTests
|| isLegalPrefixRootCombo(lastPair.getConsonantWylie(),
tp.getConsonantWylie()))
candidateType = "prefix/root-root/suffix";
else
candidateType = "root-suffix";
} else if (TibetanMachineWeb.isWylieAchungAppendage(wylie)) {
candidateType = "appendaged-prefix/root";
} else {
if (noPrefixTests
|| isLegalPrefixRootCombo(lastPair.getConsonantWylie(),
tp.getConsonantWylie()))
candidateType = "prefix-root";
else {
// NOTE(review): unlike most warnings in this method,
// this one has no trailing "\n" -- confirm whether
// that is intended.
if (null != warnings)
warnings.append("Found what would be a prefix-root combo, but the root stack with wylie " + wylie + " does not take the prefix with wylie " + lastPair.getConsonantWylie());
candidateType = "invalid";
break;
}
}
} else if ("root" == candidateType) {
if (ACHUNG.equals(wylie)) {
// peek ahead to distinguish between pa's,
// pa'ala and pa'am:
TGCPair nexttp = (i+1 < sz) ? gcs.get(i+1) : null;
String nextwylie = (nexttp == null) ? "" : nexttp.getWylie();
if (isAppendageNonVowelWylie(nextwylie)) {
candidateType = "maybe-appendaged-root";
} else {
candidateType = "root-suffix";
}
} else if (TibetanMachineWeb.isWylieRight(wylie)) {
candidateType = "root-suffix";
} else if (TibetanMachineWeb.isWylieAchungAppendage(wylie)) {
candidateType = "appendaged-root";
} else {
if (null != warnings)
warnings.append("Found a non-prefix consonant or consonant stack followed by a consonant or consonant stack that is not simply a suffix; that thing's wylie is " + wylie + "\n");
candidateType = "invalid";
break;
}
} else if ("prefix-root" == candidateType) {
if (ACHUNG.equals(wylie)) {
// peek ahead to distinguish between bpa's,
// bpa'ala and bpa'am:
TGCPair nexttp = (i+1 < sz) ? gcs.get(i+1) : null;
String nextwylie = (nexttp == null) ? "" : nexttp.getWylie();
if (isAppendageNonVowelWylie(nextwylie)) {
candidateType = "maybe-appendaged-prefix-root";
} else {
candidateType = "prefix-root-suffix";
}
} else if (TibetanMachineWeb.isWylieRight(wylie)) {
candidateType = "prefix-root-suffix";
} else if (TibetanMachineWeb.isWylieAchungAppendage(wylie)) {
candidateType = "appendaged-prefix-root";
} else {
if (null != warnings)
warnings.append("Found a prefix plus a root stack plus a non-suffix consonant or consonant stack whose wylie is " + wylie + "\n");
candidateType = "invalid";
break;
}
} else if ("prefix/root-root/suffix" == candidateType) {
// Only a bare achung needs peekahead here (see just
// below); gag'am works.
if (ACHUNG.equals(wylie)) {
// peek ahead to distinguish between
// gga'am and gaga'ala:
TGCPair nexttp = (i+1 < sz) ? gcs.get(i+1) : null;
String nextwylie = (nexttp == null) ? "" : nexttp.getWylie();
if (isAppendageNonVowelWylie(nextwylie)) {
candidateType = "maybe-appendaged-prefix/root-root/suffix";
} else {
candidateType = "prefix-root-suffix";
}
} else if (TibetanMachineWeb.isWylieFarRight(wylie)) {
// Still ambiguous: the third cluster may be a suffix or
// a postsuffix. isWylieFarRight is tested before
// isWylieRight so that this ambiguous reading wins.
candidateType = "prefix/root-root/suffix-suffix/postsuffix";
} else if (TibetanMachineWeb.isWylieRight(wylie)) {
candidateType = "prefix-root-suffix";
} else if (TibetanMachineWeb.isWylieAchungAppendage(wylie)) {
candidateType = "appendaged-prefix/root-root/suffix";
} else {
if (null != warnings)
warnings.append("Found a prefix/root stack plus a suffix/root stack plus a non-suffix, non-postsuffix consonant or consonant stack whose wylie is " + wylie + "\n");
candidateType = "invalid";
break;
}
} else if ("root-suffix" == candidateType) {
// This has no peekahead w.r.t. 'am and 'ang,
// but it needs none because we peeked to be
// sure that this was root-suffix and not
// maybe-appendaged-root.
if (TibetanMachineWeb.isWylieFarRight(wylie)) {
candidateType = "root-suffix-postsuffix";
} else if (TibetanMachineWeb.isWylieAchungAppendage(wylie)) {
candidateType = "appendaged-root-suffix";
} else if (ACHUNG.equals(wylie)) {
candidateType = "maybe-appendaged-root-suffix";
} else {
if (null != warnings)
warnings.append("Found a root stack plus a suffix plus a non-postsuffix consonant or consonant stack whose wylie is " + wylie + "\n");
candidateType = "invalid";
break;
}
} else if ("prefix/root-root/suffix-suffix/postsuffix" == candidateType
|| "prefix-root-suffix" == candidateType) {
// this has no peekahead and needs none.
if (TibetanMachineWeb.isWylieFarRight(wylie)) {
candidateType = "prefix-root-suffix-postsuffix";
} else if (TibetanMachineWeb.isWylieAchungAppendage(wylie)) {
// if we simply prepended to
// candidateType, we wouldn't get interned
// strings.
candidateType = ("appendaged-" + candidateType).intern();
} else if (ACHUNG.equals(wylie)) {
candidateType = ("maybe-appendaged-" + candidateType).intern();
} else {
if (null != warnings)
warnings.append("Found a prefix/root stack plus a suffix/root stack plus a suffix/postsuffix plus a non-postsuffix consonant or consonant stack whose wylie is " + wylie + "\n");
candidateType = "invalid";
break;
}
} else if ("prefix-root-suffix-postsuffix" == candidateType) {
// this has no peekahead and needs none.
if (TibetanMachineWeb.isWylieAchungAppendage(wylie)) {
candidateType = "appendaged-prefix-root-suffix-postsuffix";
} else if (ACHUNG.equals(wylie)) {
candidateType = "maybe-appendaged-prefix-root-suffix-postsuffix";
} else {
if (null != warnings)
warnings.append("Found a prefix plus root stack plus suffix plus postsuffix; then found yet another consonant or consonant stack whose wylie is " + wylie + "\n");
candidateType = "invalid";
break;
}
} else if ("root-suffix-postsuffix" == candidateType) {
// this has no peekahead and needs none.
if (TibetanMachineWeb.isWylieAchungAppendage(wylie)) {
candidateType = "appendaged-root-suffix-postsuffix";
} else if (ACHUNG.equals(wylie)) {
candidateType = "maybe-appendaged-root-suffix-postsuffix";
} else {
if (null != warnings)
warnings.append("Found a root stack plus suffix plus postsuffix; then found yet another consonant or consonant stack whose wylie is " + wylie + "\n");
candidateType = "invalid";
break;
}
} else if (candidateType.startsWith("maybe-appendaged-")) {
// A bare achung was seen last; the current cluster must
// complete the appendage, which turns "maybe-appendaged-X"
// into "appendaged-X".
if (isAppendageNonVowelWylie(wylie)) {
candidateType
= candidateType.substring("maybe-".length()).intern();
} else {
if (null != warnings)
warnings.append("Found a tsheg bar that has an achung (" + ACHUNG + ") tacked on, followed by some other thing whose wylie is " + wylie + "\n");
candidateType = "invalid";
break;
}
} else if (candidateType.startsWith("appendaged-")) {
// Further appendages may follow an appendage. The current
// cluster is tested on its own, then joined with the
// previous one, then joined with the previous two.
if (TibetanMachineWeb.isWylieAchungAppendage(wylie)
// 'ang:
|| TibetanMachineWeb.isWylieAchungAppendage(lastPair.getWylie() + wylie)
// 'ongs, as in ma'ongs:
|| (i > 1
&& TibetanMachineWeb.isWylieAchungAppendage(gcs.get(i-2).getWylie() + lastPair.getWylie() + wylie))) {
// candidateType stays what it is.
} else if (ACHUNG.equals(wylie)) {
candidateType = ("maybe-" + candidateType).intern();
} else {
if (null != warnings)
warnings.append("Found a tsheg bar that has a 'i, 'e, 'o, 'u, or 'ang 'am appendage already and then found yet another consonant or consonant stack whose wylie is " + wylie + "\n");
candidateType = "invalid";
break;
}
} else {
// The only state left that can legitimately reach this
// point is "number"; anything else is a programming
// error in this method.
if ("invalid" == candidateType)
throw new Error("forgot to break out of the for loop after diagnosing invalidity.");
if ("number" != candidateType)
throw new Error("missed a case; case is " + candidateType);
if (null != warnings)
warnings.append("Found a consonant or consonant stack after something odd; the consonantish thing has wylie " + wylie + "\n");
candidateType = "invalid";
break;
}
}
} else if (TGCPair.LONE_VOWEL == cls) {
// NOTE(review): this warning also lacks a trailing "\n" --
// confirm whether that is intended.
if (null != warnings)
warnings.append("Found a vowel that did not follow either a Tibetan consonant or consonant stack or another vowel.");
candidateType = "invalid";
break;
} else {
throw new Error("bad cls");
}
lastPair = tp;
}
// Still null only if the loop body never ran, i.e. gcs is empty.
if (null == candidateType) candidateType = "invalid";
// The tsheg bar ended on a bare achung that nothing completed
// into an appendage.
if (candidateType.startsWith("maybe-appendaged-")) {
if (null != warnings)
warnings.append("Found a tsheg bar that has an extra achung (" + ACHUNG + ") tacked on\n");
candidateType = "invalid";
}
return candidateType;
}
/** Appends to translitBuffer the EWTS/ACIP for the glyph list
glyphList (which should be an ArrayList for speed). The font
size recorded for the transliteration is that of the first glyph
in glyphList; a change of font size inside a tsheg bar is not
handled. The transliteration will be very user-friendly for
"legal tsheg bars" and will be valid, but possibly ugly
(interspersed with disambiguators or extra vowels, etc.)
Wylie/ACIP for other things, such as Sanskrit transliteration.
Updates warnings and noSuch like the caller does.
<p>What constitutes a legal, non-punctuation, non-whitespace
tsheg bar? The following are the only such:</p>
<ul>
<li>one or more numbers</li>
<li>a legal "tyllable" appended with zero or more particles
from the set { 'i, 'o, 'u, 'e, 'ang, 'am }</li>
</ul>
<p>A "tyllable" is, by definition, one of the following:</p>
<ul>
<li>a single, possibly adorned consonant stack</li>
<li>two consonant stacks where one is a single,
unadorned consonant (and is a prefix if it is first and
a suffix if it is last) and the other is possibly
adorned</li>
<li>three consonant stacks where at most one has adornment.
If the second has adornment, then the first must be an
unadorned prefix consonant and the last must be an
unadorned suffix consonant. If the first has adornment,
then the second must be an unadorned suffix consonant
and the third must be an unadorned secondary suffix
consonant.</li>
<li>four consonant stacks where either none is adorned or
only the second consonant stack is adorned, the first is
an unadorned prefix consonant, the third is an unadorned
suffix consonant, and the fourth is an unadorned
secondary suffix consonant.</li>
</ul>
<p>When there are three unadorned consonant stacks in a
tyllable, a hard-coded list of valid Tibetan tsheg bars is
relied upon to determine if the 'a'/'A' vowel comes after
the first or the second consonant.</p>
@param EWTSNotACIP true to produce EWTS, false to produce ACIP
@param glyphList the SizedDuffCode glyphs of one tsheg bar; must
be non-empty because element 0 is read unconditionally
@param noSuch handed to breakTshegBarIntoGraphemeClusters
@param warnings handed to getClassificationOfTshegBar, which
accepts null
@param translitBuffer the list the transliteration is appended to */
private static void getTshegBarTranslit(boolean EWTSNotACIP,
java.util.List glyphList,
boolean noSuch[],
StringBuffer warnings,
TranslitList translitBuffer) {
// FIXME: If font size changes within a tsheg-bar, we don't
// handle that.
int fontSize = ((SizedDuffCode)glyphList.get(0)).getFontSize();
TGCList gcs
= breakTshegBarIntoGraphemeClusters(glyphList, noSuch);
String candidateType = getClassificationOfTshegBar(gcs, warnings, false);
if (debug) System.out.println("DEBUG: tsheg bar classification is " + candidateType);
int sz = gcs.size();
// The == comparisons on candidateType throughout this method
// rely on getClassificationOfTshegBar and
// getCandidateTypeModuloAppendage returning interned strings.
if (candidateType == "invalid"
|| candidateType == "single-sanskrit-gc") {
// Forget beauty and succinctness -- just be sure to
// generate transliteration that can be converted
// unambiguously into Tibetan. Use a disambiguator or
// vowel after each grapheme cluster.
//
// If we truly didn't care about beauty, we'd just lump
// SANSKRIT_WITHOUT_VOWEL and SANSKRIT_WITH_VOWEL into
// OTHER.
for (int i = 0; i < sz; i++) {
TGCPair tp = (TGCPair)gcs.get(i);
int cls = tp.classification;
String wylie = tp.getWylie();
String translit = (EWTSNotACIP) ? wylie : tp.getACIP();
if (TibetanMachineWeb.isWylieVowel(wylie) && i > 0) {
// au would be achen with au vowel, so use a.u; ai
// would be achen with ai vowel, so use a.i; l-i
// won't happen, you'd see la-i or gla-i, not l-i
// or gl-i; similarly for r-i, r-I, and l-I.
// Even though we only need it for ka.u and ka.i
// and a.u and a.i, we always do it (see Rule 10
// of the September 1, 2003 draft of EWTS
// standard).
translitBuffer.append(WYLIE_DISAMBIGUATING_KEY, fontSize);
}
translitBuffer.append(translit, fontSize);
if (TibetanMachineWeb.isWylieTibetanConsonantOrConsonantStack(wylie)
|| TibetanMachineWeb.isWylieSanskritConsonantStack(wylie)) {
// A bare consonant cluster gets an explicit vowel.
translitBuffer.append(aVowelToUseAfter(EWTSNotACIP, wylie), fontSize);
} else if (i + 1 < sz) {
// Anything else that is not the last cluster and carries
// no vowel of its own is followed by a disambiguator.
if (TGCPair.CONSONANTAL_WITH_VOWEL != cls
&& TGCPair.SANSKRIT_WITH_VOWEL != cls)
translitBuffer.append(EWTSNotACIP
? WYLIE_DISAMBIGUATING_KEY : '-',
fontSize);
}
}
} else {
// Generate perfect, beautiful transliteration, using the
// minimum number of vowels and disambiguators.
//
// leftover is the index of the first grapheme cluster that
// the classification-specific code below does NOT emit; the
// loop at the end of this method emits clusters leftover..sz-1.
// The initial sz + 1 is only a placeholder: every branch
// below either overwrites it or throws.
int leftover = sz + 1;
// Appendaged vs. not appendaged? it affects nothing at
// this stage except for pa'm vs. pa'am and
// appendaged-prefix/root-root/suffix (e.g., 'ad'i
// (incorrect) vs. 'da'i (correct)).
boolean appendaged = (candidateType.startsWith("appendaged-"));
candidateType = getCandidateTypeModuloAppendage(candidateType);
if ("prefix/root-root/suffix-suffix/postsuffix" == candidateType) {
/* Update: Chris Fynn wrote this in response to an
e-mail from David Chapman on Feb 21, 2005:
<quote Chris Fynn feb 21 2005>
When working out the rules for Tibetan and Dzongkha
collation in Bhutan we came up with the following sequences
that could be ambiguous:
0F51 0F42 0F66
0F60 0F42 0F66
0F51 0F44 0F66
0F42 0F53 0F51
0F58 0F53 0F51
0F56 0F42 0F66
0F51 0F56 0F66
0F60 0F56 0F66
0F58 0F42 0F66
0F58 0F44 0F66
0F51 0F58 0F66
After much consultation with experts in Bhutan it was
decided these should always be read as follows:
0F51 0F42 0F66 dgas
0F60 0F42 0F66 'gas
0F51 0F44 0F66 dngas *
0F42 0F53 0F51 gnad
0F58 0F53 0F51 mnad *
0F56 0F42 0F66 bags
0F51 0F56 0F66 dbas
0F60 0F56 0F66 'bas *
0F58 0F42 0F66 mags
0F58 0F44 0F66 mangs
0F51 0F58 0F66 dmas
In most cases it was found that only one of the two possible
readings actually existed as words. 0F51 0F44 0F66 , 0F58
0F53 0F51, and 0F60 0F56 0F66 were not found as syllables in
any known words, but the experts felt that *if* they
occurred in Tibetan or Dzongkha text then dngas, mnad, and
'bas would be the most likely reading.
</quote>
Because of this e-mail, dbas and dngas were added to the list of
exceptions. */
/* Yes, this is ambiguous. How do we handle it? See
* this from Andres (but note that only 4 of the 14 in
* the second list are ambiguous because ra na sa and
* la are not prefixes):
*
* <quote>
* I'm posting this upon David Chandler's
* request. According to Lobsang Thonden in Modern
* Tibetan Grammar Language (page 42), with regards to
* identifying the root letter in 3 lettered words
* there are only 23 ambiguous cases. He writes:
*
* If the last letter is 'sa' and the first two
* letters are affixes, then the SECOND ONE is the
* root letter in the following 9 WORDS ONLY:
*
* gdas gnas gsas dgas dmas bdas mdas 'gas 'das [NOTE:
* Andres later came across 'bad, so we handle it this
* way also]
*
* And the FIRST is the root letter in the following
* 14 WORDS ONLY:
*
* rags lags nags bags bangs gangs rangs langs nangs
* sangs babs rabs rams nams
*
* As I mentioned before, I think that the best
* solution for now is to hard-wire these cases. Even
* if the list is not exhaustive, at least we'll have
* most cases covered.
* </quote>
*
* But there's more to the rule, as bug 998476 shows:
* bsad is correct, not bas.d, so we have to interpret
* as prefix-root-suffix. */
leftover = 3;
/* FIXME: these constants are hard-wired here, rather
* than in THDLWylieConstants, because I'm lazy. */
String wylie1 = ((TGCPair)gcs.get(0)).getWylie();
String wylie2 = ((TGCPair)gcs.get(1)).getWylie();
String wylie3 = ((TGCPair)gcs.get(2)).getWylie();
// The ACIP strings are only fetched, and only used, when ACIP
// output was requested.
String acip1 = (EWTSNotACIP) ? null : ((TGCPair)gcs.get(0)).getACIP();
String acip2 = (EWTSNotACIP) ? null : ((TGCPair)gcs.get(1)).getACIP();
String acip3 = (EWTSNotACIP) ? null : ((TGCPair)gcs.get(2)).getACIP();
// Read as prefix-root-suffix when the third cluster is "d" or
// when the first two clusters are one of the hard-wired pairs
// below; otherwise read as root-suffix-postsuffix.
if (wylie3.equals("d")
|| ((wylie1.equals("g") && (wylie2.equals("d")
|| wylie2.equals("n")
|| wylie2.equals("s")))
|| (wylie1.equals("d") && (wylie2.equals("g")
|| wylie2.equals("m")
|| wylie2.equals("b")
|| wylie2.equals("ng")))
|| (wylie1.equals("b") && wylie2.equals("d"))
|| (wylie1.equals("m") && wylie2.equals("d"))
|| (wylie1.equals("'") && (wylie2.equals("g")
|| wylie2.equals("d")
|| wylie2.equals("b"))))) {
// prefix-root-suffix
if (TibetanMachineWeb.isAmbiguousWylie(wylie1, wylie2)) {
if (EWTSNotACIP) {
translitBuffer.append(wylie1
+ WYLIE_DISAMBIGUATING_KEY
+ wylie2,
fontSize);
} else {
translitBuffer.append(acip1 + '-' + acip2,
fontSize);
}
} else {
if (EWTSNotACIP) {
translitBuffer.append(wylie1 + wylie2,
fontSize);
} else {
translitBuffer.append(acip1 + acip2,
fontSize);
}
}
translitBuffer.append(aVowelToUseAfter(EWTSNotACIP, wylie2)
+ (EWTSNotACIP ? wylie3 : acip3),
fontSize);
} else {
// root-suffix-postsuffix
if (EWTSNotACIP)
translitBuffer.append(wylie1
+ aVowelToUseAfter(EWTSNotACIP, wylie1)
+ unambiguousPostAVowelTranslit(EWTSNotACIP,
wylie2,
wylie3,
acip2,
acip3),
fontSize);
else
translitBuffer.append(acip1
+ aVowelToUseAfter(EWTSNotACIP, wylie1)
+ unambiguousPostAVowelTranslit(EWTSNotACIP,
wylie2,
wylie3,
acip2,
acip3),
fontSize);
}
} else if ("root" == candidateType
|| (!appendaged
&& "prefix/root-root/suffix" == candidateType)
|| "prefix/root" == candidateType
|| "root-suffix-postsuffix" == candidateType
|| "root-suffix" == candidateType) {
// In all of these the first cluster is treated as the root:
// emit it, followed by a vowel if it does not carry one.
String wylie1 = ((TGCPair)gcs.get(0)).getWylie();
String acip1 = (EWTSNotACIP) ? null : ((TGCPair)gcs.get(0)).getACIP();
leftover = 1;
translitBuffer.append((EWTSNotACIP) ? wylie1 : acip1, fontSize);
if (((TGCPair)gcs.get(0)).classification
!= TGCPair.CONSONANTAL_WITH_VOWEL) {
ThdlDebug.verify(TGCPair.CONSONANTAL_WITHOUT_VOWEL
== ((TGCPair)gcs.get(0)).classification);
translitBuffer.append(aVowelToUseAfter(EWTSNotACIP, wylie1),
fontSize);
if (debug) System.out.println("DEBUG: appending vowel 2");
} else {
if (debug) System.out.println("DEBUG: already has vowel 2");
}
if ("root-suffix-postsuffix" == candidateType) {
// Suffix and postsuffix are emitted together.
leftover = 3;
String wylie2 = ((TGCPair)gcs.get(1)).getWylie();
String wylie3 = ((TGCPair)gcs.get(2)).getWylie();
String acip2 = (EWTSNotACIP) ? null : ((TGCPair)gcs.get(1)).getACIP();
String acip3 = (EWTSNotACIP) ? null : ((TGCPair)gcs.get(2)).getACIP();
translitBuffer.append(unambiguousPostAVowelTranslit(EWTSNotACIP,
wylie2,
wylie3,
acip2,
acip3),
fontSize);
}
} else if ("prefix-root-suffix" == candidateType
|| "prefix-root" == candidateType
|| (appendaged
&& "prefix/root-root/suffix" == candidateType)
|| "prefix-root-suffix-postsuffix" == candidateType) {
// In all of these the first cluster is treated as a prefix and
// the second as the root: emit both, with a disambiguator
// between them when isAmbiguousWylie says one is needed, then
// a vowel if the root does not carry one.
String wylie1 = ((TGCPair)gcs.get(0)).getWylie();
String wylie2 = ((TGCPair)gcs.get(1)).getWylie();
String acip1 = (EWTSNotACIP) ? null : ((TGCPair)gcs.get(0)).getACIP();
String acip2 = (EWTSNotACIP) ? null : ((TGCPair)gcs.get(1)).getACIP();
leftover = 2;
if (TibetanMachineWeb.isAmbiguousWylie(wylie1, wylie2))
if (EWTSNotACIP)
translitBuffer.append(wylie1 + WYLIE_DISAMBIGUATING_KEY + wylie2,
fontSize);
else
translitBuffer.append(acip1 + '-' + acip2,
fontSize);
else
if (EWTSNotACIP)
translitBuffer.append(wylie1 + wylie2, fontSize);
else
translitBuffer.append(acip1 + acip2, fontSize);
if (((TGCPair)gcs.get(1)).classification
!= TGCPair.CONSONANTAL_WITH_VOWEL) {
ThdlDebug.verify(TGCPair.CONSONANTAL_WITHOUT_VOWEL
== ((TGCPair)gcs.get(1)).classification);
if (debug) System.out.println("DEBUG: appending vowel 1");
translitBuffer.append(aVowelToUseAfter(EWTSNotACIP, wylie2),
fontSize);
} else {
if (debug) System.out.println("DEBUG: already has vowel 1");
}
if ("prefix-root-suffix-postsuffix" == candidateType) {
// Suffix and postsuffix are emitted together.
leftover = 4;
String wylie3 = ((TGCPair)gcs.get(2)).getWylie();
String wylie4 = ((TGCPair)gcs.get(3)).getWylie();
String acip3 = (EWTSNotACIP) ? null : ((TGCPair)gcs.get(2)).getACIP();
String acip4 = (EWTSNotACIP) ? null : ((TGCPair)gcs.get(3)).getACIP();
translitBuffer.append(unambiguousPostAVowelTranslit(EWTSNotACIP,
wylie3,
wylie4,
acip3,
acip4),
fontSize);
}
} else if ("number" == candidateType) {
// Nothing is emitted here; the loop below emits every cluster.
leftover = 0;
} else {
throw new Error("missed a case down here");
}
// append the wylie/ACIP left over:
//
// For an appendaged tsheg bar, lastPairTranslit carries the
// transliteration of the preceding cluster into the
// getWylie(String)/getACIP(String) call for the next one. How
// TGCPair uses that argument is not visible from this method.
String lastPairTranslit = null;
if (appendaged && leftover >= 1) {
TGCPair tp = (TGCPair)gcs.get(leftover-1);
lastPairTranslit = (EWTSNotACIP
? tp.getWylie(null)
: tp.getACIP(null));
// If the most recent piece in translitBuffer does not end with
// that cluster's own transliteration, then a vowel was appended
// after it above; tack that vowel character onto
// lastPairTranslit too.
if ((translitBuffer.length() == 0)
|| !translitBuffer.get(translitBuffer.length() - 1).getTranslit().endsWith(lastPairTranslit)) {
int l;
if ((l = translitBuffer.length()) > 0) {
String s = translitBuffer.get(l - 1).getTranslit();
char lc = s.charAt(s.length() - 1);
ThdlDebug.verify(lc == ((EWTSNotACIP) ? 'a' : 'A') /* hard-coded ACIP and EWTS values */);
lastPairTranslit = lastPairTranslit + lc; /* 'da'i can cause this */
} else {
ThdlDebug.verify(false); // this better never happen.
lastPairTranslit = null;
}
}
}
for (int i = leftover; i < sz; i++) {
TGCPair tp = (TGCPair)gcs.get(i);
String y;
translitBuffer.append(EWTSNotACIP
? (y = tp.getWylie(lastPairTranslit))
: (y = tp.getACIP(lastPairTranslit)),
fontSize);
// Only appendaged tsheg bars thread the previous output forward;
// otherwise lastPairTranslit stays null for every cluster.
if (appendaged)
lastPairTranslit = y;
}
}
}
/**
 * Gets the Extended Wylie/ACIP for a sequence of glyphs.  This works
 * as follows:
 *
 * <p>We run along until we hit whitespace or punctuation.  We take
 * everything before that and we see if it's a legal Tibetan tsheg bar,
 * either a number or a word fragment.  If it is, we insert only one
 * vowel in the correct place.  If not, then we throw a disambiguating
 * key or a vowel after each stack.
 *
 * <p>Characters below 32 (control characters such as newlines) are
 * copied through.  For ACIP, a newline is emitted twice because a
 * real line break in ACIP is written as a blank line: "\n\n", or
 * "\r\n\r\n" when the input uses DOS line endings.
 *
 * @param EWTSNotACIP true if you want THDL Extended Wylie, false if
 * you want ACIP
 * @param dcs an array of glyphs
 * @param noSuch an array which will not be touched if this is
 * successful; however, if there is no THDL Extended Wylie/ACIP
 * corresponding to these glyphs, then noSuch[0] will be set to true
 * @param warnings either null or a buffer to which will be appended
 * warnings about illegal tsheg bars
 * @return the Extended Wylie/ACIP corresponding to these glyphs (with
 * font size info), or null if nothing at all was produced */
private static TranslitList getTranslitImplementation(boolean EWTSNotACIP,
                                                      SizedDuffCode[] dcs,
                                                      boolean noSuch[],
                                                      StringBuffer warnings) {
    // DLC FIXME: " " should become " " for ACIP

    // Glyphs of the tsheg bar currently being collected; flushed
    // through getTshegBarTranslit at each control character or
    // punctuation mark and once more at the end of input.
    ArrayList glyphList = new ArrayList();
    TranslitList translitBuffer = new TranslitList();
    for (int i = 0; i < dcs.length; i++) {
        char ch = dcs[i].getDuffCode().getCharacter();
        int fsz = dcs[i].getFontSize();
        if ((int)ch < 32) { // 32 is space, ' '
            if (!glyphList.isEmpty()) {
                getTshegBarTranslit(EWTSNotACIP, glyphList, noSuch,
                                    warnings, translitBuffer);
                glyphList.clear();
                if (null != warnings)
                    warnings.append("Some glyphs came right before a newline; they did not have a tsheg or shad come first.");
            }
            // In ACIP, \n\n (or \r\n\r\n with DOS line feeds)
            // indicates a real line break.
            if (!EWTSNotACIP && '\n' == ch) {
                if (i > 0
                    && dcs[i - 1].getDuffCode().getCharacter() == '\r') {
                    // The '\r' of this line ending was already
                    // appended on the previous iteration.
                    // Appending "\n\r" here and '\n' just below
                    // therefore yields "\r\n\r\n".  (Appending
                    // "\r\n" here, as this code once did, yields
                    // "\r\r\n\n".)
                    translitBuffer.append("\n\r", fsz);
                } else {
                    translitBuffer.append(ch, fsz);
                }
            }
            translitBuffer.append(ch, fsz);
        } else { // (int)ch >= 32
            String wylie
                = TibetanMachineWeb.getWylieForGlyph(dcs[i].getDuffCode(),
                                                     noSuch);
            String acip = null;
            if (!EWTSNotACIP) {
                // U+0F04 and U+0F05 -- these require lookahead to
                // see if the ACIP is # (two shishes) or * (one
                // swish)
                int howManyConsumed[] = new int[] { -1 /* invalid */ };
                acip = TibetanMachineWeb.getACIPForGlyph(dcs[i].getDuffCode(),
                                                         ((i+1<dcs.length)
                                                          ? dcs[i+1].getDuffCode()
                                                          : null),
                                                         ((i+2<dcs.length)
                                                          ? dcs[i+2].getDuffCode()
                                                          : null),
                                                         noSuch,
                                                         howManyConsumed);
                ThdlDebug.verify(howManyConsumed[0] <= 3
                                 && howManyConsumed[0] >= 1);
                // Skip the glyphs that the lookahead swallowed.
                i += howManyConsumed[0] - 1;
            }
            if (TibetanMachineWeb.isWyliePunc(wylie)
                && !TibetanMachineWeb.isWylieAdornment(wylie)) {
                if (!glyphList.isEmpty()) {
                    getTshegBarTranslit(EWTSNotACIP, glyphList, noSuch,
                                        warnings, translitBuffer);
                    glyphList.clear();
                }
                //append the punctuation:
                translitBuffer.append(EWTSNotACIP ? wylie : acip, fsz);
            } else {
                // NOTE(review): for ACIP, i may already have been
                // advanced past lookahead glyphs, in which case
                // dcs[i] is the last glyph consumed rather than the
                // one wylie was computed from.  Presumably only
                // punctuation ever consumes more than one glyph --
                // confirm against getACIPForGlyph.
                glyphList.add(dcs[i]);
            }
        }
    } // for
    // replace remaining TMW with transliteration
    if (!glyphList.isEmpty()) {
        getTshegBarTranslit(EWTSNotACIP, glyphList, noSuch,
                            warnings, translitBuffer);
        // glyphList.clear() if we weren't about to exit...
        if (null != warnings)
            warnings.append("The stretch of Tibetan ended without final punctuation.");
    }
    return ((translitBuffer.length() > 0) ? translitBuffer : null);
}
/** Strips a leading "appendaged-" from candidateType, so that
 *  "appendaged-root" yields "root".  A string without that prefix
 *  is returned as is.  A stripped result is interned because the
 *  code in this class compares classifications with ==. */
private static final String getCandidateTypeModuloAppendage(String candidateType) {
    final String marker = "appendaged-";
    if (!candidateType.startsWith(marker)) {
        return candidateType;
    }
    return candidateType.substring(marker.length()).intern();
}
/** Returns an array of size 2 that lists all the possible indices
 *  of the root stack given the chosen candidate type.  A negative
 *  number appears if there are not that many possible positions
 *  for the root.  (You'll get two negative numbers if there is no
 *  root stack.)
 *
 *  <p>The candidate type is compared by value, so the argument
 *  need not be an interned string.
 *
 *  @param candidateType a classification such as those returned by
 *  getClassificationOfTshegBar; must not be null
 *  @return a fresh two-element array: { 0, -1 } when the root must
 *  be the first stack, { 0, 1 } when it may be the first or the
 *  second, { 1, -1 } when it must be the second, and { -1, -1 }
 *  otherwise */
public static final int[] getIndicesOfRootForCandidateType(String candidateType) {
    // Appendaged vs. not appendaged? it affects nothing, so drop
    // a leading "appendaged-".  No interning is needed here because
    // every comparison below is by value.
    final String appendagedMarker = "appendaged-";
    String baseType = candidateType.startsWith(appendagedMarker)
        ? candidateType.substring(appendagedMarker.length())
        : candidateType;

    int[] rv = new int[] { -1, -1 };
    if ("prefix/root".equals(baseType)
        || baseType.startsWith("root")) {
        rv[0] = 0;
    } else if (baseType.startsWith("prefix/root-")) {
        rv[0] = 0;
        rv[1] = 1;
    } else if (baseType.startsWith("prefix-root")) {
        rv[0] = 1;
    }
    return rv;
}
/** Returns true if and only if the stack with Wylie <i>root</i>
 *  can take the prefix <i>prefix</i>.
 *
 *  <p>root is looked up as a TibetanMachineWeb hash key, first as
 *  given and then with every '+' replaced by '-'; if neither form is
 *  known, the answer is false.  The stack's Unicode is then read as
 *  (root), (head, root), (root, sub), or (head, root, sub),
 *  depending on its length, and LegalTshegBar is asked whether each
 *  possible reading takes the prefix.  A trailing wa-zur is ignored:
 *  we say (FIXME: do we say correctly?) that a stack with wa-zur
 *  can take a prefix if and only if the stack without can.
 *
 *  @param prefix the Wylie of the prefix: ACHUNG, "b", "m", "g", or
 *  "d"
 *  @param root the Wylie of the would-be root stack
 *  @throws IllegalArgumentException if prefix is none of the above
 *  and the stack, wa-zur aside, is one to three characters long */
private static boolean isLegalPrefixRootCombo(String prefix, String root) {
    // This will be decomposed enough.  If you can decompose it,
    // then it doesn't take a prefix!
    String hashKey = root;
    if (!TibetanMachineWeb.isKnownHashKey(hashKey)) {
        hashKey = hashKey.replace('+', '-');
        if (!TibetanMachineWeb.isKnownHashKey(hashKey)) {
            // If the glyph isn't even in TibetanMachine, then
            // it's not able to take any prefix.
            return false;
        }
    }

    String stackUnicode = TibetanMachineWeb.getUnicodeForWylieForGlyph(hashKey);
    if (stackUnicode == null)
        throw new Error("how? root is " + hashKey); // FIXME: make this an assertion

    int letterCount = stackUnicode.length();
    if (stackUnicode.charAt(letterCount - 1) == UnicodeConstants.EWSUB_wa_zur)
        --letterCount; // forget about wa-zur: see above.

    switch (letterCount) {
    case 1:
        // (root) is the only choice.
        return prefixFitsStack(prefix,
                               UnicodeConstants.EW_ABSENT,
                               stackUnicode.charAt(0),
                               UnicodeConstants.EW_ABSENT);
    case 2: {
        // (head, root) and (root, sub) are possibilities.
        char upper = stackUnicode.charAt(0);
        char lower = UnicodeUtils.getNominalRepresentationOfSubscribedConsonant(stackUnicode.charAt(1));
        return prefixFitsStack(prefix, upper, lower, UnicodeConstants.EW_ABSENT)
            || prefixFitsStack(prefix, UnicodeConstants.EW_ABSENT, upper, lower);
    }
    case 3: {
        // (head, root, sub) is the only choice.
        char head = stackUnicode.charAt(0);
        char middle = UnicodeUtils.getNominalRepresentationOfSubscribedConsonant(stackUnicode.charAt(1));
        char sub = UnicodeUtils.getNominalRepresentationOfSubscribedConsonant(stackUnicode.charAt(2));
        return prefixFitsStack(prefix, head, middle, sub);
    }
    default:
        return false;
    }
}

/** Asks LegalTshegBar whether the stack (head, rootLetter, sub)
 *  takes the prefix whose Wylie is <i>prefix</i>.
 *  @throws IllegalArgumentException if prefix is not ACHUNG, "b",
 *  "m", "g", or "d" */
private static boolean prefixFitsStack(String prefix,
                                       char head, char rootLetter, char sub) {
    if (ACHUNG.equals(prefix))
        return LegalTshegBar.takesAchungPrefix(head, rootLetter, sub);
    if ("b".equals(prefix))
        return LegalTshegBar.takesBao(head, rootLetter, sub);
    if ("m".equals(prefix))
        return LegalTshegBar.takesMao(head, rootLetter, sub);
    if ("g".equals(prefix))
        return LegalTshegBar.takesGao(head, rootLetter, sub);
    if ("d".equals(prefix))
        return LegalTshegBar.takesDao(head, rootLetter, sub);
    throw new IllegalArgumentException("prefix is " + prefix);
}
}