7198f23361
table exactly and I fear that it makes the ACIP->Tibetan converter code a lot uglier. The TODO(DLC)[EWTS->Tibetan] comments littered throughout are part of the ugliness; they point to the ugliness. If each were addressed, cleanliness could perhaps be achieved. I've largely forgotten exactly what this change does, but it attempts to improve EWTS->Tibetan conversion. The lexer is probably really, really primitive. I concentrate here on converting a single tsheg bar rather than a whole document. Eclipse was used during part of my journey here and some imports were reorganized merely because I could. :) (Eclipse was needed when the usual ant build failed to run a new test EWTSTest. And I wanted its debugger.) Next steps: end-to-end EWTS tests should bring many problems to light. Fix those. Triage all the TODO comments. I don't know that I'll ever really trust the implementation. The tests are valuable, though. A clean implementation of EWTS->Tibetan in Jython might hold enough interest for me; I'd like to learn Python.
258 lines
9.3 KiB
Java
258 lines
9.3 KiB
Java
/*
|
|
The contents of this file are subject to the THDL Open Community License
|
|
Version 1.0 (the "License"); you may not use this file except in compliance
|
|
with the License. You may obtain a copy of the License on the THDL web site
|
|
(http://www.thdl.org/).
|
|
|
|
Software distributed under the License is distributed on an "AS IS" basis,
|
|
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
|
|
License for the specific terms governing rights and limitations under the
|
|
License.
|
|
|
|
The Initial Developer of this software is the Tibetan and Himalayan Digital
|
|
Library (THDL). Portions created by the THDL are Copyright 2003 THDL.
|
|
All Rights Reserved.
|
|
|
|
Contributor(s): ______________________________________.
|
|
*/
|
|
|
|
package org.thdl.tib.text.ttt;
|
|
|
|
import org.thdl.util.ThdlDebug;
|
|
|
|
/** An ordered pair used in ACIP/EWTS-to-TMW/Unicode conversion. The
|
|
* left side is the consonant or empty; the right side is either the
|
|
* vowel or '+' (indicating stacking in both ACIP and EWTS) or a
|
|
* disambiguator (e.g., '-' in ACIP or '.' in EWTS).
|
|
* @author David Chandler */
|
|
/* BIG FIXME: make this package work for EWTS, not just ACIP. (TODO(DLC)[EWTS->Tibetan]: does it?) */
|
|
class TPair {
|
|
/** the part that knows ACIP from EWTS */
|
|
private TTraits traits;
|
|
|
|
/** Returns the part that knows ACIP from EWTS. */
|
|
public TTraits getTraits() { return traits; }
|
|
|
|
/** The left side, or null if there is no left side. I.e., the
|
|
* non-wowel, non-disambiguator, non-'+' guy. */
|
|
private String l;
|
|
String getLeft() {
|
|
ThdlDebug.verify(!"".equals(l));
|
|
return l;
|
|
}
|
|
|
|
/** The right side. That is, the wowel or disambiguator or "+"
|
|
* (for stacking) or null otherwise. */
|
|
private String r;
|
|
String getRight() {
|
|
ThdlDebug.verify(!"".equals(r));
|
|
return r;
|
|
}
|
|
|
|
/** Constructs a new TPair with left side l and right side r.
|
|
* Use null or the empty string to represent an absence. */
|
|
TPair(TTraits traits, String l, String r) {
|
|
// Normalize:
|
|
if (null != l && l.equals("")) l = null;
|
|
if (null != r && r.equals("")) r = null;
|
|
|
|
this.l = l;
|
|
this.r = r;
|
|
this.traits = traits;
|
|
}
|
|
|
|
/** Returns a nice String representation. Returns "(D . E)" for
|
|
* ACIP {DE}, e.g., and (l . r) in general. */
|
|
public String toString() {
|
|
return "("
|
|
+ ((null == l) ? "" : l) + " . "
|
|
+ ((null == r) ? "" : r) + ")";
|
|
}
|
|
|
|
/** Returns the number of transliteration characters that make up
|
|
* this TPair. */
|
|
int size() {
|
|
return (((l == null) ? 0 : l.length())
|
|
+ ((r == null) ? 0 : r.length()));
|
|
}
|
|
|
|
/** Returns a TPair that is like this one except that it is
|
|
* missing N characters. The characters are taken from r, the
|
|
* right side, first and from l, the left side, second. The pair
|
|
* returned may be illegal, such as the (A . ') you can get from
|
|
* ACIP {A'AAMA}.
|
|
* @throws IllegalArgumentException if N is out of range */
|
|
TPair minusNRightmostTransliterationCharacters(int N)
|
|
throws IllegalArgumentException
|
|
{
|
|
int sz;
|
|
String newL = l, newR = r;
|
|
if (N > size())
|
|
throw new IllegalArgumentException("Don't have that many to remove.");
|
|
if (N < 1)
|
|
throw new IllegalArgumentException("You shouldn't call this if you don't want to remove any.");
|
|
if (null != r && (sz = r.length()) > 0) {
|
|
int min = Math.min(sz, N);
|
|
newR = r.substring(0, sz - min);
|
|
N -= min;
|
|
}
|
|
if (N > 0) {
|
|
sz = l.length();
|
|
newL = l.substring(0, sz - N);
|
|
}
|
|
return new TPair(traits, newL, newR);
|
|
}
|
|
|
|
/** Returns true if and only if this is nonempty and if l, if
|
|
* present, is a legal consonant, and if r, if present, is a
|
|
* legal wowel. */
|
|
boolean isLegal() {
|
|
if (size() < 1)
|
|
return false;
|
|
if (null != l && !traits.isConsonant(l))
|
|
return false;
|
|
if (null != r && !traits.isWowel(r))
|
|
return false;
|
|
return true;
|
|
}
|
|
|
|
/** Returns true if and only if this pair could be a Tibetan
|
|
* prefix. */
|
|
boolean isPrefix() {
|
|
return (null != l
|
|
&& ((null == r || "".equals(r))
|
|
|| traits.disambiguator().equals(r)
|
|
|| traits.aVowel().equals(r)) // FIXME: though check for BASKYABS and warn because BSKYABS is more common
|
|
&& traits.isPrefix(l));
|
|
}
|
|
|
|
/** Returns true if and only if this pair could be a Tibetan
|
|
* secondary suffix. */
|
|
boolean isPostSuffix() {
|
|
return (null != l
|
|
&& ((null == r || "".equals(r))
|
|
|| traits.disambiguator().equals(r)
|
|
|| traits.aVowel().equals(r)) // FIXME: though warn about GAMASA vs. GAMS
|
|
&& traits.isPostsuffix(l));
|
|
}
|
|
|
|
/** Returns true if and only if this pair could be a Tibetan
|
|
* suffix. */
|
|
boolean isSuffix() {
|
|
return (null != l
|
|
&& ((null == r || "".equals(r))
|
|
|| traits.disambiguator().equals(r)
|
|
|| traits.aVowel().equals(r))
|
|
&& traits.isSuffix(l));
|
|
}
|
|
|
|
/** Returns true if and only if this pair is merely a
|
|
* disambiguator. */
|
|
boolean isDisambiguator() {
|
|
return (traits.disambiguator().equals(r) && getLeft() == null);
|
|
}
|
|
|
|
/** Yep, this works for TPairs. */
|
|
public boolean equals(Object x) {
|
|
if (x instanceof TPair) {
|
|
TPair p = (TPair)x;
|
|
return ((getLeft() == p.getLeft() || (getLeft() != null && getLeft().equals(p.getLeft())))
|
|
|| (getRight() == p.getRight() || (getRight() != null && getRight().equals(p.getRight()))));
|
|
}
|
|
return false;
|
|
}
|
|
|
|
/** Returns a TPair that is like this pair except that it has a
|
|
* "+" on the right if this pair is empty on the right and is
|
|
* empty on the right if this pair has a disambiguator on the
|
|
* right. May return itself (but never mutates this
|
|
* instance). */
|
|
TPair insideStack() {
|
|
if (null == getRight())
|
|
return new TPair(traits, getLeft(), "+");
|
|
else if (traits.disambiguator().equals(getRight()))
|
|
return new TPair(traits, getLeft(), null);
|
|
else
|
|
return this;
|
|
}
|
|
|
|
/** Returns true if this pair contains a Tibetan number. */
|
|
boolean isNumeric() {
|
|
if (l != null && l.length() == 1) {
|
|
char ch = l.charAt(0);
|
|
return ((ch >= '0' && ch <= '9')
|
|
|| (ch >= '\u0f18' && ch <= '\u0f33')
|
|
|| ch == '\u0f3e' || ch == '\u0f3f');
|
|
}
|
|
return false;
|
|
// TODO(DLC)[EWTS->Tibetan]: what about half-numbers?
|
|
}
|
|
|
|
String getWylie() {
|
|
return getWylie(false);
|
|
}
|
|
|
|
/** Returns the EWTS Wylie that corresponds to this pair if
|
|
* justLeft is false, or the EWTS Wylie that corresponds to just
|
|
* {@link #getLeft()} if justLeft is true.
|
|
*
|
|
* <p>Returns "W" for ACIP "W", "r" for ACIP "R", y for ACIP "Y",
|
|
* even though sometimes the EWTS for those is "w", "R", or "Y".
|
|
* Handle that in the caller. */
|
|
String getWylie(boolean justLeft) {
|
|
String leftWylie = null;
|
|
if (getLeft() != null) {
|
|
leftWylie = traits.getEwtsForConsonant(getLeft());
|
|
if (leftWylie == null) {
|
|
if (isNumeric())
|
|
leftWylie = getLeft();
|
|
}
|
|
}
|
|
if (null == leftWylie) leftWylie = "";
|
|
if (justLeft) return leftWylie;
|
|
String rightWylie = null;
|
|
if (traits.disambiguator().equals(getRight()))
|
|
rightWylie = ".";
|
|
else if ("+".equals(getRight()))
|
|
rightWylie = "+";
|
|
else if (getRight() != null)
|
|
rightWylie = traits.getEwtsForWowel(getRight());
|
|
if (null == rightWylie) rightWylie = "";
|
|
return leftWylie + rightWylie;
|
|
}
|
|
|
|
/** Appends legal Unicode corresponding to this (possible
|
|
* subscribed) pair to sb. FIXME: which normalization form,
|
|
* if any? */
|
|
void getUnicode(StringBuffer sb, boolean subscribed) {
|
|
getUnicode(sb, sb, subscribed);
|
|
}
|
|
|
|
/** Appends legal Unicode corresponding to this (possible
|
|
* subscribed) pair to consonantSB (for the non-vowel part) and
|
|
* vowelSB (for the vowelish part ({'EEm:}, e.g.). FIXME: which
|
|
* normalization form, if any? */
|
|
void getUnicode(StringBuffer consonantSB, StringBuffer vowelSB,
|
|
boolean subscribed) {
|
|
if (null != getLeft()) {
|
|
String x = traits.getUnicodeFor(getLeft(), subscribed);
|
|
if (null == x) throw new Error("TPair: " + getLeft() + " has no Uni");
|
|
consonantSB.append(x);
|
|
}
|
|
if (null != getRight()
|
|
&& !(traits.disambiguator().equals(getRight())
|
|
|| "+".equals(getRight()) || traits.aVowel().equals(getRight()))) {
|
|
String x = traits.getUnicodeForWowel(getRight());
|
|
if (null == x) throw new Error("TPair: " + getRight() + " has no Uni");
|
|
vowelSB.append(x);
|
|
}
|
|
}
|
|
|
|
// TODO(DLC)[EWTS->Tibetan]
|
|
/** Returns true if this pair is surely the last pair in an ACIP
|
|
* stack. Stacking continues through (* . ) and (* . +), but
|
|
* stops anywhere else. */
|
|
boolean endsACIPStack() {
|
|
return (getRight() != null && !"+".equals(getRight()));
|
|
}
|
|
}
|