1afb3a0fdd
\, the Sanskrit virama, is not used. Of the 1370-odd ACIP texts I've got here, about 57% make it through the gauntlet (fewer if you demand a vowel or disambiguator on every stack of a non-Tibetan tsheg bar).
185 lines
6.4 KiB
Java
185 lines
6.4 KiB
Java
/*
|
|
The contents of this file are subject to the THDL Open Community License
|
|
Version 1.0 (the "License"); you may not use this file except in compliance
|
|
with the License. You may obtain a copy of the License on the THDL web site
|
|
(http://www.thdl.org/).
|
|
|
|
Software distributed under the License is distributed on an "AS IS" basis,
|
|
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
|
|
License for the specific terms governing rights and limitations under the
|
|
License.
|
|
|
|
The Initial Developer of this software is the Tibetan and Himalayan Digital
|
|
Library (THDL). Portions created by the THDL are Copyright 2003 THDL.
|
|
All Rights Reserved.
|
|
|
|
Contributor(s): ______________________________________.
|
|
*/
|
|
|
|
package org.thdl.tib.text.ttt;
|
|
|
|
import org.thdl.util.ThdlDebug;
|
|
|
|
/** An ordered pair used in ACIP-to-TMW conversion. The left side is
|
|
* the consonant or empty; the right side is the vowel, '+', or '-'.
|
|
* @author David Chandler */
|
|
/* DLC BIG FIXME: make this package work for EWTS, not just ACIP. */
|
|
class TPair {
|
|
/** The left side, or null if there is no left side. That is, the
|
|
* non-vowel, non-'m', non-':', non-'-', non-'+' guy. */
|
|
private String l;
|
|
String getLeft() {
|
|
ThdlDebug.verify(!"".equals(l));
|
|
return l;
|
|
}
|
|
|
|
/** The right side. That is, the vowel, with 'm' or ':' "vowel"
|
|
* after it if appropriate, or "-" (disambiguator), or "+"
|
|
* (stacking), or null otherwise. */
|
|
private String r;
|
|
String getRight() {
|
|
ThdlDebug.verify(!"".equals(r));
|
|
return r;
|
|
}
|
|
|
|
/** Constructs a new TPair with left side l and right side r.
|
|
* Use null or the empty string to represent an absence. */
|
|
TPair(String l, String r) {
|
|
// Normalize:
|
|
if (null != l && l.equals("")) l = null;
|
|
if (null != r && r.equals("")) r = null;
|
|
|
|
this.l = l;
|
|
this.r = r;
|
|
}
|
|
|
|
/** Returns a nice String representation. Returns "(D . E)" for
|
|
* ACIP {DE}, e.g., and (l . r) in general. */
|
|
public String toString() {
|
|
return "("
|
|
+ ((null == l) ? "" : l) + " . "
|
|
+ ((null == r) ? "" : r) + ")";
|
|
}
|
|
|
|
/** Returns the number of ACIP characters that make up this
|
|
* TPair. */
|
|
int size() {
|
|
return (((l == null) ? 0 : l.length())
|
|
+ ((r == null) ? 0 : r.length()));
|
|
}
|
|
|
|
/** Returns an TPair that is like this one except that it is
|
|
* missing N characters. The characters are taken from r, the
|
|
* right side, first and from l, the left side, second.
|
|
* @throw IllegalArgumentException if N is out of range */
|
|
TPair minusNRightmostACIPCharacters(int N)
|
|
throws IllegalArgumentException
|
|
{
|
|
int sz;
|
|
String newL = l, newR = r;
|
|
if (N > size())
|
|
throw new IllegalArgumentException("Don't have that many to remove.");
|
|
if (N < 1)
|
|
throw new IllegalArgumentException("You should't call this if you don't want to remove any.");
|
|
if (null != r && (sz = r.length()) > 0) {
|
|
int min = Math.min(sz, N);
|
|
newR = r.substring(0, sz - min);
|
|
N -= min;
|
|
}
|
|
if (N > 0) {
|
|
sz = l.length();
|
|
newL = l.substring(0, sz - N);
|
|
}
|
|
return new TPair(newL, newR);
|
|
}
|
|
|
|
/** Returns true if and only if this is nonempty and is l, if
|
|
* present, is a legal ACIP consonant, and is r, if present, is a
|
|
* legal ACIP vowel. */
|
|
boolean isLegal() {
|
|
if (size() < 1)
|
|
return false;
|
|
if (null != l && !ACIPRules.isConsonant(l))
|
|
return false;
|
|
if (null != r && !ACIPRules.isVowel(l))
|
|
return false;
|
|
return true;
|
|
}
|
|
|
|
/** Returns true if and only if this pair could be a Tibetan
|
|
* prefix. */
|
|
boolean isPrefix() {
|
|
return (null != l
|
|
&& ((null == r || "".equals(r))
|
|
|| "-".equals(r)
|
|
|| "A".equals(r)) // DLC though check for BASKYABS and warn because BSKYABS is more common
|
|
&& ("'".equals(l)
|
|
|| "M".equals(l)
|
|
|| "B".equals(l)
|
|
|| "D".equals(l)
|
|
|| "G".equals(l)));
|
|
}
|
|
|
|
/** Returns true if and only if this pair is merely a
|
|
* disambiguator. */
|
|
boolean isDisambiguator() {
|
|
return ("-".equals(r) && getLeft() == null);
|
|
}
|
|
|
|
/** Returns an TPair that is like this pair except that it has
|
|
* a "+" on the right if this pair is empty on the right and is
|
|
* empty on the right if this pair has a disambiguator (i.e., a
|
|
* '-') on the right. May return itself (but never mutates this
|
|
* instance). */
|
|
TPair insideStack() {
|
|
if (null == getRight())
|
|
return new TPair(getLeft(), "+");
|
|
else if ("-".equals(getRight()))
|
|
return new TPair(getLeft(), null);
|
|
else
|
|
return this;
|
|
}
|
|
|
|
/** Returns true if this pair contains a Tibetan number. */
|
|
boolean isNumeric() {
|
|
char ch;
|
|
return (l != null && l.length() == 1 && (ch = l.charAt(0)) >= '0' && ch <= '9');
|
|
}
|
|
|
|
/** Returns the EWTS Wylie that corresponds to this pair. Untested. */
|
|
String getWylie() {
|
|
String leftWylie = null;
|
|
if (getLeft() != null) {
|
|
leftWylie = ACIPRules.getWylieForACIPConsonant(getLeft());
|
|
if (leftWylie == null) {
|
|
if (isNumeric())
|
|
leftWylie = getLeft();
|
|
}
|
|
}
|
|
String rightWylie = null;
|
|
if ("-".equals(getRight()))
|
|
rightWylie = ".";
|
|
else if ("+".equals(getRight()))
|
|
rightWylie = "+";
|
|
else if (getRight() != null)
|
|
rightWylie = ACIPRules.getWylieForACIPVowel(getRight());
|
|
if (null == leftWylie) leftWylie = "";
|
|
if (null == rightWylie) rightWylie = "";
|
|
return leftWylie + rightWylie;
|
|
}
|
|
|
|
/** Appends legal Unicode corresponding to this (possible
|
|
* subscribed) pair to sb. DLC FIXME: which normalization form,
|
|
* if any? */
|
|
void getUnicode(StringBuffer sb, boolean subscribed) {
|
|
if (null != getLeft()) {
|
|
String x = ACIPRules.getUnicodeFor(getLeft(), subscribed);
|
|
if (null != x) sb.append(x);
|
|
}
|
|
if (null != getRight()
|
|
&& !("-".equals(getRight()) || "A".equals(getRight()))) {
|
|
String x = ACIPRules.getUnicodeFor(getRight(), subscribed);
|
|
if (null != x) sb.append(x);
|
|
}
|
|
}
|
|
}
|