324 lines
12 KiB
Java
324 lines
12 KiB
Java
/*
The contents of this file are subject to the THDL Open Community License
Version 1.0 (the "License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License on the THDL web site
(http://www.thdl.org/).

Software distributed under the License is distributed on an "AS IS" basis,
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
License for the specific terms governing rights and limitations under the
License.

The Initial Developer of this software is the Tibetan and Himalayan Digital
Library (THDL). Portions created by the THDL are Copyright 2003 THDL.
All Rights Reserved.

Contributor(s): ______________________________________.
*/
|
|
|
|
package org.thdl.tib.text.ttt;
|
|
|
|
import java.util.ArrayList;
|
|
import java.util.ListIterator;
|
|
|
|
import org.thdl.tib.text.TGCList;
|
|
import org.thdl.tib.text.TibTextUtils;
|
|
|
|
/** A list of {@link TPairList TPairLists}, each of which is for
|
|
* a stack (a grapheme cluster), typically corresponding to one tsheg
|
|
* bar.
|
|
*
|
|
* @author David Chandler */
|
|
class TStackList {
|
|
/** FIXME: change me and see if performance improves. */
|
|
private static final int INITIAL_SIZE = 1;
|
|
|
|
/** a fast, non-thread-safe, random-access list implementation: */
|
|
private ArrayList al;
|
|
|
|
/** Creates an empty list. */
|
|
public TStackList() { al = new ArrayList(INITIAL_SIZE); }
|
|
|
|
/** Creates a list containing just p. */
|
|
public TStackList(TPairList p) {
|
|
al = new ArrayList(1);
|
|
add(p);
|
|
}
|
|
|
|
/** Creates an empty list with the capacity to hold N items. */
|
|
public TStackList(int N) {
|
|
al = new ArrayList(N);
|
|
}
|
|
|
|
/** Returns the ith pair in this list. */
|
|
public TPairList get(int i) { return (TPairList)al.get(i); }
|
|
|
|
/** Adds p to the end of this list. */
|
|
public void add(TPairList p) { al.add(p); }
|
|
|
|
/** Adds all the stacks in c to the end of this list. */
|
|
public void addAll(TStackList c) { al.addAll(c.al); }
|
|
|
|
/** Adds all the stacks in c to this list, inserting them at
|
|
* position k. */
|
|
public void addAll(int k, TStackList c) { al.addAll(k, c.al); }
|
|
|
|
/** Returns the number of TPairLists in this list. */
|
|
public int size() { return al.size(); }
|
|
|
|
/** Returns true if and only if this list is empty. */
|
|
public boolean isEmpty() { return al.isEmpty(); }
|
|
|
|
/** Returns the ACIP input (okay, maybe 1-2-3-4 instead of 1234)
|
|
* corresponding to this stack list. */
|
|
public String recoverACIP() {
|
|
return toStringHelper(false);
|
|
}
|
|
|
|
/** Returns a human-readable representation like {G}{YA} or
|
|
* {GYA}. */
|
|
public String toString() {
|
|
return toStringHelper(true);
|
|
}
|
|
|
|
private String toStringHelper(boolean brackets) {
|
|
int sz = size();
|
|
StringBuffer b = new StringBuffer();
|
|
for (int i = 0; i < sz; i++) {
|
|
if (brackets) b.append('{');
|
|
b.append(get(i).recoverACIP());
|
|
if (brackets) b.append('}');
|
|
}
|
|
return b.toString();
|
|
}
|
|
|
|
/** Returns a human-readable representation.
|
|
* @return something like [[(R . ), (D . O)], [(R . ), (J . E)]] */
|
|
public String toString2() {
|
|
return al.toString();
|
|
}
|
|
|
|
/** Returns true if and only if either x is a TStackList
|
|
* object representing the same TPairLists in the same
|
|
* order or x is a String that is equals to the result of {@link
|
|
* #toString()}. */
|
|
public boolean equals(Object x) {
|
|
if (x instanceof TStackList) {
|
|
return al.equals(((TStackList)x).al);
|
|
} else if (x instanceof String) {
|
|
return toString().equals(x) || toString2().equals(x);
|
|
}
|
|
return false;
|
|
}
|
|
|
|
/** Returns a hashCode appropriate for use with our {@link
|
|
* #equals(Object)} method. */
|
|
public int hashCode() { return al.hashCode(); }
|
|
|
|
/** Returns an iterator for this list. Mutate this list while
|
|
* iterating and you'll have to read the code to know what will
|
|
* happen. */
|
|
public ListIterator listIterator() { return al.listIterator(); }
|
|
|
|
/** Returns a pair with {@link BoolTriple#isLegal} true if and
|
|
* only if this list of stacks is a legal tsheg bar by the rules
|
|
* of Tibetan syntax (sometimes called rules of spelling). If
|
|
* this is legal, then {@link
|
|
* BoolTriple#isLegalAndHasAVowelOnRoot} will be true if and only
|
|
* if there is an explicit {A} vowel on the root stack.
|
|
* @param noPrefixTests true if you want to pretend that every
|
|
* stack can take every prefix, which is not the case in
|
|
* reality */
|
|
public BoolTriple isLegalTshegBar(boolean noPrefixTests) {
|
|
// FIXME: Should we handle PADMA and other Tibetanized Sanskrit fellows consistently? Right now we only treat single-stack Sanskrit guys as legal.
|
|
|
|
TTGCList tgcList = new TTGCList(this);
|
|
StringBuffer warnings = new StringBuffer();
|
|
String candidateType
|
|
= TibTextUtils.getClassificationOfTshegBar(tgcList, warnings, noPrefixTests);
|
|
|
|
// preliminary answer:
|
|
boolean isLegal = (candidateType != "invalid");
|
|
|
|
if (isLegal) {
|
|
if (isClearlyIllegal())
|
|
isLegal = false;
|
|
TPairList firstStack = this.get(0);
|
|
if (1 == firstStack.size()
|
|
&& firstStack.get(0).isPrefix()
|
|
&& null == firstStack.get(0).getRight() // because GAM is legal
|
|
&& !(candidateType.startsWith("prefix")
|
|
|| candidateType.startsWith("appendaged-prefix"))) {
|
|
isLegal = false;
|
|
}
|
|
}
|
|
|
|
boolean isLegalAndHasAVowelOnRoot = false;
|
|
if (isLegal) {
|
|
int rootIndices[]
|
|
= TibTextUtils.getIndicesOfRootForCandidateType(candidateType);
|
|
for (int i = 0; i < 2; i++) {
|
|
if (rootIndices[i] >= 0) {
|
|
int pairListIndex = tgcList.getTPairListIndex(rootIndices[i]);
|
|
TPairList pl = get(pairListIndex);
|
|
TPair p = pl.get(pl.size() - 1);
|
|
isLegalAndHasAVowelOnRoot
|
|
= (p.getRight() != null && p.getRight().startsWith("A")); // could be {A:}, e.g. TODO(DLC)[EWTS->Tibetan]: ???
|
|
if (isLegalAndHasAVowelOnRoot)
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
return new BoolTriple(isLegal,
|
|
isLegalAndHasAVowelOnRoot,
|
|
candidateType);
|
|
}
|
|
|
|
private static final boolean ddebug = false;
|
|
|
|
/** Returns true if and only if this stack list contains a clearly
|
|
* illegal construct. An example of such is a TPair (V . something). */
|
|
boolean isClearlyIllegal() {
|
|
// check for {D}{VA} sorts of things:
|
|
for (int i = 0; i < size(); i++) {
|
|
if (get(i).getACIPError("THIS MAKES IT FASTER AND IS SAFE, DON'T WORRY",
|
|
true /* faster... */)
|
|
!= null) {
|
|
if (ddebug) System.out.println("ddebug: error is " + get(i).getACIPError("THIS MAKES IT FASTER AND IS SAFE, DON'T WORRY", false));
|
|
return true;
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
|
|
/** Returns true if and only if this stack list contains a stack
|
|
* that does not end in a vowel or disambiguator. Note that this
|
|
* is not erroneous for legal Tibetan like {BRTAN}, where {B} has
|
|
* no vowel, but it is a warning sign for Sanskrit stacks.
|
|
* @param opl the pair list from which this stack list
|
|
* originated
|
|
* @param isLastStack if non-null, then isLastStack[0] will be
|
|
* set to true if and only if the very last stack is the only
|
|
* stack not to have a vowel or disambiguator on it */
|
|
boolean hasStackWithoutVowel(TPairList opl, boolean[] isLastStack) {
|
|
int runningSize = 0;
|
|
// FIXME: MARDA is MARD==MAR-D to us, but is probably MAR+DA, warn -- see 838470
|
|
for (int i = 0; i < size(); i++) {
|
|
TPairList pl = get(i);
|
|
String l;
|
|
TPair lastPair = opl.getNthNonDisambiguatorPair(runningSize + pl.size() - 1);
|
|
runningSize += pl.size();
|
|
if (null == lastPair.getRight()
|
|
&& !((l = lastPair.getLeft()) != null && l.length() == 1
|
|
&& l.charAt(0) >= '0' && l.charAt(0) <= '9')) {
|
|
if (null != isLastStack) {
|
|
isLastStack[0] = (i + 1 == size());
|
|
if (!isLastStack[0]) {
|
|
throw new Error("But we now stack greedily!");
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
}
|
|
if (runningSize != opl.sizeMinusDisambiguators())
|
|
throw new IllegalArgumentException("runningSize = " + runningSize + "; opl.sizeMinusDisambiguators = " + opl.sizeMinusDisambiguators() + "; opl (" + opl + ") is bad for this stack list (" + toString() + ")");
|
|
return false;
|
|
}
|
|
|
|
/** Returns legal Unicode corresponding to this tsheg bar. FIXME: which normalization form, if any? */
|
|
String getUnicode() {
|
|
StringBuffer u = new StringBuffer(size());
|
|
for (int i = 0; i < size(); i++) {
|
|
get(i).getUnicode(u);
|
|
}
|
|
return u.toString();
|
|
}
|
|
|
|
/** Returns the DuffCodes and errors corresponding to this stack
|
|
list. Each element of the array is a DuffCode or a String, the
|
|
latter if and only if the TMW font cannot represent the
|
|
corresponding stack in this list. Iff shortMessages is true,
|
|
the String elements will be shorter messages. */
|
|
Object[] getDuff(boolean shortMessages,
|
|
boolean noCorrespondingTMWGlyphIsError) {
|
|
ArrayList al = new ArrayList(size()*2); // rough estimate
|
|
int count = 0;
|
|
for (int i = 0; i < size(); i++) {
|
|
get(i).getDuff(al, shortMessages, noCorrespondingTMWGlyphIsError);
|
|
}
|
|
if (size() > 0 && al.size() == 0) {
|
|
throw new Error("But this stack list, " + this + ", contains " + size() + " stacks! How can it not have DuffCodes associated with it?");
|
|
}
|
|
return al.toArray();
|
|
}
|
|
}
|
|
|
|
/** A BoolTriple is used to convey the legality of a particular tsheg
 *  bar.  (FIXME: This class is misnamed.)
 *
 *  <p>Note that {@link #compareTo(Object)} orders instances by a
 *  coarse legality score only; this class does not override
 *  equals, so two instances that compare as 0 are not necessarily
 *  equal.
 *
 *  @author David Chandler */
class BoolTriple implements Comparable {

    /** The candidate type that marks a single Sanskrit grapheme
     *  cluster. */
    private static final String SINGLE_SANSKRIT_GC = "single-sanskrit-gc";

    /** candidateType is a {@link
        org.thdl.tib.text.TibTextUtils#getClassificationOfTshegBar(TGCList,StringBuffer,boolean)}
        concept.  You cannot derive isLegal() from it because {@link
        TStackList#isClearlyIllegal()} and more (think {BNA}) comes
        into play. */
    String candidateType;

    /** True if and only if the tsheg bar is a native Tibetan tsheg
        bar or is a single Sanskrit grapheme cluster.
        @see #isLegalButSanskrit() */
    boolean isLegal;

    /** True if and only if {@link #isLegal} is true and there may be
        an ACIP "A" vowel on the root stack. */
    boolean isLegalAndHasAVowelOnRoot;

    /** Creates a triple from its three components.
        @param isLegal whether the tsheg bar is legal
        @param isLegalAndHasAVowelOnRoot whether the tsheg bar is
        legal and may have an "A" vowel on the root stack
        @param candidateType the classification of the tsheg bar;
        may be null, in which case it is simply not Sanskrit
        @throws IllegalArgumentException if isLegal is false and yet
        either isLegalAndHasAVowelOnRoot is true or candidateType
        denotes a single Sanskrit grapheme cluster */
    BoolTriple(boolean isLegal,
               boolean isLegalAndHasAVowelOnRoot,
               String candidateType) {
        this.isLegal = isLegal;
        this.isLegalAndHasAVowelOnRoot = isLegalAndHasAVowelOnRoot;
        this.candidateType = candidateType;
        if (!isLegal && (isLegalButSanskrit() || isLegalAndHasAVowelOnRoot))
            throw new IllegalArgumentException();
    }

    /** Some subset of tsheg bars are legal but legal Sanskrit -- the
        single sanskrit stacks are this way, such as B+DE.  We treat
        such a thing as legal because B+DE is the perfect way to input
        such a thing.  But then, we treat B+DEB+DE as illegal, even
        though it too is perfect.  So we're inconsistent (LOW-PRIORITY
        FIXME), but you really have to watch what happens to
        coloration and warning messages if you change this.
        @return true if and only if candidateType is
        "single-sanskrit-gc" */
    boolean isLegalButSanskrit() {
        // Compare by value: reference comparison would silently
        // fail for any String that is not the interned literal.
        return SINGLE_SANSKRIT_GC.equals(candidateType);
    }

    /** The more legal and standard a tsheg bar is, the higher score
        it has.  The result is always between 0 and 10 inclusive
        given the constructor's invariant. */
    private int score() {
        int score = 0;
        if (isLegalAndHasAVowelOnRoot) {
            score += 5;
        }
        if (isLegal) {
            score += 5;
        }
        if (isLegalButSanskrit()) {
            score -= 3;
        }
        return score;
    }

    /** The "most legal" BoolTriple compares higher.  Native Tibetan
        beats Sanskrit; native tibetan with a vowel on the root stack
        beats native Tibetan without.
        @param o another BoolTriple
        @return the difference of the two scores
        @throws ClassCastException if o is not a BoolTriple */
    public int compareTo(Object o) {
        BoolTriple b = (BoolTriple)o;
        // Scores are tiny, so the subtraction cannot overflow.
        return score() - b.score();
    }
}
|