Added an unfinished ACIP->Tibetan converter. Once it works properly
for ACIP, it'll easily be made to work as a perfect EWTS Wylie->Tibetan converter. It has an extensive suite of tests for the existing functionality.
This commit is contained in:
parent
39e0435b6b
commit
e21d3774a9
14 changed files with 8709 additions and 21 deletions
|
@ -5,7 +5,7 @@
|
|||
|
||||
@(#)package.html
|
||||
|
||||
Copyright 2001-2002 Tibetan and Himalayan Digital Library
|
||||
Copyright 2001-2003 Tibetan and Himalayan Digital Library
|
||||
|
||||
This software is the confidential and proprietary information of
|
||||
the Tibetan and Himalayan Digital Library. You shall use such
|
||||
|
@ -18,12 +18,14 @@
|
|||
|
||||
Provides classes and methods for dealing with Tibetan text.
|
||||
<p>
|
||||
Designed for use with the Tibetan Computer
|
||||
Company's free cross-platform TibetanMachineWeb fonts, this package
|
||||
contains methods for getting the Extended Wylie
|
||||
correspondences for each TibetanMachineWeb glyph, and for
|
||||
convert back and forth between Extended
|
||||
Wylie and TibetanMachineWeb.
|
||||
Designed for use with the Tibetan Computer Company's free
|
||||
cross-platform TibetanMachineWeb fonts, this package contains methods
|
||||
for getting the Extended Wylie correspondences for each
|
||||
TibetanMachineWeb glyph, and for converting back and forth between
|
||||
Extended Wylie and TibetanMachineWeb. The TMW to Wylie conversion is
|
||||
perfect, but the Wylie to TMW is flawed, so use the code in package
|
||||
org.thdl.tib.text.ttt instead for serious work. The Wylie to TMW here
|
||||
is more like a keyboard than a real Wylie to TMW conversion.
|
||||
<p>
|
||||
This package provides a variety of ways to store TibetanMachineWeb data,
|
||||
and includes methods to aid programmers who want to convert from
|
||||
|
@ -34,5 +36,6 @@ keyboards. Four keyboards have been provided in this release,
|
|||
but users may also create their own keyboards.
|
||||
<h2>Related Documentation</h2>
|
||||
@see <a href="../input/package-summary.html">org.thdl.tib.input</a>
|
||||
@see <a href="ttt/package-summary.html">org.thdl.tib.text.ttt</a>
|
||||
</body>
|
||||
</html>
|
||||
|
|
207
source/org/thdl/tib/text/ttt/ACIPRules.java
Normal file
207
source/org/thdl/tib/text/ttt/ACIPRules.java
Normal file
|
@ -0,0 +1,207 @@
|
|||
/*
|
||||
The contents of this file are subject to the THDL Open Community License
|
||||
Version 1.0 (the "License"); you may not use this file except in compliance
|
||||
with the License. You may obtain a copy of the License on the THDL web site
|
||||
(http://www.thdl.org/).
|
||||
|
||||
Software distributed under the License is distributed on an "AS IS" basis,
|
||||
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
|
||||
License for the specific terms governing rights and limitations under the
|
||||
License.
|
||||
|
||||
The Initial Developer of this software is the Tibetan and Himalayan Digital
|
||||
Library (THDL). Portions created by the THDL are Copyright 2003 THDL.
|
||||
All Rights Reserved.
|
||||
|
||||
Contributor(s): ______________________________________.
|
||||
*/
|
||||
|
||||
package org.thdl.tib.text.ttt;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.HashMap;
|
||||
|
||||
/** Canonizes some facts regarding the ACIP transcription system.
|
||||
* @author David Chandler */
|
||||
class ACIPRules {
    /** Length, in characters, of the longest ACIP consonants ({Ksh}
     *  and {DZH}), which is three. */
    public static final int MAX_CONSONANT_LENGTH = 3;

    /** Length, in characters, of the longest ACIP "vowel" (e.g.,
     *  {'im:}), which is four. */
    public static final int MAX_VOWEL_LENGTH = 4;

    /** The ACIP base vowels and their EWTS equivalents.  Each base
     *  vowel may additionally be followed by {m}, {:}, or {m:}. */
    private static final String[][] baseVowels = new String[][] {
        // { ACIP, EWTS }:
        { "A", "a" },
        { "I", "i" },
        { "U", "u" },
        { "E", "e" },
        { "O", "o" },
        { "'I", "I" },
        { "'U", "U" },
        { "EE", "ai" },
        { "OO", "au" },
        { "i", "-i" },
        { "'i", "-I" },
        { "'A", "A" },
        { "'O", "Ao" },
        { "'E", "Ae" }
        // DLC I'm on my own with 'O and 'E, but GANG'O appears
        // and I wonder... so here are 'O and 'E.  It's
        // consistent with 'I and 'A and 'U, at least.
    };

    /** The ACIP consonants (without any vowel) and their EWTS
     *  equivalents.  This is the single source of truth for both
     *  {@link #isConsonant(String)} and {@link
     *  #getWylieForACIPConsonant(String)}, so the two can never
     *  disagree about what a consonant is. */
    private static final String[][] baseConsonants = new String[][] {
        // { ACIP, EWTS }:

        // oddball:
        { "V", "w" },

        // more oddballs:
        { "DH", "d+h" },
        { "BH", "b+h" },
        { "dH", "D+h" },
        { "DZH", "dz+h" }, // longest, MAX_CONSONANT_LENGTH characters
        { "Ksh", "k+Sh" }, // longest, MAX_CONSONANT_LENGTH characters
        { "GH", "g+h" },

        { "K", "k" },
        { "KH", "kh" },
        { "G", "g" },
        { "NG", "ng" },
        { "C", "c" },
        { "CH", "ch" },
        { "J", "j" },
        { "NY", "ny" },
        { "T", "t" },
        { "TH", "th" },
        { "D", "d" },
        { "N", "n" },
        { "P", "p" },
        { "PH", "ph" },
        { "B", "b" },
        { "M", "m" },
        { "TZ", "ts" },
        { "TS", "tsh" },
        { "DZ", "dz" },
        { "W", "w" },
        { "ZH", "zh" },
        { "Z", "z" },
        { "'", "'" },
        { "Y", "y" },
        { "R", "r" },
        { "L", "l" },
        { "SH", "sh" },
        { "S", "s" },
        { "H", "h" },
        { "A", "a" },
        { "t", "T" },
        { "th", "Th" },
        { "d", "D" },
        { "n", "N" },
        { "sh", "Sh" }
    };

    // The lookup tables below are built once, when the class is
    // initialized, in textual order; they must therefore stay below
    // the two arrays they are built from.

    /** Maps each ACIP "vowel" to its EWTS equivalent. */
    private static final HashMap acipVowel2wylie = buildVowelMap();

    /** For O(1) {@link #isVowel(String)} calls. */
    private static final HashSet acipVowels
        = new HashSet(acipVowel2wylie.keySet());

    /** Maps each ACIP consonant to its EWTS equivalent. */
    private static final HashMap acipConsonant2wylie = buildConsonantMap();

    /** For O(1) {@link #isConsonant(String)} calls. */
    private static final HashSet consonants
        = new HashSet(acipConsonant2wylie.keySet());

    /** Builds the ACIP-to-EWTS vowel map: every base vowel, plus the
     *  same vowel followed by {m}, by {:}, and by {m:}. */
    private static HashMap buildVowelMap() {
        HashMap map = new HashMap(baseVowels.length * 4);
        for (int i = 0; i < baseVowels.length; i++) {
            String acip = baseVowels[i][0];
            String ewts = baseVowels[i][1];
            map.put(acip, ewts);
            map.put(acip + 'm', ewts + 'M');
            map.put(acip + ':', ewts + 'H');
            map.put(acip + "m:", ewts + "MH");
            // DLC '\' for visarga? how shall we do \ the visarga? like a vowel or not?
        }
        return map;
    }

    /** Builds the ACIP-to-EWTS consonant map from {@link
     *  #baseConsonants}. */
    private static HashMap buildConsonantMap() {
        HashMap map = new HashMap(baseConsonants.length * 2);
        for (int i = 0; i < baseConsonants.length; i++) {
            map.put(baseConsonants[i][0], baseConsonants[i][1]);
        }
        return map;
    }

    /** Returns true if and only if s is an ACIP "vowel".  You can't
     *  just call this any time -- A is a consonant and a vowel in
     *  ACIP, so you have to call this in the right context.
     *  @param s the candidate ACIP vowel; null yields false
     *  @return true iff s is a base vowel optionally followed by
     *  {m}, {:}, or {m:} */
    public static boolean isVowel(String s) {
        return acipVowels.contains(s);
    }

    /** Returns true if and only if acip is an ACIP consonant (without
     *  a vowel).  For example, returns true for "K", but not for
     *  "KA" or "X".
     *  @param acip the candidate ACIP consonant; null yields false
     *  @return true iff acip is an ACIP consonant */
    public static boolean isConsonant(String acip) {
        return consonants.contains(acip);
    }

    /** Returns the EWTS corresponding to the given ACIP consonant
     *  (without the "A" vowel).  Returns null if there is no such
     *  EWTS.
     *  @param acip the ACIP consonant
     *  @return the EWTS equivalent, or null if acip is not an ACIP
     *  consonant */
    static final String getWylieForACIPConsonant(String acip) {
        return (String)acipConsonant2wylie.get(acip);
    }

    /** Returns the EWTS corresponding to the given ACIP "vowel".
     *  Returns null if there is no such EWTS.
     *  @param acip the ACIP "vowel", possibly ending in {m}, {:}, or
     *  {m:}
     *  @return the EWTS equivalent, or null if acip is not an ACIP
     *  "vowel" */
    static final String getWylieForACIPVowel(String acip) {
        return (String)acipVowel2wylie.get(acip);
    }
}
|
6885
source/org/thdl/tib/text/ttt/PackageTest.java
Normal file
6885
source/org/thdl/tib/text/ttt/PackageTest.java
Normal file
File diff suppressed because it is too large
Load diff
100
source/org/thdl/tib/text/ttt/ParseIterator.java
Normal file
100
source/org/thdl/tib/text/ttt/ParseIterator.java
Normal file
|
@ -0,0 +1,100 @@
|
|||
/*
|
||||
The contents of this file are subject to the THDL Open Community License
|
||||
Version 1.0 (the "License"); you may not use this file except in compliance
|
||||
with the License. You may obtain a copy of the License on the THDL web site
|
||||
(http://www.thdl.org/).
|
||||
|
||||
Software distributed under the License is distributed on an "AS IS" basis,
|
||||
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
|
||||
License for the specific terms governing rights and limitations under the
|
||||
License.
|
||||
|
||||
The Initial Developer of this software is the Tibetan and Himalayan Digital
|
||||
Library (THDL). Portions created by the THDL are Copyright 2003 THDL.
|
||||
All Rights Reserved.
|
||||
|
||||
Contributor(s): ______________________________________.
|
||||
*/
|
||||
|
||||
package org.thdl.tib.text.ttt;
|
||||
|
||||
import java.util.ListIterator;
|
||||
import java.util.NoSuchElementException;
|
||||
import java.util.ArrayList;
|
||||
|
||||
/** An object that can iterate over an {@link #TParseTree}.
|
||||
*
|
||||
* @author David Chandler */
|
||||
class ParseIterator {
|
||||
private ArrayList al = null;
|
||||
private int sz;
|
||||
private ListIterator[] iterators;
|
||||
private boolean first = true;
|
||||
private boolean hasNextParse = true;
|
||||
/** Constructs a new ParseIterator that iterates over a list of
|
||||
* TStackListLists. */
|
||||
ParseIterator(ArrayList al) {
|
||||
this.al = al;
|
||||
sz = al.size();
|
||||
iterators = new ListIterator[sz];
|
||||
hasNextParse = false;
|
||||
for (int i = 0; i < sz; i++) {
|
||||
iterators[i] = ((TStackListList)al.get(i)).listIterator();
|
||||
if (iterators[i].hasNext())
|
||||
hasNextParse = true;
|
||||
}
|
||||
}
|
||||
|
||||
/** Returns true if and only if there is another parse
|
||||
* available. */
|
||||
boolean hasNext() {
|
||||
return hasNextParse;
|
||||
}
|
||||
|
||||
/** Returns the next available parse. */
|
||||
TStackList next() {
|
||||
if (!hasNextParse)
|
||||
throw new NoSuchElementException("no parses left");
|
||||
if (first) {
|
||||
first = false;
|
||||
TStackList x = new TStackList();
|
||||
for (int i = 0; i < sz; i++) {
|
||||
TStackList nextSL = (TStackList)iterators[i].next();
|
||||
x.addAll(nextSL);
|
||||
}
|
||||
|
||||
// The next guy is found by taking the previous item of
|
||||
// each iterator.
|
||||
hasNextParse = false;
|
||||
for (int i = sz - 1; i >= 0; i--) {
|
||||
if (iterators[i].hasNext()) {
|
||||
iterators[i].next();
|
||||
hasNextParse = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
return x;
|
||||
}
|
||||
|
||||
// Up the rightmost iterator you can. If you can, reset all
|
||||
// guys to the right of it. If you can't, we're done.
|
||||
TStackList x = new TStackList(sz);
|
||||
hasNextParse = false;
|
||||
for (int i = sz - 1; i >= 0; i--) {
|
||||
TStackList prevSL = (TStackList)iterators[i].previous();
|
||||
x.addAll(0, prevSL);
|
||||
iterators[i].next();
|
||||
if (!hasNextParse && iterators[i].hasNext()) {
|
||||
hasNextParse = true;
|
||||
iterators[i].next();
|
||||
// Reset all iterators to the right of i.
|
||||
for (int j = i + 1; j < sz; j++) {
|
||||
while (iterators[j].hasPrevious())
|
||||
iterators[j].previous();
|
||||
iterators[j].next();
|
||||
}
|
||||
}
|
||||
}
|
||||
return x;
|
||||
}
|
||||
}
|
170
source/org/thdl/tib/text/ttt/TPair.java
Normal file
170
source/org/thdl/tib/text/ttt/TPair.java
Normal file
|
@ -0,0 +1,170 @@
|
|||
/*
|
||||
The contents of this file are subject to the THDL Open Community License
|
||||
Version 1.0 (the "License"); you may not use this file except in compliance
|
||||
with the License. You may obtain a copy of the License on the THDL web site
|
||||
(http://www.thdl.org/).
|
||||
|
||||
Software distributed under the License is distributed on an "AS IS" basis,
|
||||
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
|
||||
License for the specific terms governing rights and limitations under the
|
||||
License.
|
||||
|
||||
The Initial Developer of this software is the Tibetan and Himalayan Digital
|
||||
Library (THDL). Portions created by the THDL are Copyright 2003 THDL.
|
||||
All Rights Reserved.
|
||||
|
||||
Contributor(s): ______________________________________.
|
||||
*/
|
||||
|
||||
package org.thdl.tib.text.ttt;
|
||||
|
||||
import org.thdl.util.ThdlDebug;
|
||||
|
||||
/** An ordered pair used in ACIP-to-TMW conversion. The left side is
|
||||
* the consonant or empty; the right side is the vowel, '+', or '-'.
|
||||
* @author David Chandler */
|
||||
/* DLC BIG FIXME: make this package work for EWTS, not just ACIP. */
|
||||
class TPair {
|
||||
/** The left side, or null if there is no left side. That is, the
|
||||
* non-vowel, non-'m', non-':', non-'-', non-'+' guy. */
|
||||
private String l;
|
||||
String getLeft() {
|
||||
ThdlDebug.verify(!"".equals(l));
|
||||
return l;
|
||||
}
|
||||
|
||||
/** The right side. That is, the vowel, with 'm' or ':' "vowel"
|
||||
* after it if appropriate, or "-" (disambiguator), or "+"
|
||||
* (stacking), or null otherwise. */
|
||||
private String r;
|
||||
String getRight() {
|
||||
ThdlDebug.verify(!"".equals(r));
|
||||
return r;
|
||||
}
|
||||
|
||||
/** Constructs a new TPair with left side l and right side r.
|
||||
* Use null or the empty string to represent an absence. */
|
||||
TPair(String l, String r) {
|
||||
// Normalize:
|
||||
if (null != l && l.equals("")) l = null;
|
||||
if (null != r && r.equals("")) r = null;
|
||||
|
||||
this.l = l;
|
||||
this.r = r;
|
||||
}
|
||||
|
||||
/** Returns a nice String representation. Returns "(D . E)" for
|
||||
* ACIP {DE}, e.g., and (l . r) in general. */
|
||||
public String toString() {
|
||||
return "("
|
||||
+ ((null == l) ? "" : l) + " . "
|
||||
+ ((null == r) ? "" : r) + ")";
|
||||
}
|
||||
|
||||
/** Returns the number of ACIP characters that make up this
|
||||
* TPair. */
|
||||
int size() {
|
||||
return (((l == null) ? 0 : l.length())
|
||||
+ ((r == null) ? 0 : r.length()));
|
||||
}
|
||||
|
||||
/** Returns an TPair that is like this one except that it is
|
||||
* missing N characters. The characters are taken from r, the
|
||||
* right side, first and from l, the left side, second.
|
||||
* @throw IllegalArgumentException if N is out of range */
|
||||
TPair minusNRightmostACIPCharacters(int N)
|
||||
throws IllegalArgumentException
|
||||
{
|
||||
int sz;
|
||||
String newL = l, newR = r;
|
||||
if (N > size())
|
||||
throw new IllegalArgumentException("Don't have that many to remove.");
|
||||
if (N < 1)
|
||||
throw new IllegalArgumentException("You should't call this if you don't want to remove any.");
|
||||
if (null != r && (sz = r.length()) > 0) {
|
||||
int min = Math.min(sz, N);
|
||||
newR = r.substring(0, sz - min);
|
||||
N -= min;
|
||||
}
|
||||
if (N > 0) {
|
||||
sz = l.length();
|
||||
newL = l.substring(0, sz - N);
|
||||
}
|
||||
return new TPair(newL, newR);
|
||||
}
|
||||
|
||||
/** Returns true if and only if this is nonempty and is l, if
|
||||
* present, is a legal ACIP consonant, and is r, if present, is a
|
||||
* legal ACIP vowel. */
|
||||
boolean isLegal() {
|
||||
if (size() < 1)
|
||||
return false;
|
||||
if (null != l && !ACIPRules.isConsonant(l))
|
||||
return false;
|
||||
if (null != r && !ACIPRules.isVowel(l))
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
/** Returns true if and only if this pair could be a Tibetan
|
||||
* prefix. */
|
||||
boolean isPrefix() {
|
||||
return (null != l
|
||||
&& ((null == r || "".equals(r))
|
||||
|| "-".equals(r)
|
||||
|| "A".equals(r)) // DLC though check for BASKYABS and warn because BSKYABS is more common
|
||||
&& ("'".equals(l)
|
||||
|| "M".equals(l)
|
||||
|| "B".equals(l)
|
||||
|| "D".equals(l)
|
||||
|| "G".equals(l)));
|
||||
}
|
||||
|
||||
/** Returns true if and only if this pair is merely a
|
||||
* disambiguator. */
|
||||
boolean isDisambiguator() {
|
||||
return ("-".equals(r) && getLeft() == null);
|
||||
}
|
||||
|
||||
/** Returns an TPair that is like this pair except that it has
|
||||
* a "+" on the right if this pair is empty on the right and is
|
||||
* empty on the right if this pair has a disambiguator (i.e., a
|
||||
* '-') on the right. May return itself (but never mutates this
|
||||
* instance). */
|
||||
TPair insideStack() {
|
||||
if (null == getRight())
|
||||
return new TPair(getLeft(), "+");
|
||||
else if ("-".equals(getRight()))
|
||||
return new TPair(getLeft(), null);
|
||||
else
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Returns true if this pair contains a Tibetan number. */
|
||||
boolean isNumeric() {
|
||||
char ch;
|
||||
return (l != null && l.length() == 1 && (ch = l.charAt(0)) >= '0' && ch <= '9');
|
||||
}
|
||||
|
||||
/** Returns the EWTS Wylie that corresponds to this pair. Untested. */
|
||||
String getWylie() {
|
||||
String leftWylie = null;
|
||||
if (getLeft() != null) {
|
||||
leftWylie = ACIPRules.getWylieForACIPConsonant(getLeft());
|
||||
if (leftWylie == null) {
|
||||
if (isNumeric())
|
||||
leftWylie = getLeft();
|
||||
}
|
||||
}
|
||||
String rightWylie = null;
|
||||
if ("-".equals(getRight()))
|
||||
rightWylie = ".";
|
||||
else if ("+".equals(getRight()))
|
||||
rightWylie = "+";
|
||||
else if (getRight() != null)
|
||||
rightWylie = ACIPRules.getWylieForACIPVowel(getRight());
|
||||
if (null == leftWylie) leftWylie = "";
|
||||
if (null == rightWylie) rightWylie = "";
|
||||
return leftWylie + rightWylie;
|
||||
}
|
||||
}
|
579
source/org/thdl/tib/text/ttt/TPairList.java
Normal file
579
source/org/thdl/tib/text/ttt/TPairList.java
Normal file
|
@ -0,0 +1,579 @@
|
|||
/*
|
||||
The contents of this file are subject to the THDL Open Community License
|
||||
Version 1.0 (the "License"); you may not use this file except in compliance
|
||||
with the License. You may obtain a copy of the License on the THDL web site
|
||||
(http://www.thdl.org/).
|
||||
|
||||
Software distributed under the License is distributed on an "AS IS" basis,
|
||||
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
|
||||
License for the specific terms governing rights and limitations under the
|
||||
License.
|
||||
|
||||
The Initial Developer of this software is the Tibetan and Himalayan Digital
|
||||
Library (THDL). Portions created by the THDL are Copyright 2003 THDL.
|
||||
All Rights Reserved.
|
||||
|
||||
Contributor(s): ______________________________________.
|
||||
*/
|
||||
|
||||
package org.thdl.tib.text.ttt;
|
||||
|
||||
import org.thdl.tib.text.TibetanMachineWeb;
|
||||
import org.thdl.tib.text.TGCPair;
|
||||
import org.thdl.util.ThdlDebug;
|
||||
|
||||
import java.util.ArrayList;
|
||||
|
||||
/** A list of {@link TPair TPairs}, typically corresponding to
|
||||
* one tsheg bar. <i>l</i>' in the design doc is an TPairList.
|
||||
*
|
||||
* @author David Chandler */
|
||||
class TPairList {
|
||||
/** FIXME: change me and see if performance improves. */
|
||||
private static final int INITIAL_SIZE = 1;
|
||||
|
||||
/** a fast, non-thread-safe, random-access list implementation: */
|
||||
private ArrayList al;
|
||||
|
||||
/** Creates a new list containing just p. */
|
||||
public TPairList(TPair p) {
|
||||
al = new ArrayList(1);
|
||||
add(p);
|
||||
}
|
||||
|
||||
/** Creates an empty list. */
|
||||
public TPairList() {
|
||||
al = new ArrayList(INITIAL_SIZE);
|
||||
}
|
||||
|
||||
/** Creates an empty list with the capacity to hold N items. */
|
||||
public TPairList(int N) {
|
||||
al = new ArrayList(N);
|
||||
}
|
||||
|
||||
/** Returns the ith pair in this list. */
|
||||
public TPair get(int i) { return (TPair)al.get(i); }
|
||||
|
||||
/** Adds p to the end of this list. */
|
||||
public void add(TPair p) {
|
||||
if (p == null || (p.getLeft() == null && p.getRight() == null))
|
||||
throw new IllegalArgumentException("p is weird");
|
||||
al.add(p);
|
||||
}
|
||||
|
||||
/** Prepends p to the current list of TPairs. */
|
||||
public void prepend(TPair p) {
|
||||
al.add(0, p);
|
||||
}
|
||||
|
||||
/** Returns the number of TPairs in this list. */
|
||||
public int size() { return al.size(); }
|
||||
|
||||
/** Returns a human-readable representation.
|
||||
* @return something like [(R . ), (D . O)] */
|
||||
public String toString2() {
|
||||
return al.toString();
|
||||
}
|
||||
|
||||
/** Returns a human-readable representation like {G}{YA} or
|
||||
* {G-}{YA}. */
|
||||
public String toString() {
|
||||
int sz = size();
|
||||
StringBuffer b = new StringBuffer();
|
||||
for (int i = 0; i < sz; i++) {
|
||||
b.append('{');
|
||||
if (null != get(i).getLeft())
|
||||
b.append(get(i).getLeft());
|
||||
if (null != get(i).getRight())
|
||||
b.append(get(i).getRight());
|
||||
b.append('}');
|
||||
}
|
||||
return b.toString();
|
||||
}
|
||||
|
||||
/** Returns the ACIP corresponding to this TPairList. It will
|
||||
* be as ambiguous as the input. It may have more disambiguators
|
||||
* than the original, such as in the case of the ACIP {1234}. */
|
||||
String recoverACIP() {
|
||||
StringBuffer original = new StringBuffer();
|
||||
int sz = size();
|
||||
for (int i = 0; i < sz; i++) {
|
||||
TPair p = get(i);
|
||||
if (p.getLeft() != null)
|
||||
original.append(p.getLeft());
|
||||
if (p.getRight() != null)
|
||||
original.append(p.getRight());
|
||||
}
|
||||
return original.toString();
|
||||
}
|
||||
|
||||
/** Returns true if this list contains ( . <vowel>) or (A . ),
|
||||
* which are two simple errors you encounter if you interpret DAA
|
||||
* or TAA or DAI or DAE the wrong way. */
|
||||
boolean hasSimpleError() {
|
||||
int sz = size();
|
||||
for (int i = 0; i < sz; i++) {
|
||||
TPair p = get(i);
|
||||
if ((null == p.getLeft() && !"-".equals(p.getRight()))
|
||||
|| ("A".equals(p.getLeft()) && null == p.getRight()))
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// DLC [THE FOLLOWIN... appears, so [#comment] or [comment] is possible. [BLANK PAGE] [MISSING PAGE] [FIRST] [SECOND] [DD1] [DD2] [46A.2] [THE ... [FOLLOWING... [PAGE ... [THESE ... @[7B] [SW: OK] [A FIRST... [ADDENDUM... [END ... [Additional [Some [Note [MISSING [DDD] [INCOMPLETE [LINE [DATA
|
||||
// [A pair of ... which is part of the text! S0200A.ACE
|
||||
// [D] is a correction, eh?
|
||||
|
||||
|
||||
// DLC BDE 'BA' ZHIG RGYUN DU BSTEN, ,YENGS KYANG THUB NA [GNYEN PO,)
|
||||
// 'BYONGS [BLO,) S0375M.ACT
|
||||
|
||||
|
||||
// S0011N.ACT contains [SMON TSIG 'DI'I RTZOM MING MI GSAL,], why the brackets? IS all this really a correction? DLC?
|
||||
// DLC: what are () for?
|
||||
|
||||
/** Finds errors so simple that they can be detected without using
|
||||
* the rules of Tibetan spelling (i.e., tsheg bar syntax).
|
||||
* Returns an error message, or null if there is no error that
|
||||
* you can find without the help of tsheg bar syntax rules. */
|
||||
// DLC RENAME
|
||||
// DLC FIXME: 9BLTA is an error, numbers are all or nothing
|
||||
String getACIPError() {
|
||||
int sz = size();
|
||||
if (0 == sz)
|
||||
return "Warning, empty tsheg bar found while converting from ACIP!";
|
||||
boolean first = true;
|
||||
StringBuffer rv = null;
|
||||
boolean mustBeEntirelyNumeric = get(0).isNumeric();
|
||||
for (int i = 0; i < sz; i++) {
|
||||
TPair p = get(i);
|
||||
if (mustBeEntirelyNumeric != p.isNumeric())
|
||||
return "Cannot convert ACIP " + recoverACIP() + " because it contains a number but also a non-number.";
|
||||
|
||||
if ((i == 0 && "V".equals(p.getLeft()))
|
||||
|| (i > 0 && "V".equals(p.getLeft())
|
||||
&& (null != get(i - 1).getRight()
|
||||
&& !"+".equals(get(i - 1).getRight())))) {
|
||||
if (first) {
|
||||
first = false;
|
||||
rv = new StringBuffer("Cannot convert ACIP ");
|
||||
rv.append(recoverACIP());
|
||||
rv.append(" because {V}, wa-zur, appears without being subscribed to a consonant.");
|
||||
} else {
|
||||
rv.append("; also, {V}, wa-zur, appears without being subscribed to a consonant");
|
||||
}
|
||||
} else if ("A".equals(p.getLeft()) && (null == p.getRight() || "".equals(p.getRight()))) {
|
||||
if (first) {
|
||||
first = false;
|
||||
rv = new StringBuffer("Cannot convert ACIP ");
|
||||
rv.append(recoverACIP());
|
||||
rv.append(" because we would be required to assume that {A} is a consonant, when it is not clear if it is a consonant or a vowel.");
|
||||
} else {
|
||||
rv.append("; also, we would be required to assume that {A} is a consonant, when it is not clear if it is a consonant or a vowel.");
|
||||
}
|
||||
} else if ((null == p.getLeft() && !"-".equals(p.getRight()))
|
||||
|| (null != p.getLeft()
|
||||
&& !ACIPRules.isConsonant(p.getLeft())
|
||||
&& !p.isNumeric())) {
|
||||
if (first) {
|
||||
first = false;
|
||||
rv = new StringBuffer("Cannot convert ACIP ");
|
||||
rv.append(recoverACIP());
|
||||
rv.append(" because ");
|
||||
if (null == p.getLeft()) {
|
||||
rv.append(p.getRight());
|
||||
rv.append(" is a \"vowel\" without an associated consonant");
|
||||
} else {
|
||||
rv.append(p.getLeft());
|
||||
rv.append(" is not an ACIP consonant");
|
||||
}
|
||||
} else {
|
||||
if (null == p.getLeft()) {
|
||||
rv.append("; also, ");
|
||||
rv.append(p.getRight());
|
||||
rv.append(" is an ACIP \"vowel\" without an associated consonant");
|
||||
} else {
|
||||
rv.append("; also, ");
|
||||
rv.append(p.getLeft());
|
||||
rv.append(" is not an ACIP consonant");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if ("+".equals(get(sz - 1).getRight())) {
|
||||
if (first) {
|
||||
first = false;
|
||||
rv = new StringBuffer("Cannot convert ACIP ");
|
||||
rv.append(recoverACIP());
|
||||
rv.append(" because it ends with a {+}.");
|
||||
} else {
|
||||
rv.append("; also, it ends with a {+}.");
|
||||
}
|
||||
}
|
||||
|
||||
// DLC really this is a warning, not an error:
|
||||
if ("-".equals(get(sz - 1).getRight())) {
|
||||
if (first) {
|
||||
first = false;
|
||||
rv = new StringBuffer("Cannot convert ACIP ");
|
||||
rv.append(recoverACIP());
|
||||
rv.append(" because it ends with a {-}.");
|
||||
} else {
|
||||
rv.append("; also, it ends with a {-}.");
|
||||
}
|
||||
}
|
||||
|
||||
return (rv == null) ? null : rv.toString();
|
||||
}
|
||||
|
||||
/** Returns true if and only if either x is an TPairList object
|
||||
* representing the same TPairs in the same order or x is a
|
||||
* String that is equals to the result of {@link #toString()}. */
|
||||
public boolean equals(Object x) {
|
||||
if (x instanceof TPairList) {
|
||||
return al.equals(((TPairList)x).al);
|
||||
} else if (x instanceof String) {
|
||||
return toString().equals(x) || toString2().equals(x);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/** Returns true if and only if this list is empty. */
|
||||
public boolean isEmpty() { return al.isEmpty(); }
|
||||
|
||||
/** Returns a hashCode appropriate for use with our {@link
|
||||
* #equals(Object)} method. */
|
||||
public int hashCode() { return al.hashCode(); }
|
||||
|
||||
private static final int STOP_STACK = 0;
|
||||
private static final int KEEP_STACKING = 1;
|
||||
private static final int ALWAYS_KEEP_STACKING = 2;
|
||||
private static final int ALWAYS_STOP_STACKING = 3;
|
||||
|
||||
// DLC TEST: BA'I has exactly two syntactically legal parses but just one TStackList.
|
||||
|
||||
/** Returns a set (as as ArrayList) of all possible
|
||||
* TStackLists. Uses knowledge of Tibetan spelling rules
|
||||
* (i.e., tsheg bar syntax) to do so. If this list of pairs has
|
||||
* something clearly illegal in it, or is empty, or is merely a
|
||||
* list of disambiguators etc., then this returns null. */
|
||||
public TParseTree getParseTree() {
|
||||
TParseTree pt = new TParseTree();
|
||||
int sz = size();
|
||||
int firstPair = 0;
|
||||
for (int i = 0; i < sz; i++) {
|
||||
|
||||
// We treat [(B . ), (G . +), (K . ), (T . A)] as if it
|
||||
// could be {B+G+K+T} or {B}{G+K}{T} or {B+G+K}{T} or
|
||||
// {B}{G+K+T} (modulo stack legality); we're conservative.
|
||||
// (Though some stacks won't be legal.)
|
||||
|
||||
|
||||
if (ddebug) System.out.println("i is " + i);
|
||||
TPair p = get(i);
|
||||
if (p.getRight() == null && firstPair + 1 < sz) {
|
||||
// Here's the ambiguity. Let's fill up sl. (B . ) (G
|
||||
// . +) (K . A) could be {B+G+KA} or {BA}{G+KA}, so we
|
||||
// go until we hit a vowel and then break into
|
||||
// TPairLists.
|
||||
int start = firstPair;
|
||||
int blanks[] = new int[sz - start]; // we may not use all of this.
|
||||
int j;
|
||||
for (j = start; j < sz; j++) {
|
||||
TPair pj = get(j);
|
||||
boolean isBlank;
|
||||
if (ddebug) System.out.println("right guy is " + pj.getRight());
|
||||
if (pj.isDisambiguator())
|
||||
blanks[j-start] = ALWAYS_STOP_STACKING;
|
||||
else {
|
||||
if (!(isBlank = (pj.getRight() == null)) && !"+".equals(pj.getRight())) {
|
||||
if (ddebug) System.out.println("breaker breaker at j=" + j);
|
||||
break;
|
||||
}
|
||||
blanks[j-start] = isBlank ? STOP_STACK : ALWAYS_KEEP_STACKING;
|
||||
}
|
||||
}
|
||||
if (j >= sz) j = sz - 1;
|
||||
|
||||
blanks[j-start] = ALWAYS_STOP_STACKING;
|
||||
|
||||
// get(j) [corresponding to blanks[j-i]] is
|
||||
// the last pair in the ambiguous stretch; get(i)
|
||||
// [corresponding to blanks[0]] is the first.
|
||||
|
||||
// We'll end up doing 2**(j-i+1) (i.e., (1 <<
|
||||
// (j-i+1))) iterations. If that's going to be too
|
||||
// many, let's just say there's no legal parse. FIXME:
|
||||
// give a nice error message in this case.
|
||||
if (ddebug) System.out.println("ddebug: we're going to do 2^" + (j-i+1) + " [or " + (1 << (j-i+1)) + "] wacky iterations!");
|
||||
if ((j-i+1) > 13) // if you don't use 13, then change PackageTest.testSlowestTshegBar().
|
||||
return new TParseTree();
|
||||
|
||||
boolean keepGoing = true;
|
||||
TStackListList sll = new TStackListList();
|
||||
do {
|
||||
// Add the stack list currently specified by
|
||||
// blanks if all the stacks in it are legal.
|
||||
// DLC DELETE {
|
||||
// ArrayList x = new ArrayList((j-start+1));
|
||||
// for (int ii = 0; ii < (j-start+1); ii++)
|
||||
// x.add(new Integer(blanks[ii]));
|
||||
// }
|
||||
TStackList sl = new TStackList(sz - start);
|
||||
boolean illegal = false;
|
||||
TPairList currentStack = new TPairList();
|
||||
for (int k = 0; k < j-start+1; k++) {
|
||||
TPair pk = get(start + k);
|
||||
if (!pk.isDisambiguator()) {
|
||||
currentStack.add(pk.insideStack());
|
||||
if (blanks[k] == STOP_STACK) {
|
||||
if (currentStack.isLegalTibetanOrSanskritStack())
|
||||
sl.add(currentStack.asStack());
|
||||
else {
|
||||
illegal = true;
|
||||
break;
|
||||
}
|
||||
currentStack = new TPairList();
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!illegal && !currentStack.isEmpty()) {
|
||||
if (currentStack.isLegalTibetanOrSanskritStack()) {
|
||||
TPairList stack = currentStack.asStack();
|
||||
if (ddebug) System.out.println("adding currentStack " + stack + " to sl " + sl);
|
||||
sl.add(stack);
|
||||
} else {
|
||||
illegal = true;
|
||||
}
|
||||
}
|
||||
if (!illegal) {
|
||||
if (ddebug) System.out.println("adding sl " + sl + " to sll " + sll);
|
||||
sll.add(sl);
|
||||
}
|
||||
|
||||
// Update blanks. Think of this as doing base 2
|
||||
// arithmetic where STOP_STACK is zero,
|
||||
// KEEP_STACKING is one, and ALWAYS_KEEP_STACKING
|
||||
// and ALWAYS_STOP_STACKING are digits we cannot
|
||||
// modify. We'll end up doing 2^M iterations,
|
||||
// where M is the number of fields in blanks that
|
||||
// are not equal to ALWAYS_KEEP_STACKING or
|
||||
// ALWAYS_STOP_STACKING.
|
||||
keepGoing = false;
|
||||
for (int k = j-start; k >= 0; k--) {
|
||||
if (blanks[k] == STOP_STACK) {
|
||||
keepGoing = true;
|
||||
blanks[k] = KEEP_STACKING;
|
||||
// reset all digits to the right of k to
|
||||
// "zero":
|
||||
for (int m = k + 1; m < j-start+1; m++) {
|
||||
if (blanks[m] == KEEP_STACKING)
|
||||
blanks[m] = STOP_STACK;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
} while (keepGoing);
|
||||
if (sll.isEmpty())
|
||||
return null; // STXAL or shT+ZNAGN, e.g.
|
||||
else {
|
||||
if (ddebug) System.out.println("adding sll " + sll + " to parse tree " + pt);
|
||||
pt.add(sll);
|
||||
}
|
||||
|
||||
if (ddebug) System.out.println("i is " + i + " and j is " + j + " and we are resetting so that i==j+1 next time.");
|
||||
i = j;
|
||||
firstPair = j + 1;
|
||||
} else if ("+".equals(p.getRight())) {
|
||||
// Keep firstPair where it is.
|
||||
} else {
|
||||
// Add all pairs in the range [firstPair, i]. Some
|
||||
// pairs are stacks all by themselves, some pairs have
|
||||
// '+' on the right and are thus just part of a stack.
|
||||
// We'll add a whole number of stacks, though.
|
||||
|
||||
// this is initialized to hold the max we might use:
|
||||
TStackListList sll
|
||||
= new TStackListList(i - firstPair + 1);
|
||||
|
||||
TPairList currentStack = new TPairList();
|
||||
for (int j = firstPair; j <= i; j++) {
|
||||
TPair pj = get(j);
|
||||
if (!pj.isDisambiguator()) {
|
||||
currentStack.add(pj.insideStack());
|
||||
if (!"+".equals(pj.getRight())) {
|
||||
if (currentStack.isLegalTibetanOrSanskritStack())
|
||||
sll.add(new TStackList(currentStack.asStack()));
|
||||
else {
|
||||
return null;
|
||||
}
|
||||
currentStack = new TPairList();
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!currentStack.isEmpty())
|
||||
throw new Error("how can this happen? currentStack is " + currentStack);
|
||||
|
||||
if (!sll.isEmpty()) {
|
||||
if (ddebug) System.out.println("adding sll " + sll + " to parse tree " + pt);
|
||||
pt.add(sll);
|
||||
firstPair = i + 1;
|
||||
} // else you probably have {G--YA} or something as
|
||||
// your tsheg bar.
|
||||
}
|
||||
}
|
||||
return pt;
|
||||
}
|
||||
|
||||
/** Returns true if and only if this list of TPairs can be
|
||||
* interpreted as a legal Tibetan stack or a legal Tibetanized
|
||||
* Sanskrit stack. This is private because a precondition is
|
||||
* that no vowels or disambiguators appear except possibly in the
|
||||
* final pair. */
|
||||
private boolean isLegalTibetanOrSanskritStack() {
|
||||
StringBuffer tibetan = new StringBuffer();
|
||||
StringBuffer sanskrit = new StringBuffer();
|
||||
int sz = size();
|
||||
|
||||
// Special case because otherwise wa-zur alone would be seen
|
||||
// as legal.
|
||||
if (sz == 1 && "V".equals(get(0).getLeft()))
|
||||
return false;
|
||||
|
||||
for (int i = 0; i < sz; i++) {
|
||||
TPair p = get(i);
|
||||
String ewts_form
|
||||
= ACIPRules.getWylieForACIPConsonant(p.getLeft());
|
||||
if (null == ewts_form) {
|
||||
if (p.isNumeric())
|
||||
ewts_form = p.getLeft();
|
||||
}
|
||||
if (null == ewts_form) {
|
||||
if (ddebug) System.out.println("testing " + toString2() + " for legality said false. numeric?" + p.isNumeric() + "[1]");
|
||||
return false;
|
||||
}
|
||||
tibetan.append(ewts_form);
|
||||
sanskrit.append(ewts_form);
|
||||
if (i + 1 < sz) {
|
||||
tibetan.append('-');
|
||||
sanskrit.append('+');
|
||||
}
|
||||
}
|
||||
boolean ans =
|
||||
(TibetanMachineWeb.hasGlyph(tibetan.toString())
|
||||
|| TibetanMachineWeb.hasGlyph(sanskrit.toString()));
|
||||
if (ddebug) System.out.println("testing " + toString2() + " for legality said " + ans + " [2]; san is " + sanskrit + " tib is " + tibetan + ".");
|
||||
return ans;
|
||||
}
|
||||
/** Debugging switch: when true, the statements in this class that
 *  are guarded by <code>if (ddebug)</code> print trace output to
 *  System.out.  It is a compile-time constant, currently off. */
private static final boolean ddebug = false;
|
||||
|
||||
/** Mutates this TPairList object such that the last pair is
|
||||
* empty or is a vowel, but is never the stacking operator ('+')
|
||||
* or a disambiguator (i.e., a '-' on the right).
|
||||
* @return this instance */
|
||||
private TPairList asStack() {
|
||||
if (!isEmpty()) {
|
||||
TPair lastPair = get(size() - 1);
|
||||
if ("+".equals(lastPair.getRight()))
|
||||
al.set(size() - 1, new TPair(lastPair.getLeft(), null));
|
||||
else if ("-".equals(lastPair.getRight()))
|
||||
al.set(size() - 1, new TPair(lastPair.getLeft(), null));
|
||||
}
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Adds the TGCPairs corresponding to this list to the end of
|
||||
* pl. Some TPairs correspond to more than one TGCPair
|
||||
* ({AA:}); some TGCPairs correspond to more than one TPair
|
||||
* ({G+YA}). To keep track, indexList will be appended to in
|
||||
* lockstep with pl. index (wrapped as an {@link
|
||||
* java.lang#Integer}) will be appended to indexList once each
|
||||
* time we append to pl. This assumes that this TPairList
|
||||
* corresponds to exactly one Tibetan grapheme cluster (i.e.,
|
||||
* stack). Note that U+0F7F (ACIP {:}) is part of a stack, not a
|
||||
* stack all on its own. */
|
||||
void populateWithTGCPairs(ArrayList pl, ArrayList indexList, int index) {
|
||||
int sz = size();
|
||||
if (sz == 0) {
|
||||
return;
|
||||
} else {
|
||||
// drop the disambiguator, if there is one.
|
||||
|
||||
boolean isNumeric = false;
|
||||
StringBuffer lWylie = new StringBuffer();
|
||||
int i;
|
||||
// All pairs but the last:
|
||||
for (i = 0; i + 1 < sz; i++) {
|
||||
lWylie.append(get(i).getWylie());
|
||||
if (get(i).isNumeric())
|
||||
isNumeric = true;
|
||||
}
|
||||
|
||||
// The last pair:
|
||||
TPair p = get(i);
|
||||
ThdlDebug.verify(!"+".equals(p.getRight()));
|
||||
int where;
|
||||
boolean add_U0F7F = false;
|
||||
if (p.getRight() != null
|
||||
&& (where = p.getRight().indexOf(':')) >= 0) {
|
||||
// this ':' guy is his own TGCPair.
|
||||
add_U0F7F = true;
|
||||
StringBuffer rr = new StringBuffer(p.getRight());
|
||||
rr.deleteCharAt(where);
|
||||
p = new TPair(p.getLeft(), rr.toString());
|
||||
}
|
||||
boolean hasNonAVowel = (!"A".equals(p.getRight()) && null != p.getRight());
|
||||
String thislWylie = ACIPRules.getWylieForACIPConsonant(p.getLeft());
|
||||
if (thislWylie == null) {
|
||||
char ch;
|
||||
if (p.isNumeric()) {
|
||||
thislWylie = p.getLeft();
|
||||
isNumeric = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (null == thislWylie) throw new Error("BADNESS AT MAXIMUM: p is " + p + " and thislWylie is " + thislWylie);
|
||||
lWylie.append(thislWylie);
|
||||
StringBuffer ll = new StringBuffer(lWylie.toString());
|
||||
int ww;
|
||||
// DLC NOW: what about fixed-form RA on top??? test it.
|
||||
while ((ww = ll.indexOf("+")) >= 0)
|
||||
ll.deleteCharAt(ww);
|
||||
boolean isTibetan = TibetanMachineWeb.isWylieTibetanConsonantOrConsonantStack(ll.toString());
|
||||
boolean isSanskrit = TibetanMachineWeb.isWylieSanskritConsonantStack(lWylie.toString());
|
||||
if (!isTibetan && !isSanskrit && !isNumeric && true) {
|
||||
System.out.println("DLC: OTHER for " + lWylie + " with vowel " + ACIPRules.getWylieForACIPVowel(p.getRight()) + " and p.getRight()=" + p.getRight());
|
||||
}
|
||||
if (isTibetan && isSanskrit) isSanskrit = false; // RVA, e.g.
|
||||
if (true && hasNonAVowel && ACIPRules.getWylieForACIPVowel(p.getRight()) == null) {
|
||||
System.out.println("DLC: vowel " + ACIPRules.getWylieForACIPVowel(p.getRight()) + " and p.getRight()=" + p.getRight());
|
||||
}
|
||||
TGCPair tp;
|
||||
indexList.add(new Integer(index));
|
||||
tp = new TGCPair(lWylie.toString()
|
||||
+ (hasNonAVowel
|
||||
? ACIPRules.getWylieForACIPVowel(p.getRight())
|
||||
: ""),
|
||||
(isNumeric
|
||||
? TGCPair.OTHER
|
||||
: (hasNonAVowel
|
||||
? (isSanskrit
|
||||
? TGCPair.SANSKRIT_WITH_VOWEL
|
||||
: (isTibetan
|
||||
? TGCPair.CONSONANTAL_WITH_VOWEL
|
||||
: TGCPair.OTHER))
|
||||
: (isSanskrit
|
||||
? TGCPair.SANSKRIT_WITHOUT_VOWEL
|
||||
: (isTibetan
|
||||
? TGCPair.CONSONANTAL_WITHOUT_VOWEL
|
||||
: TGCPair.OTHER)))));
|
||||
pl.add(tp);
|
||||
if (add_U0F7F) {
|
||||
indexList.add(new Integer(index));
|
||||
pl.add(new TGCPair("H", TGCPair.OTHER));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// DLC FIXME: handle 'o' and 'x', e.g. KAo and NYAx.
|
167
source/org/thdl/tib/text/ttt/TPairListFactory.java
Normal file
167
source/org/thdl/tib/text/ttt/TPairListFactory.java
Normal file
|
@ -0,0 +1,167 @@
|
|||
/*
|
||||
The contents of this file are subject to the THDL Open Community License
|
||||
Version 1.0 (the "License"); you may not use this file except in compliance
|
||||
with the License. You may obtain a copy of the License on the THDL web site
|
||||
(http://www.thdl.org/).
|
||||
|
||||
Software distributed under the License is distributed on an "AS IS" basis,
|
||||
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
|
||||
License for the specific terms governing rights and limitations under the
|
||||
License.
|
||||
|
||||
The Initial Developer of this software is the Tibetan and Himalayan Digital
|
||||
Library (THDL). Portions created by the THDL are Copyright 2003 THDL.
|
||||
All Rights Reserved.
|
||||
|
||||
Contributor(s): ______________________________________.
|
||||
*/
|
||||
|
||||
package org.thdl.tib.text.ttt;
|
||||
|
||||
/** A factory for creating {@link TPairList TPairLists} from
|
||||
* Strings of ACIP.
|
||||
* @author David Chandler */
|
||||
class TPairListFactory {
|
||||
/** This class is not instantiable. */
|
||||
private TPairListFactory() { }
|
||||
|
||||
/** Returns a new TPairList instance. Breaks an ACIP tsheg bar
|
||||
* (roughly a "syllable") into chunks; this computes l'
|
||||
* (for you design doc enthusiasts).
|
||||
*
|
||||
* <p>Here's a rough sketch of the algorithm: run along getting
|
||||
* the current TPair as big as you can. If you get it very
|
||||
* big, but there's something illegal afterward that wouldn't
|
||||
* otherwise be illegal, undo as little as possible to correct.
|
||||
* For example, G'A'I becomes [(G . 'A), (' . I)], and TAA
|
||||
* becomes [(T . A)] in a first pass but then we see that the
|
||||
* rest would be suboptimal, so we backtrack to [(T . )] and then
|
||||
* finally become [(T . ), (A . A)]. We look for (A . ) and (
|
||||
* . <vowel>) in the rest in order to say "the rest would be
|
||||
* suboptimal", i.e. we use TPairList.hasSimpleError()
|
||||
* @param acip a string of ACIP with no punctuation in it */
|
||||
static TPairList breakACIPIntoChunks(String acip) {
|
||||
|
||||
// base case for our recursion:
|
||||
if ("".equals(acip))
|
||||
return new TPairList();
|
||||
|
||||
StringBuffer acipBuf = new StringBuffer(acip);
|
||||
int howMuchBuf[] = new int[1];
|
||||
TPair head = getFirstConsonantAndVowel(acipBuf, howMuchBuf);
|
||||
int howMuch = howMuchBuf[0];
|
||||
TPairList tail;
|
||||
if ((tail
|
||||
= breakACIPIntoChunks(acipBuf.substring(howMuch))).hasSimpleError()) {
|
||||
for (int i = 1; i < howMuch; i++) {
|
||||
// try giving i characters back if that leaves us with
|
||||
// a legal head and makes the rest free of simple
|
||||
// errors.
|
||||
TPairList newTail = null;
|
||||
TPair newHead;
|
||||
if ((newHead = head.minusNRightmostACIPCharacters(i)).isLegal()
|
||||
&& !(newTail
|
||||
= breakACIPIntoChunks(acipBuf.substring(howMuch - i))).hasSimpleError()) {
|
||||
newTail.prepend(newHead);
|
||||
return newTail;
|
||||
}
|
||||
}
|
||||
// It didn't work. Return the first thing we'd thought
|
||||
// of: head appended with tail. (I.e., fall through.)
|
||||
}
|
||||
tail.prepend(head);
|
||||
return tail;
|
||||
}
|
||||
|
||||
/** Returns the largest TPair we can make from the acip
|
||||
* starting from the left. This will return a size zero pair if
|
||||
* and only if acip is the empty string; otherwise, it may return
|
||||
* a pair with either the left or right component empty. This
|
||||
* mutates acip when we run into {NA+YA}; it mutates acip into
|
||||
* {N+YA}. For {NE+YA}, it doesn not mutate acip or behave
|
||||
* intelligently. A later phase will need to turn that into
|
||||
* {N+YE} (DLC). howMuch[0] will be set to the number of
|
||||
* characters of acip that this call has consumed. */
|
||||
private static TPair getFirstConsonantAndVowel(StringBuffer acip,
|
||||
int howMuch[]) {
|
||||
// Note that it is *not* the case that if acip.substring(0, N)
|
||||
// is legal (according to TPair.isLegal()), then
|
||||
// acip.substring(0, N-1) is legal for all N. For example,
|
||||
// think of {shA} and {KshA}. However, 's' is the only tricky
|
||||
// fellow, so it is true that acip.substring(0, N-1) is either
|
||||
// legal or ends with 's' if acip.substring(0, N) is legal.
|
||||
//
|
||||
// We don't, however, use this approach. We just try to find
|
||||
// a consonant of length 3, and then, failing that, of length
|
||||
// 2, etc. Likewise with vowels. This avoids the issue.
|
||||
|
||||
int i, xl = acip.length();
|
||||
if (0 == xl) {
|
||||
howMuch[0] = 0;
|
||||
return new TPair(null, null);
|
||||
}
|
||||
if (acip.charAt(0) == '-') {
|
||||
howMuch[0] = 1;
|
||||
return new TPair(null, "-");
|
||||
}
|
||||
char ch = acip.charAt(0);
|
||||
|
||||
// Numbers never appear in stacks, so if you see 1234, that's
|
||||
// like seeing 1-2-3-4.
|
||||
if (ch >= '0' && ch <= '9') {
|
||||
howMuch[0] = 1; // not 2...
|
||||
return new TPair(acip.substring(0, 1), (xl == 1) ? null : "-");
|
||||
}
|
||||
|
||||
String l = null, r = null;
|
||||
for (i = Math.min(ACIPRules.MAX_CONSONANT_LENGTH, xl); i >= 1; i--) {
|
||||
String t = null;
|
||||
if (ACIPRules.isConsonant(t = acip.substring(0, i))) {
|
||||
l = t;
|
||||
break;
|
||||
}
|
||||
}
|
||||
int ll = (null == l) ? 0 : l.length();
|
||||
if (null != l && xl > ll && acip.charAt(ll) == '-') {
|
||||
howMuch[0] = l.length() + 1;
|
||||
return new TPair(l, "-");
|
||||
}
|
||||
if (null != l && xl > ll && acip.charAt(ll) == '+') {
|
||||
howMuch[0] = l.length() + 1;
|
||||
return new TPair(l, "+");
|
||||
}
|
||||
for (i = Math.min(ACIPRules.MAX_VOWEL_LENGTH, xl - ll); i >= 1; i--) {
|
||||
String t = null;
|
||||
if (ACIPRules.isVowel(t = acip.substring(ll, ll + i))) {
|
||||
r = t;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Treat {BATA+SA'I} like {BAT+SA'I}:
|
||||
int z;
|
||||
if (null != l && "A".equals(r) && ((z = ll + "A".length()) < xl)
|
||||
&& acip.charAt(z) == '+') {
|
||||
acip.deleteCharAt(z-1);
|
||||
howMuch[0] = l.length() + 1;
|
||||
return new TPair(l, "+");
|
||||
}
|
||||
|
||||
// what if we see a character that's not part of any vowel or
|
||||
// consonant? We return it.
|
||||
if (null == l && null == r) {
|
||||
howMuch[0] = 1; // not 2...
|
||||
// add a '-' to avoid exponentials:
|
||||
return new TPair(acip.substring(0, 1), (xl == 1) ? null : "-");
|
||||
}
|
||||
|
||||
howMuch[0] = (((l == null) ? 0 : l.length())
|
||||
+ ((r == null) ? 0 : r.length()));
|
||||
return new TPair(l, r);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// DLC strip out [#...] comments; test for nested comments
|
||||
|
||||
// DLC see Translit directory on ACIP v4 CD-ROM
|
200
source/org/thdl/tib/text/ttt/TParseTree.java
Normal file
200
source/org/thdl/tib/text/ttt/TParseTree.java
Normal file
|
@ -0,0 +1,200 @@
|
|||
/*
|
||||
The contents of this file are subject to the THDL Open Community License
|
||||
Version 1.0 (the "License"); you may not use this file except in compliance
|
||||
with the License. You may obtain a copy of the License on the THDL web site
|
||||
(http://www.thdl.org/).
|
||||
|
||||
Software distributed under the License is distributed on an "AS IS" basis,
|
||||
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
|
||||
License for the specific terms governing rights and limitations under the
|
||||
License.
|
||||
|
||||
The Initial Developer of this software is the Tibetan and Himalayan Digital
|
||||
Library (THDL). Portions created by the THDL are Copyright 2003 THDL.
|
||||
All Rights Reserved.
|
||||
|
||||
Contributor(s): ______________________________________.
|
||||
*/
|
||||
|
||||
package org.thdl.tib.text.ttt;
|
||||
|
||||
import java.util.ArrayList;
|
||||
|
||||
/** A list of non-empty list of {@link #TStackListList
|
||||
* TStackListLists} representing all the ways you could break up a
|
||||
* tsheg bar of ACIP into stacks (i.e., grapheme clusters).
|
||||
*
|
||||
* @author David Chandler */
|
||||
class TParseTree {
|
||||
/** a fast, non-thread-safe, random-access list implementation: */
|
||||
private ArrayList al = new ArrayList();
|
||||
|
||||
/** Creates an empty list. */
|
||||
public TParseTree() { }
|
||||
|
||||
/** Returns the ith pair in this list. */
|
||||
public TStackListList get(int i) { return (TStackListList)al.get(i); }
|
||||
|
||||
/** Adds p to the end of this list. */
|
||||
public void add(TStackListList p)
|
||||
throws IllegalArgumentException
|
||||
{
|
||||
if (p.isEmpty())
|
||||
throw new IllegalArgumentException("p is empty");
|
||||
al.add(p);
|
||||
}
|
||||
|
||||
/** Returns the number of TStackListLists in this list. See
|
||||
* also {@link #numberOfParses()}, which gives a different
|
||||
* interpretation of the size of this tree. */
|
||||
public int size() { return al.size(); }
|
||||
|
||||
/** Returns the number of different parses one could make from
|
||||
* this parse tree. Returns zero if this list is empty. */
|
||||
public int numberOfParses() {
|
||||
if (al.isEmpty()) return 0;
|
||||
int k = 1;
|
||||
int sz = size();
|
||||
for (int i = 0; i < sz; i++) {
|
||||
k *= get(i).size();
|
||||
}
|
||||
return k;
|
||||
}
|
||||
|
||||
/** Returns the number of {@link #TPair pairs} that are in a
|
||||
* parse of this tree. */
|
||||
public int numberOfPairs() {
|
||||
if (al.isEmpty()) return 0;
|
||||
int k = 1;
|
||||
int sz = size();
|
||||
for (int i = 0; i < sz; i++) {
|
||||
// get(i).get(0) is the same size as get(i).get(1),
|
||||
// get(i).get(2), ...
|
||||
k += get(i).get(0).size();
|
||||
}
|
||||
return k;
|
||||
}
|
||||
|
||||
/** Returns an iterator that will iterate over the {@link
|
||||
* #numberOfParses} available. */
|
||||
public ParseIterator getParseIterator() {
|
||||
return new ParseIterator(al);
|
||||
}
|
||||
|
||||
/** Returns a list containing the legal parses of this parse tree.
|
||||
* By "legal", we mean a sequence of stacks that is
|
||||
* legal by the rules of Tibetan tsheg bar syntax (sometimes
|
||||
* called spelling). This will return the {G-YA} parse of {GYA}
|
||||
* as well as the {GYA} parse, so watch yourself. */
|
||||
public TStackListList getLegalParses() {
|
||||
TStackListList sll = new TStackListList(2); // save memory
|
||||
ParseIterator pi = getParseIterator();
|
||||
while (pi.hasNext()) {
|
||||
TStackList sl = pi.next();
|
||||
if (sl.isLegalTshegBar().isLegal) {
|
||||
sll.add(sl);
|
||||
}
|
||||
}
|
||||
return sll;
|
||||
}
|
||||
|
||||
/** Returns a list containing the parses of this parse tree that
|
||||
* are not clearly illegal. */
|
||||
public TStackListList getNonIllegalParses() {
|
||||
TStackListList sll = new TStackListList(2); // save memory
|
||||
ParseIterator pi = getParseIterator();
|
||||
while (pi.hasNext()) {
|
||||
TStackList sl = pi.next();
|
||||
if (!sl.isClearlyIllegal()) {
|
||||
sll.add(sl);
|
||||
}
|
||||
}
|
||||
return sll;
|
||||
}
|
||||
|
||||
/** Returns the best parse, if there is a unique parse that is
|
||||
* clearly preferred to other parses. Basically, if there's a
|
||||
* unique legal parse, you get it. If there's not, but there is
|
||||
* a unique non-illegal parse, you get it. If there's not a
|
||||
* unique answer, null is returned. */
|
||||
// {TZANDRA} is not solved by this, DLC NOW. Solve PADMA PROBLEM!
|
||||
|
||||
// DLC by using this we can get rid of single-sanskrit-gc, eh?
|
||||
public TStackList getBestParse() {
|
||||
TStackListList up = getUniqueParse();
|
||||
if (up.size() == 1)
|
||||
return up.get(0);
|
||||
else if (up.size() == 2) {
|
||||
}
|
||||
up = getNonIllegalParses();
|
||||
int sz = up.size();
|
||||
if (up.size() == 1) {
|
||||
return up.get(0);
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/** Returns a list containing the unique legal parse of this parse
|
||||
* tree if there is a unique legal parse. Note that {SRAS} has a
|
||||
* unique legal parse, though {SRS} has two equally good parses;
|
||||
* i.e., note that the {A} vowel is treated specially here
|
||||
* (unlike in {@link #getLegalParses()}). Returns an empty list
|
||||
* if there are no legal parses. Returns a list containing all
|
||||
* legal parses if there two or more equally good parses. By
|
||||
* "legal", we mean a sequence of stacks that is legal
|
||||
* by the rules of Tibetan tsheg bar syntax (sometimes called
|
||||
* spelling). */
|
||||
public TStackListList getUniqueParse() {
|
||||
TStackListList allLegalParses = new TStackListList(2); // save memory
|
||||
TStackListList legalParsesWithVowelOnRoot = new TStackListList(1);
|
||||
ParseIterator pi = getParseIterator();
|
||||
while (pi.hasNext()) {
|
||||
TStackList sl = pi.next();
|
||||
BoolPair bpa = sl.isLegalTshegBar();
|
||||
if (bpa.isLegal) {
|
||||
if (bpa.isLegalAndHasAVowelOnRoot)
|
||||
legalParsesWithVowelOnRoot.add(sl);
|
||||
allLegalParses.add(sl);
|
||||
}
|
||||
}
|
||||
if (legalParsesWithVowelOnRoot.size() == 1)
|
||||
return legalParsesWithVowelOnRoot;
|
||||
else {
|
||||
if (legalParsesWithVowelOnRoot.size() == 2) {
|
||||
// DLC is this even valid?
|
||||
if (legalParsesWithVowelOnRoot.get(0).size() != 1 + legalParsesWithVowelOnRoot.get(1).size())
|
||||
throw new Error("Something other than the G-YA vs. GYA case appeared. Sorry for your trouble! " + legalParsesWithVowelOnRoot.get(0) + " ;; " + legalParsesWithVowelOnRoot.get(1));
|
||||
return new TStackListList(legalParsesWithVowelOnRoot.get(1));
|
||||
}
|
||||
if (allLegalParses.size() == 2) {
|
||||
// DLC is this even valid?
|
||||
if (allLegalParses.get(0).size() != 1 + allLegalParses.get(1).size())
|
||||
throw new Error("Something other than the G-YA vs. GYA case appeared. Sorry for your trouble! " + allLegalParses.get(0) + " ;; " + allLegalParses.get(1));
|
||||
return new TStackListList(allLegalParses.get(1));
|
||||
}
|
||||
return allLegalParses;
|
||||
}
|
||||
}
|
||||
|
||||
/** Returns a human-readable representation. */
|
||||
public String toString() {
|
||||
return al.toString();
|
||||
}
|
||||
|
||||
/** Returns true if and only if either x is an TParseTree
|
||||
* object representing the same TPairLists in the same order
|
||||
* or x is a String that is equals to the result of {@link
|
||||
* #toString()}. */
|
||||
public boolean equals(Object x) {
|
||||
if (x instanceof TParseTree) {
|
||||
return al.equals(((TParseTree)x).al);
|
||||
} else if (x instanceof String) {
|
||||
return toString().equals(x);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/** Returns a hashCode appropriate for use with our {@link
|
||||
* #equals(Object)} method. */
|
||||
public int hashCode() { return al.hashCode(); }
|
||||
}
|
176
source/org/thdl/tib/text/ttt/TStackList.java
Normal file
176
source/org/thdl/tib/text/ttt/TStackList.java
Normal file
|
@ -0,0 +1,176 @@
|
|||
/*
|
||||
The contents of this file are subject to the THDL Open Community License
|
||||
Version 1.0 (the "License"); you may not use this file except in compliance
|
||||
with the License. You may obtain a copy of the License on the THDL web site
|
||||
(http://www.thdl.org/).
|
||||
|
||||
Software distributed under the License is distributed on an "AS IS" basis,
|
||||
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
|
||||
License for the specific terms governing rights and limitations under the
|
||||
License.
|
||||
|
||||
The Initial Developer of this software is the Tibetan and Himalayan Digital
|
||||
Library (THDL). Portions created by the THDL are Copyright 2003 THDL.
|
||||
All Rights Reserved.
|
||||
|
||||
Contributor(s): ______________________________________.
|
||||
*/
|
||||
|
||||
package org.thdl.tib.text.ttt;
|
||||
|
||||
import org.thdl.tib.text.TibTextUtils;
|
||||
import org.thdl.tib.text.TGCList;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.ListIterator;
|
||||
|
||||
/** A list of {@link TPairList TPairLists}, each of which is for
|
||||
* a stack (a grapheme cluster), typically corresponding to one tsheg
|
||||
* bar.
|
||||
*
|
||||
* @author David Chandler */
|
||||
class TStackList {
|
||||
/** FIXME: change me and see if performance improves. */
|
||||
private static final int INITIAL_SIZE = 1;
|
||||
|
||||
/** a fast, non-thread-safe, random-access list implementation: */
|
||||
private ArrayList al;
|
||||
|
||||
/** Creates an empty list. */
|
||||
public TStackList() { al = new ArrayList(INITIAL_SIZE); }
|
||||
|
||||
/** Creates a list containing just p. */
|
||||
public TStackList(TPairList p) {
|
||||
al = new ArrayList(1);
|
||||
add(p);
|
||||
}
|
||||
|
||||
/** Creates an empty list with the capacity to hold N items. */
|
||||
public TStackList(int N) {
|
||||
al = new ArrayList(N);
|
||||
}
|
||||
|
||||
/** Returns the ith pair in this list. */
|
||||
public TPairList get(int i) { return (TPairList)al.get(i); }
|
||||
|
||||
/** Adds p to the end of this list. */
|
||||
public void add(TPairList p) { al.add(p); }
|
||||
|
||||
/** Adds all the stacks in c to the end of this list. */
|
||||
public void addAll(TStackList c) { al.addAll(c.al); }
|
||||
|
||||
/** Adds all the stacks in c to this list, inserting them at
|
||||
* position k. */
|
||||
public void addAll(int k, TStackList c) { al.addAll(k, c.al); }
|
||||
|
||||
/** Returns the number of TPairLists in this list. */
|
||||
public int size() { return al.size(); }
|
||||
|
||||
/** Returns true if and only if this list is empty. */
|
||||
public boolean isEmpty() { return al.isEmpty(); }
|
||||
|
||||
/** Returns a human-readable representation like {G}{YA} or
|
||||
* {GYA}. */
|
||||
public String toString() {
|
||||
int sz = size();
|
||||
StringBuffer b = new StringBuffer();
|
||||
for (int i = 0; i < sz; i++) {
|
||||
b.append('{');
|
||||
b.append(get(i).recoverACIP());
|
||||
b.append('}');
|
||||
}
|
||||
return b.toString();
|
||||
}
|
||||
|
||||
/** Returns a human-readable representation.
|
||||
* @return something like [[(R . ), (D . O)], [(R . ), (J . E)]] */
|
||||
public String toString2() {
|
||||
return al.toString();
|
||||
}
|
||||
|
||||
/** Returns true if and only if either x is an TStackList
|
||||
* object representing the same TPairLists in the same
|
||||
* order or x is a String that is equals to the result of {@link
|
||||
* #toString()}. */
|
||||
public boolean equals(Object x) {
|
||||
if (x instanceof TStackList) {
|
||||
return al.equals(((TStackList)x).al);
|
||||
} else if (x instanceof String) {
|
||||
return toString().equals(x) || toString2().equals(x);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/** Returns a hashCode appropriate for use with our {@link
|
||||
* #equals(Object)} method. */
|
||||
public int hashCode() { return al.hashCode(); }
|
||||
|
||||
/** Returns an iterator for this list. Mutate this list while
|
||||
* iterating and you'll have to read the code to know what will
|
||||
* happen. */
|
||||
public ListIterator listIterator() { return al.listIterator(); }
|
||||
|
||||
/** Returns a pair with {@link BoolPair#isLegal} true if and only
|
||||
* if this list of stacks is a legal tsheg bar by the rules of
|
||||
* Tibetan syntax (sometimes called rules of spelling). If this
|
||||
* is legal, then {@link BoolPair#isLegalAndHasAVowelOnRoot} will
|
||||
* be true if and only if there is an explicit {A} vowel on the
|
||||
* root stack. */
|
||||
public BoolPair isLegalTshegBar() {
|
||||
// DLC handle PADMA and other Tibetanized Sanskrit fellows. Right now we only handle single-stack guys.
|
||||
|
||||
TTGCList tgcList = new TTGCList(this);
|
||||
StringBuffer warnings = new StringBuffer();
|
||||
String candidateType
|
||||
= TibTextUtils.getClassificationOfTshegBar(tgcList, warnings);
|
||||
// System.out.println("DLC: " + toString() + " has candidateType " + candidateType + " and warnings " + warnings);
|
||||
|
||||
// preliminary answer:
|
||||
boolean isLegal = (candidateType != "invalid");
|
||||
|
||||
if (isLegal) {
|
||||
if (isClearlyIllegal())
|
||||
isLegal = false;
|
||||
}
|
||||
|
||||
boolean isLegalAndHasAVowelOnRoot = false;
|
||||
if (isLegal) {
|
||||
int rootIndices[]
|
||||
= TibTextUtils.getIndicesOfRootForCandidateType(candidateType);
|
||||
for (int i = 0; i < 2; i++) {
|
||||
if (rootIndices[i] >= 0) {
|
||||
int pairListIndex = tgcList.getTPairListIndex(rootIndices[i]);
|
||||
TPairList pl = get(pairListIndex);
|
||||
TPair p = pl.get(pl.size() - 1);
|
||||
isLegalAndHasAVowelOnRoot
|
||||
= (p.getRight() != null && p.getRight().startsWith("A")); // could be {A:}, e.g.
|
||||
if (isLegalAndHasAVowelOnRoot)
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
return new BoolPair(isLegal, isLegalAndHasAVowelOnRoot);
|
||||
}
|
||||
|
||||
/** Returns true if and only if this stack list contains a clearly
|
||||
* illegal construct, such as an TPair (V . something). */
|
||||
boolean isClearlyIllegal() {
|
||||
// check for {D}{VA} sorts of things:
|
||||
for (int i = 0; i < size(); i++) {
|
||||
if (get(i).getACIPError() != null) {
|
||||
System.out.println("DLC: error is " + get(i).getACIPError());
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
class BoolPair {
|
||||
boolean isLegal;
|
||||
boolean isLegalAndHasAVowelOnRoot;
|
||||
BoolPair(boolean isLegal, boolean isLegalAndHasAVowelOnRoot) {
|
||||
this.isLegal = isLegal;
|
||||
this.isLegalAndHasAVowelOnRoot = isLegalAndHasAVowelOnRoot;
|
||||
}
|
||||
}
|
86
source/org/thdl/tib/text/ttt/TStackListList.java
Normal file
86
source/org/thdl/tib/text/ttt/TStackListList.java
Normal file
|
@ -0,0 +1,86 @@
|
|||
/*
|
||||
The contents of this file are subject to the THDL Open Community License
|
||||
Version 1.0 (the "License"); you may not use this file except in compliance
|
||||
with the License. You may obtain a copy of the License on the THDL web site
|
||||
(http://www.thdl.org/).
|
||||
|
||||
Software distributed under the License is distributed on an "AS IS" basis,
|
||||
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
|
||||
License for the specific terms governing rights and limitations under the
|
||||
License.
|
||||
|
||||
The Initial Developer of this software is the Tibetan and Himalayan Digital
|
||||
Library (THDL). Portions created by the THDL are Copyright 2003 THDL.
|
||||
All Rights Reserved.
|
||||
|
||||
Contributor(s): ______________________________________.
|
||||
*/
|
||||
|
||||
package org.thdl.tib.text.ttt;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.ListIterator;
|
||||
|
||||
/** A list of {@link #TStackList} objects, each of which is for a
|
||||
* stack (a grapheme cluster), typically corresponding to one
|
||||
* ambiguous section of a tsheg bar.
|
||||
*
|
||||
* @author David Chandler */
|
||||
class TStackListList {
|
||||
/** a fast, non-thread-safe, random-access list implementation: */
|
||||
private ArrayList al;
|
||||
|
||||
/** Creates an empty list. */
|
||||
public TStackListList() { al = new ArrayList(); }
|
||||
|
||||
/** Creates a list containing just p. */
|
||||
public TStackListList(TStackList p) {
|
||||
al = new ArrayList(1);
|
||||
add(p);
|
||||
}
|
||||
|
||||
/** Creates an empty list with the capacity to hold N items. */
|
||||
public TStackListList(int N) {
|
||||
al = new ArrayList(N);
|
||||
}
|
||||
|
||||
/** Returns the ith pair in this list. */
|
||||
public TStackList get(int i) { return (TStackList)al.get(i); }
|
||||
|
||||
/** Adds p to the end of this list. */
|
||||
public void add(TStackList p) { al.add(p); }
|
||||
|
||||
/** Returns the number of TStackList objects in this list. */
|
||||
public int size() { return al.size(); }
|
||||
|
||||
/** Returns true if and only if this list is empty. */
|
||||
public boolean isEmpty() { return al.isEmpty(); }
|
||||
|
||||
/** Returns a human-readable representation.
|
||||
* @return something like [[[(R . ), (D . O)], [(R . ), (J . E)]]] */
|
||||
public String toString() {
|
||||
return al.toString();
|
||||
}
|
||||
|
||||
/** Returns true if and only if either x is an TStackListList
|
||||
* object representing the same TStackList objects in the same
|
||||
* order or x is a String that is equals to the result of {@link
|
||||
* #toString()}. */
|
||||
public boolean equals(Object x) {
|
||||
if (x instanceof TStackListList) {
|
||||
return al.equals(((TStackListList)x).al);
|
||||
} else if (x instanceof String) {
|
||||
return toString().equals(x);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/** Returns a hashCode appropriate for use with our {@link
|
||||
* #equals(Object)} method. */
|
||||
public int hashCode() { return al.hashCode(); }
|
||||
|
||||
/** Returns an iterator for this list. Mutate this list while
|
||||
* iterating and you'll have to read the code to know what will
|
||||
* happen. */
|
||||
public ListIterator listIterator() { return al.listIterator(); }
|
||||
}
|
63
source/org/thdl/tib/text/ttt/TTGCList.java
Normal file
63
source/org/thdl/tib/text/ttt/TTGCList.java
Normal file
|
@ -0,0 +1,63 @@
|
|||
/*
|
||||
The contents of this file are subject to the THDL Open Community License
|
||||
Version 1.0 (the "License"); you may not use this file except in compliance
|
||||
with the License. You may obtain a copy of the License on the THDL web site
|
||||
(http://www.thdl.org/).
|
||||
|
||||
Software distributed under the License is distributed on an "AS IS" basis,
|
||||
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
|
||||
License for the specific terms governing rights and limitations under the
|
||||
License.
|
||||
|
||||
The Initial Developer of this software is the Tibetan and Himalayan Digital
|
||||
Library (THDL). Portions created by the THDL are Copyright 2003 THDL.
|
||||
All Rights Reserved.
|
||||
|
||||
Contributor(s): ______________________________________.
|
||||
*/
|
||||
|
||||
package org.thdl.tib.text.ttt;
|
||||
|
||||
import org.thdl.tib.text.TGCList;
|
||||
import org.thdl.tib.text.TGCPair;
|
||||
|
||||
import java.util.ArrayList;
|
||||
|
||||
/** A list of grapheme clusters.
|
||||
*
|
||||
* @author David Chandler */
|
||||
class TTGCList implements TGCList {
|
||||
// I could use one list of an ordered pair (TGCPair, int), but I
|
||||
// use two lists.
|
||||
private ArrayList al;
|
||||
private ArrayList stackIndices;
|
||||
|
||||
/** Don't use this. */
|
||||
private TTGCList() { }
|
||||
|
||||
/** Creates a TGCList. */
|
||||
public TTGCList(TStackList sl) {
|
||||
al = new ArrayList();
|
||||
stackIndices = new ArrayList();
|
||||
int sz = sl.size();
|
||||
for (int i = 0; i < sz; i++) {
|
||||
sl.get(i).populateWithTGCPairs(al, stackIndices, i);
|
||||
}
|
||||
}
|
||||
|
||||
/** Returns the ith pair in this list. */
|
||||
public TGCPair get(int i) {
|
||||
return (TGCPair)al.get(i);
|
||||
}
|
||||
|
||||
/** Returns the number of TGCPairs in this list. */
|
||||
public int size() { return al.size(); }
|
||||
|
||||
/** Returns a zero-based index of an TPairList inside the stack
|
||||
* list from which this list was constructed. This pair list is
|
||||
* the one that caused the TGCPair at index tgcPairIndex to come
|
||||
* into existence. */
|
||||
public int getTPairListIndex(int tgcPairIndex) {
|
||||
return ((Integer)stackIndices.get(tgcPairIndex)).intValue();
|
||||
}
|
||||
}
|
31
source/org/thdl/tib/text/ttt/package.html
Normal file
31
source/org/thdl/tib/text/ttt/package.html
Normal file
|
@ -0,0 +1,31 @@
|
|||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
|
||||
<html>
|
||||
<head>
|
||||
<!--
|
||||
|
||||
@(#)package.html
|
||||
|
||||
Copyright 2003 Tibetan and Himalayan Digital Library
|
||||
|
||||
This software is the confidential and proprietary information of
|
||||
the Tibetan and Himalayan Digital Library. You shall use such
|
||||
information only in accordance with the terms of the license
|
||||
agreement you entered into with the THDL.
|
||||
|
||||
-->
|
||||
</head>
|
||||
<body bgcolor="white">
|
||||
|
||||
Provides classes and methods for converting Latin transliteration of
|
||||
Tibetan text into Tibetan.
|
||||
<p>
|
||||
This package (whose name, ttt, stands for transliteration-to-Tibetan)
|
||||
contains methods for converting ACIP transliteration into Tibetan
|
||||
Machine Web and methods for converting EWTS transliteration into
|
||||
Tibetan Machine Web. It has extensive tests, though they are probably not
|
||||
mentioned in these Javadoc documents.
|
||||
</p>
|
||||
<h2>Related Documentation</h2>
|
||||
@see <a href="../package-summary.html">org.thdl.tib.text</a>
|
||||
</body>
|
||||
</html>
|
Loading…
Add table
Add a link
Reference in a new issue