Added an unfinished ACIP->Tibetan converter. Once it works properly
for ACIP, it'll easily be made to work as a perfect EWTS Wylie->Tibetan converter. It has an extensive suite of tests for the existing functionality.
This commit is contained in:
parent
39e0435b6b
commit
e21d3774a9
14 changed files with 8709 additions and 21 deletions
46
build.xml
46
build.xml
|
@ -1,3 +1,22 @@
|
|||
<!--
|
||||
|
||||
The contents of this file are subject to the THDL Open Community License
|
||||
Version 1.0 (the "License"); you may not use this file except in compliance
|
||||
with the License. You may obtain a copy of the License on the THDL web site
|
||||
(http://www.thdl.org/).
|
||||
|
||||
Software distributed under the License is distributed on an "AS IS" basis,
|
||||
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
|
||||
License for the specific terms governing rights and limitations under the
|
||||
License.
|
||||
|
||||
The Initial Developer of this software is the Tibetan and Himalayan Digital
|
||||
Library (THDL). Portions created by the THDL are Copyright 2002-2003 THDL.
|
||||
All Rights Reserved.
|
||||
|
||||
Contributor(s): ______________________________________.
|
||||
-->
|
||||
|
||||
<!-- @author David Chandler, dchandler@users.sourceforge.net
|
||||
|
||||
FIXME
|
||||
|
@ -13,12 +32,11 @@
|
|||
|
||||
This build file is the main one, and it uses jwsbuild.xml for
|
||||
building Java Web Start (JWS) releases. See the comments in that
|
||||
file to learn why.
|
||||
file to learn why. Likewise with junitbuild.xml.
|
||||
|
||||
Quick start for the impatient: This buildfile looks for the
|
||||
following, which are not in the CVS repository:
|
||||
|
||||
extensions/jdom.jar
|
||||
$ANT_HOME/lib/xalan.jar
|
||||
$ANT_HOME/lib/xercesImpl.jar
|
||||
$ANT_HOME/lib/xml-apis.jar
|
||||
|
@ -391,6 +409,19 @@
|
|||
description="compiles all JUnit test cases that can be compiled in the present CLASSPATH (NB that this distinction is just wishful thinking for now because we have such weak test coverage at this point)" >
|
||||
<mkdir dir="${junitbin}"/>
|
||||
<antcall target="create-timestamp-source-code"/> <!-- DLC NOW! The -run targets are mucking with this! It isn't fatal, but it should be fixed. -->
|
||||
<antcall target="our-internal-javac-task">
|
||||
<param name="mybin" value="${junitbin}"/>
|
||||
<param name="my.included.source.file"
|
||||
value="org/thdl/tib/text/ttt/PackageTest.java"/>
|
||||
</antcall>
|
||||
<antcall target="copy-ini-files-to-bin-dir-for-jarring">
|
||||
<param name="mybin" value="${junitbin}"/>
|
||||
</antcall>
|
||||
<antcall target="our-internal-javac-task">
|
||||
<param name="mybin" value="${junitbin}"/>
|
||||
<param name="my.included.source.file"
|
||||
value="org/thdl/tib/input/TMW_RTF_TO_THDL_WYLIETest.java"/>
|
||||
</antcall>
|
||||
<antcall target="our-internal-javac-task">
|
||||
<param name="mybin" value="${junitbin}"/>
|
||||
<param name="my.included.source.file"
|
||||
|
@ -411,13 +442,6 @@
|
|||
<param name="my.included.source.file"
|
||||
value="org/thdl/tib/text/tshegbar/LegalTshegBarTest.java"/>
|
||||
</antcall>
|
||||
|
||||
<antcall target="our-internal-javac-task">
|
||||
<param name="mybin" value="${junitbin}"/>
|
||||
<param name="my.included.source.file"
|
||||
value="org/thdl/tib/input/TMW_RTF_TO_THDL_WYLIETest.java"/>
|
||||
</antcall>
|
||||
|
||||
<antcall target="create-timestamp-source-code">
|
||||
<param name="mybin" value="${junitbin}"/>
|
||||
</antcall>
|
||||
|
@ -426,10 +450,6 @@
|
|||
<param name="my.included.source.file"
|
||||
value="org/thdl/tib/input/DuffPaneTest.java"/>
|
||||
</antcall>
|
||||
<antcall target="copy-ini-files-to-bin-dir-for-jarring">
|
||||
<param name="mybin" value="${junitbin}"/>
|
||||
</antcall>
|
||||
|
||||
<antcall target="our-internal-javac-task">
|
||||
<param name="mybin" value="${junitbin}"/>
|
||||
<param name="my.included.source.file"
|
||||
|
|
|
@ -56,10 +56,11 @@
|
|||
<formatter type="xml"/><!-- If not XML, then 'ant -buildfile
|
||||
build.xml check-report' will fail. -->
|
||||
<sysproperty key="java.awt.headless" value="true"/>
|
||||
<test name="org.thdl.tib.text.ttt.PackageTest"/>
|
||||
<test name="org.thdl.tib.input.TMW_RTF_TO_THDL_WYLIETest"/>
|
||||
<test name="org.thdl.util.RTFFixerInputStreamTest"/>
|
||||
<test name="org.thdl.util.ThdlLazyExceptionTest"/>
|
||||
<test name="org.thdl.util.TrieTest"/>
|
||||
<test name="org.thdl.tib.input.TMW_RTF_TO_THDL_WYLIETest"/>
|
||||
<test name="org.thdl.tib.text.tshegbar.UnicodeUtilsTest"/>
|
||||
<test name="org.thdl.tib.text.tshegbar.LegalTshegBarTest"/>
|
||||
<test name="org.thdl.tib.text.tshegbar.UnicodeGraphemeClusterTest"/>
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
|
||||
@(#)package.html
|
||||
|
||||
Copyright 2001-2002 Tibetan and Himalayan Digital Library
|
||||
Copyright 2001-2003 Tibetan and Himalayan Digital Library
|
||||
|
||||
This software is the confidential and proprietary information of
|
||||
the Tibetan and Himalayan Digital Library. You shall use such
|
||||
|
@ -18,12 +18,14 @@
|
|||
|
||||
Provides classes and methods for dealing with Tibetan text.
|
||||
<p>
|
||||
Designed for use with the Tibetan Computer
|
||||
Company's free cross-platform TibetanMachineWeb fonts, this package
|
||||
contains methods for getting the Extended Wylie
|
||||
correspondences for each TibetanMachineWeb glyph, and for
|
||||
convert back and forth between Extended
|
||||
Wylie and TibetanMachineWeb.
|
||||
Designed for use with the Tibetan Computer Company's free
|
||||
cross-platform TibetanMachineWeb fonts, this package contains methods
|
||||
for getting the Extended Wylie correspondences for each
|
||||
TibetanMachineWeb glyph, and for converting back and forth between
|
||||
Extended Wylie and TibetanMachineWeb. The TMW to Wylie conversion is
|
||||
perfect, but the Wylie to TMW is flawed, so use the code in package
|
||||
org.thdl.tib.text.ttt instead for serious work. The Wylie to TMW here
|
||||
is more like a keyboard than a real Wylie to TMW conversion.
|
||||
<p>
|
||||
This package provides a variety of ways to store TibetanMachineWeb data,
|
||||
and includes methods to aid programmers who want to convert from
|
||||
|
@ -34,5 +36,6 @@ keyboards. Four keyboards have been provided in this release,
|
|||
but users may also create their own keyboards.
|
||||
<h2>Related Documentation</h2>
|
||||
@see <a href="../input/package-summary.html">org.thdl.tib.input</a>
|
||||
@see <a href="ttt/package-summary.html">org.thdl.tib.text.ttt</a>
|
||||
</body>
|
||||
</html>
|
||||
|
|
207
source/org/thdl/tib/text/ttt/ACIPRules.java
Normal file
207
source/org/thdl/tib/text/ttt/ACIPRules.java
Normal file
|
@ -0,0 +1,207 @@
|
|||
/*
|
||||
The contents of this file are subject to the THDL Open Community License
|
||||
Version 1.0 (the "License"); you may not use this file except in compliance
|
||||
with the License. You may obtain a copy of the License on the THDL web site
|
||||
(http://www.thdl.org/).
|
||||
|
||||
Software distributed under the License is distributed on an "AS IS" basis,
|
||||
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
|
||||
License for the specific terms governing rights and limitations under the
|
||||
License.
|
||||
|
||||
The Initial Developer of this software is the Tibetan and Himalayan Digital
|
||||
Library (THDL). Portions created by the THDL are Copyright 2003 THDL.
|
||||
All Rights Reserved.
|
||||
|
||||
Contributor(s): ______________________________________.
|
||||
*/
|
||||
|
||||
package org.thdl.tib.text.ttt;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.HashMap;
|
||||
|
||||
/** Canonizes some facts regarding the ACIP transcription system.
 *
 *  <p>All lookup tables are built exactly once, during class
 *  initialization, and are never modified afterwards.  The static
 *  methods here therefore only ever read fully built tables.
 *
 *  @author David Chandler */
class ACIPRules {
    /** {Ksh}, the longest consonant, has 3 characters, so this is
     *  three. */
    public static final int MAX_CONSONANT_LENGTH = 3;

    /** {'im:}, the longest "vowel", has 4 characters, so this is
     *  four. */
    public static final int MAX_VOWEL_LENGTH = 4;

    /** The bare ACIP vowels and their EWTS equivalents.  Each one
     *  may additionally be followed by "m", ":", or "m:". */
    private static final String[][] baseVowels = new String[][] {
        // { ACIP, EWTS }:
        { "A", "a" },
        { "I", "i" },
        { "U", "u" },
        { "E", "e" },
        { "O", "o" },
        { "'I", "I" },
        { "'U", "U" },
        { "EE", "ai" },
        { "OO", "au" },
        { "i", "-i" },
        { "'i", "-I" },
        { "'A", "A" },
        { "'O", "Ao" },
        { "'E", "Ae" }
        // DLC I'm on my own with 'O and 'E, but GANG'O appears
        // and I wonder... so here are 'O and 'E.  It's
        // consistent with 'I and 'A and 'U, at least.
    };

    /** The ACIP consonants (without any vowel) and their EWTS
     *  equivalents.  This single table backs both {@link
     *  #isConsonant(String)} and {@link
     *  #getWylieForACIPConsonant(String)} so that the two can never
     *  disagree. */
    private static final String[][] baseConsonants = new String[][] {
        // { ACIP, EWTS }:

        // oddball:
        { "V", "w" },

        // more oddballs:
        { "DH", "d+h" },
        { "BH", "b+h" },
        { "dH", "D+h" },
        { "DZH", "dz+h" }, // longest, MAX_CONSONANT_LENGTH characters
        { "Ksh", "k+Sh" }, // longest, MAX_CONSONANT_LENGTH characters
        { "GH", "g+h" },

        { "K", "k" },
        { "KH", "kh" },
        { "G", "g" },
        { "NG", "ng" },
        { "C", "c" },
        { "CH", "ch" },
        { "J", "j" },
        { "NY", "ny" },
        { "T", "t" },
        { "TH", "th" },
        { "D", "d" },
        { "N", "n" },
        { "P", "p" },
        { "PH", "ph" },
        { "B", "b" },
        { "M", "m" },
        { "TZ", "ts" },
        { "TS", "tsh" },
        { "DZ", "dz" },
        { "W", "w" },
        { "ZH", "zh" },
        { "Z", "z" },
        { "'", "'" },
        { "Y", "y" },
        { "R", "r" },
        { "L", "l" },
        { "SH", "sh" },
        { "S", "s" },
        { "H", "h" },
        { "A", "a" },
        { "t", "T" },
        { "th", "Th" },
        { "d", "D" },
        { "n", "N" },
        { "sh", "Sh" }
    };

    /** Maps each ACIP "vowel" (including the "m", ":", and "m:"
     *  variants) to its EWTS; gives O(1) {@link #isVowel(String)}
     *  and {@link #getWylieForACIPVowel(String)} calls. */
    private static final HashMap acipVowel2wylie = new HashMap();

    /** Maps each ACIP consonant to its EWTS; gives O(1) {@link
     *  #isConsonant(String)} and {@link
     *  #getWylieForACIPConsonant(String)} calls. */
    private static final HashMap acipConsonant2wylie = new HashMap();

    static {
        // Populate the tables before any method can observe them.
        for (int i = 0; i < baseVowels.length; i++) {
            String acip = baseVowels[i][0];
            String ewts = baseVowels[i][1];
            acipVowel2wylie.put(acip, ewts);
            acipVowel2wylie.put(acip + 'm', ewts + 'M');
            acipVowel2wylie.put(acip + ':', ewts + 'H');
            acipVowel2wylie.put(acip + "m:", ewts + "MH");
            // DLC '\' for visarga? how shall we do \ the visarga? like a vowel or not?
        }
        for (int i = 0; i < baseConsonants.length; i++) {
            acipConsonant2wylie.put(baseConsonants[i][0],
                                    baseConsonants[i][1]);
        }
    }

    /** Returns true if and only if s is an ACIP "vowel".  You can't
     *  just call this any time -- A is a consonant and a vowel in
     *  ACIP, so you have to call this in the right context.
     *  @param s the candidate ACIP text; null yields false
     *  @return true iff s is a base vowel optionally followed by
     *  "m", ":", or "m:" */
    public static boolean isVowel(String s) {
        return acipVowel2wylie.containsKey(s);
    }

    /** Returns true if and only if acip is an ACIP consonant (without
     *  a vowel).  For example, returns true for "K", but not for
     *  "KA" or "X".
     *  @param acip the candidate ACIP text; null yields false */
    public static boolean isConsonant(String acip) {
        return acipConsonant2wylie.containsKey(acip);
    }

    /** Returns the EWTS corresponding to the given ACIP consonant
     *  (without the "A" vowel).  Returns null if there is no such
     *  EWTS. */
    static final String getWylieForACIPConsonant(String acip) {
        return (String)acipConsonant2wylie.get(acip);
    }

    /** Returns the EWTS corresponding to the given ACIP "vowel".
     *  Returns null if there is no such EWTS. */
    static final String getWylieForACIPVowel(String acip) {
        return (String)acipVowel2wylie.get(acip);
    }
}
|
6885
source/org/thdl/tib/text/ttt/PackageTest.java
Normal file
6885
source/org/thdl/tib/text/ttt/PackageTest.java
Normal file
File diff suppressed because it is too large
Load diff
100
source/org/thdl/tib/text/ttt/ParseIterator.java
Normal file
100
source/org/thdl/tib/text/ttt/ParseIterator.java
Normal file
|
@ -0,0 +1,100 @@
|
|||
/*
|
||||
The contents of this file are subject to the THDL Open Community License
|
||||
Version 1.0 (the "License"); you may not use this file except in compliance
|
||||
with the License. You may obtain a copy of the License on the THDL web site
|
||||
(http://www.thdl.org/).
|
||||
|
||||
Software distributed under the License is distributed on an "AS IS" basis,
|
||||
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
|
||||
License for the specific terms governing rights and limitations under the
|
||||
License.
|
||||
|
||||
The Initial Developer of this software is the Tibetan and Himalayan Digital
|
||||
Library (THDL). Portions created by the THDL are Copyright 2003 THDL.
|
||||
All Rights Reserved.
|
||||
|
||||
Contributor(s): ______________________________________.
|
||||
*/
|
||||
|
||||
package org.thdl.tib.text.ttt;
|
||||
|
||||
import java.util.ListIterator;
|
||||
import java.util.NoSuchElementException;
|
||||
import java.util.ArrayList;
|
||||
|
||||
/** An object that can iterate over an {@link #TParseTree}.
|
||||
*
|
||||
* @author David Chandler */
|
||||
class ParseIterator {
|
||||
private ArrayList al = null;
|
||||
private int sz;
|
||||
private ListIterator[] iterators;
|
||||
private boolean first = true;
|
||||
private boolean hasNextParse = true;
|
||||
/** Constructs a new ParseIterator that iterates over a list of
|
||||
* TStackListLists. */
|
||||
ParseIterator(ArrayList al) {
|
||||
this.al = al;
|
||||
sz = al.size();
|
||||
iterators = new ListIterator[sz];
|
||||
hasNextParse = false;
|
||||
for (int i = 0; i < sz; i++) {
|
||||
iterators[i] = ((TStackListList)al.get(i)).listIterator();
|
||||
if (iterators[i].hasNext())
|
||||
hasNextParse = true;
|
||||
}
|
||||
}
|
||||
|
||||
/** Returns true if and only if there is another parse
|
||||
* available. */
|
||||
boolean hasNext() {
|
||||
return hasNextParse;
|
||||
}
|
||||
|
||||
/** Returns the next available parse. */
|
||||
TStackList next() {
|
||||
if (!hasNextParse)
|
||||
throw new NoSuchElementException("no parses left");
|
||||
if (first) {
|
||||
first = false;
|
||||
TStackList x = new TStackList();
|
||||
for (int i = 0; i < sz; i++) {
|
||||
TStackList nextSL = (TStackList)iterators[i].next();
|
||||
x.addAll(nextSL);
|
||||
}
|
||||
|
||||
// The next guy is found by taking the previous item of
|
||||
// each iterator.
|
||||
hasNextParse = false;
|
||||
for (int i = sz - 1; i >= 0; i--) {
|
||||
if (iterators[i].hasNext()) {
|
||||
iterators[i].next();
|
||||
hasNextParse = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
return x;
|
||||
}
|
||||
|
||||
// Up the rightmost iterator you can. If you can, reset all
|
||||
// guys to the right of it. If you can't, we're done.
|
||||
TStackList x = new TStackList(sz);
|
||||
hasNextParse = false;
|
||||
for (int i = sz - 1; i >= 0; i--) {
|
||||
TStackList prevSL = (TStackList)iterators[i].previous();
|
||||
x.addAll(0, prevSL);
|
||||
iterators[i].next();
|
||||
if (!hasNextParse && iterators[i].hasNext()) {
|
||||
hasNextParse = true;
|
||||
iterators[i].next();
|
||||
// Reset all iterators to the right of i.
|
||||
for (int j = i + 1; j < sz; j++) {
|
||||
while (iterators[j].hasPrevious())
|
||||
iterators[j].previous();
|
||||
iterators[j].next();
|
||||
}
|
||||
}
|
||||
}
|
||||
return x;
|
||||
}
|
||||
}
|
170
source/org/thdl/tib/text/ttt/TPair.java
Normal file
170
source/org/thdl/tib/text/ttt/TPair.java
Normal file
|
@ -0,0 +1,170 @@
|
|||
/*
|
||||
The contents of this file are subject to the THDL Open Community License
|
||||
Version 1.0 (the "License"); you may not use this file except in compliance
|
||||
with the License. You may obtain a copy of the License on the THDL web site
|
||||
(http://www.thdl.org/).
|
||||
|
||||
Software distributed under the License is distributed on an "AS IS" basis,
|
||||
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
|
||||
License for the specific terms governing rights and limitations under the
|
||||
License.
|
||||
|
||||
The Initial Developer of this software is the Tibetan and Himalayan Digital
|
||||
Library (THDL). Portions created by the THDL are Copyright 2003 THDL.
|
||||
All Rights Reserved.
|
||||
|
||||
Contributor(s): ______________________________________.
|
||||
*/
|
||||
|
||||
package org.thdl.tib.text.ttt;
|
||||
|
||||
import org.thdl.util.ThdlDebug;
|
||||
|
||||
/** An ordered pair used in ACIP-to-TMW conversion. The left side is
|
||||
* the consonant or empty; the right side is the vowel, '+', or '-'.
|
||||
* @author David Chandler */
|
||||
/* DLC BIG FIXME: make this package work for EWTS, not just ACIP. */
|
||||
class TPair {
|
||||
/** The left side, or null if there is no left side. That is, the
|
||||
* non-vowel, non-'m', non-':', non-'-', non-'+' guy. */
|
||||
private String l;
|
||||
String getLeft() {
|
||||
ThdlDebug.verify(!"".equals(l));
|
||||
return l;
|
||||
}
|
||||
|
||||
/** The right side. That is, the vowel, with 'm' or ':' "vowel"
|
||||
* after it if appropriate, or "-" (disambiguator), or "+"
|
||||
* (stacking), or null otherwise. */
|
||||
private String r;
|
||||
String getRight() {
|
||||
ThdlDebug.verify(!"".equals(r));
|
||||
return r;
|
||||
}
|
||||
|
||||
/** Constructs a new TPair with left side l and right side r.
|
||||
* Use null or the empty string to represent an absence. */
|
||||
TPair(String l, String r) {
|
||||
// Normalize:
|
||||
if (null != l && l.equals("")) l = null;
|
||||
if (null != r && r.equals("")) r = null;
|
||||
|
||||
this.l = l;
|
||||
this.r = r;
|
||||
}
|
||||
|
||||
/** Returns a nice String representation. Returns "(D . E)" for
|
||||
* ACIP {DE}, e.g., and (l . r) in general. */
|
||||
public String toString() {
|
||||
return "("
|
||||
+ ((null == l) ? "" : l) + " . "
|
||||
+ ((null == r) ? "" : r) + ")";
|
||||
}
|
||||
|
||||
/** Returns the number of ACIP characters that make up this
|
||||
* TPair. */
|
||||
int size() {
|
||||
return (((l == null) ? 0 : l.length())
|
||||
+ ((r == null) ? 0 : r.length()));
|
||||
}
|
||||
|
||||
/** Returns an TPair that is like this one except that it is
|
||||
* missing N characters. The characters are taken from r, the
|
||||
* right side, first and from l, the left side, second.
|
||||
* @throw IllegalArgumentException if N is out of range */
|
||||
TPair minusNRightmostACIPCharacters(int N)
|
||||
throws IllegalArgumentException
|
||||
{
|
||||
int sz;
|
||||
String newL = l, newR = r;
|
||||
if (N > size())
|
||||
throw new IllegalArgumentException("Don't have that many to remove.");
|
||||
if (N < 1)
|
||||
throw new IllegalArgumentException("You should't call this if you don't want to remove any.");
|
||||
if (null != r && (sz = r.length()) > 0) {
|
||||
int min = Math.min(sz, N);
|
||||
newR = r.substring(0, sz - min);
|
||||
N -= min;
|
||||
}
|
||||
if (N > 0) {
|
||||
sz = l.length();
|
||||
newL = l.substring(0, sz - N);
|
||||
}
|
||||
return new TPair(newL, newR);
|
||||
}
|
||||
|
||||
/** Returns true if and only if this is nonempty and is l, if
|
||||
* present, is a legal ACIP consonant, and is r, if present, is a
|
||||
* legal ACIP vowel. */
|
||||
boolean isLegal() {
|
||||
if (size() < 1)
|
||||
return false;
|
||||
if (null != l && !ACIPRules.isConsonant(l))
|
||||
return false;
|
||||
if (null != r && !ACIPRules.isVowel(l))
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
/** Returns true if and only if this pair could be a Tibetan
|
||||
* prefix. */
|
||||
boolean isPrefix() {
|
||||
return (null != l
|
||||
&& ((null == r || "".equals(r))
|
||||
|| "-".equals(r)
|
||||
|| "A".equals(r)) // DLC though check for BASKYABS and warn because BSKYABS is more common
|
||||
&& ("'".equals(l)
|
||||
|| "M".equals(l)
|
||||
|| "B".equals(l)
|
||||
|| "D".equals(l)
|
||||
|| "G".equals(l)));
|
||||
}
|
||||
|
||||
/** Returns true if and only if this pair is merely a
|
||||
* disambiguator. */
|
||||
boolean isDisambiguator() {
|
||||
return ("-".equals(r) && getLeft() == null);
|
||||
}
|
||||
|
||||
/** Returns an TPair that is like this pair except that it has
|
||||
* a "+" on the right if this pair is empty on the right and is
|
||||
* empty on the right if this pair has a disambiguator (i.e., a
|
||||
* '-') on the right. May return itself (but never mutates this
|
||||
* instance). */
|
||||
TPair insideStack() {
|
||||
if (null == getRight())
|
||||
return new TPair(getLeft(), "+");
|
||||
else if ("-".equals(getRight()))
|
||||
return new TPair(getLeft(), null);
|
||||
else
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Returns true if this pair contains a Tibetan number. */
|
||||
boolean isNumeric() {
|
||||
char ch;
|
||||
return (l != null && l.length() == 1 && (ch = l.charAt(0)) >= '0' && ch <= '9');
|
||||
}
|
||||
|
||||
/** Returns the EWTS Wylie that corresponds to this pair. Untested. */
|
||||
String getWylie() {
|
||||
String leftWylie = null;
|
||||
if (getLeft() != null) {
|
||||
leftWylie = ACIPRules.getWylieForACIPConsonant(getLeft());
|
||||
if (leftWylie == null) {
|
||||
if (isNumeric())
|
||||
leftWylie = getLeft();
|
||||
}
|
||||
}
|
||||
String rightWylie = null;
|
||||
if ("-".equals(getRight()))
|
||||
rightWylie = ".";
|
||||
else if ("+".equals(getRight()))
|
||||
rightWylie = "+";
|
||||
else if (getRight() != null)
|
||||
rightWylie = ACIPRules.getWylieForACIPVowel(getRight());
|
||||
if (null == leftWylie) leftWylie = "";
|
||||
if (null == rightWylie) rightWylie = "";
|
||||
return leftWylie + rightWylie;
|
||||
}
|
||||
}
|
579
source/org/thdl/tib/text/ttt/TPairList.java
Normal file
579
source/org/thdl/tib/text/ttt/TPairList.java
Normal file
|
@ -0,0 +1,579 @@
|
|||
/*
|
||||
The contents of this file are subject to the THDL Open Community License
|
||||
Version 1.0 (the "License"); you may not use this file except in compliance
|
||||
with the License. You may obtain a copy of the License on the THDL web site
|
||||
(http://www.thdl.org/).
|
||||
|
||||
Software distributed under the License is distributed on an "AS IS" basis,
|
||||
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
|
||||
License for the specific terms governing rights and limitations under the
|
||||
License.
|
||||
|
||||
The Initial Developer of this software is the Tibetan and Himalayan Digital
|
||||
Library (THDL). Portions created by the THDL are Copyright 2003 THDL.
|
||||
All Rights Reserved.
|
||||
|
||||
Contributor(s): ______________________________________.
|
||||
*/
|
||||
|
||||
package org.thdl.tib.text.ttt;
|
||||
|
||||
import org.thdl.tib.text.TibetanMachineWeb;
|
||||
import org.thdl.tib.text.TGCPair;
|
||||
import org.thdl.util.ThdlDebug;
|
||||
|
||||
import java.util.ArrayList;
|
||||
|
||||
/** A list of {@link TPair TPairs}, typically corresponding to
|
||||
* one tsheg bar.  <i>l</i>' in the design doc is a TPairList.
|
||||
*
|
||||
* @author David Chandler */
|
||||
class TPairList {
|
||||
/** FIXME: change me and see if performance improves. */
|
||||
private static final int INITIAL_SIZE = 1;
|
||||
|
||||
/** a fast, non-thread-safe, random-access list implementation: */
|
||||
private ArrayList al;
|
||||
|
||||
/** Creates a new list containing just p. */
|
||||
public TPairList(TPair p) {
|
||||
al = new ArrayList(1);
|
||||
add(p);
|
||||
}
|
||||
|
||||
/** Creates an empty list. */
|
||||
public TPairList() {
|
||||
al = new ArrayList(INITIAL_SIZE);
|
||||
}
|
||||
|
||||
/** Creates an empty list with the capacity to hold N items. */
|
||||
public TPairList(int N) {
|
||||
al = new ArrayList(N);
|
||||
}
|
||||
|
||||
/** Returns the ith pair in this list. */
|
||||
public TPair get(int i) { return (TPair)al.get(i); }
|
||||
|
||||
/** Adds p to the end of this list. */
|
||||
public void add(TPair p) {
|
||||
if (p == null || (p.getLeft() == null && p.getRight() == null))
|
||||
throw new IllegalArgumentException("p is weird");
|
||||
al.add(p);
|
||||
}
|
||||
|
||||
/** Prepends p to the current list of TPairs. */
|
||||
public void prepend(TPair p) {
|
||||
al.add(0, p);
|
||||
}
|
||||
|
||||
/** Returns the number of TPairs in this list. */
|
||||
public int size() { return al.size(); }
|
||||
|
||||
/** Returns a human-readable representation.
|
||||
* @return something like [(R . ), (D . O)] */
|
||||
public String toString2() {
|
||||
return al.toString();
|
||||
}
|
||||
|
||||
/** Returns a human-readable representation like {G}{YA} or
|
||||
* {G-}{YA}. */
|
||||
public String toString() {
|
||||
int sz = size();
|
||||
StringBuffer b = new StringBuffer();
|
||||
for (int i = 0; i < sz; i++) {
|
||||
b.append('{');
|
||||
if (null != get(i).getLeft())
|
||||
b.append(get(i).getLeft());
|
||||
if (null != get(i).getRight())
|
||||
b.append(get(i).getRight());
|
||||
b.append('}');
|
||||
}
|
||||
return b.toString();
|
||||
}
|
||||
|
||||
/** Returns the ACIP corresponding to this TPairList. It will
|
||||
* be as ambiguous as the input. It may have more disambiguators
|
||||
* than the original, such as in the case of the ACIP {1234}. */
|
||||
String recoverACIP() {
|
||||
StringBuffer original = new StringBuffer();
|
||||
int sz = size();
|
||||
for (int i = 0; i < sz; i++) {
|
||||
TPair p = get(i);
|
||||
if (p.getLeft() != null)
|
||||
original.append(p.getLeft());
|
||||
if (p.getRight() != null)
|
||||
original.append(p.getRight());
|
||||
}
|
||||
return original.toString();
|
||||
}
|
||||
|
||||
/** Returns true if this list contains ( . <vowel>) or (A . ),
|
||||
* which are two simple errors you encounter if you interpret DAA
|
||||
* or TAA or DAI or DAE the wrong way. */
|
||||
boolean hasSimpleError() {
|
||||
int sz = size();
|
||||
for (int i = 0; i < sz; i++) {
|
||||
TPair p = get(i);
|
||||
if ((null == p.getLeft() && !"-".equals(p.getRight()))
|
||||
|| ("A".equals(p.getLeft()) && null == p.getRight()))
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// DLC [THE FOLLOWIN... appears, so [#comment] or [comment] is possible. [BLANK PAGE] [MISSING PAGE] [FIRST] [SECOND] [DD1] [DD2] [46A.2] [THE ... [FOLLOWING... [PAGE ... [THESE ... @[7B] [SW: OK] [A FIRST... [ADDENDUM... [END ... [Additional [Some [Note [MISSING [DDD] [INCOMPLETE [LINE [DATA
|
||||
// [A pair of ... which is part of the text! S0200A.ACE
|
||||
// [D] is a correction, eh?
|
||||
|
||||
|
||||
// DLC BDE 'BA' ZHIG RGYUN DU BSTEN, ,YENGS KYANG THUB NA [GNYEN PO,)
|
||||
// 'BYONGS [BLO,) S0375M.ACT
|
||||
|
||||
|
||||
// S0011N.ACT contains [SMON TSIG 'DI'I RTZOM MING MI GSAL,], why the brackets? IS all this really a correction? DLC?
|
||||
// DLC: what are () for?
|
||||
|
||||
/** Finds errors so simple that they can be detected without using
|
||||
* the rules of Tibetan spelling (i.e., tsheg bar syntax).
|
||||
* Returns an error message, or null if there is no error that
|
||||
* you can find without the help of tsheg bar syntax rules. */
|
||||
// DLC RENAME
|
||||
// DLC FIXME: 9BLTA is an error, numbers are all or nothing
|
||||
String getACIPError() {
|
||||
int sz = size();
|
||||
if (0 == sz)
|
||||
return "Warning, empty tsheg bar found while converting from ACIP!";
|
||||
boolean first = true;
|
||||
StringBuffer rv = null;
|
||||
boolean mustBeEntirelyNumeric = get(0).isNumeric();
|
||||
for (int i = 0; i < sz; i++) {
|
||||
TPair p = get(i);
|
||||
if (mustBeEntirelyNumeric != p.isNumeric())
|
||||
return "Cannot convert ACIP " + recoverACIP() + " because it contains a number but also a non-number.";
|
||||
|
||||
if ((i == 0 && "V".equals(p.getLeft()))
|
||||
|| (i > 0 && "V".equals(p.getLeft())
|
||||
&& (null != get(i - 1).getRight()
|
||||
&& !"+".equals(get(i - 1).getRight())))) {
|
||||
if (first) {
|
||||
first = false;
|
||||
rv = new StringBuffer("Cannot convert ACIP ");
|
||||
rv.append(recoverACIP());
|
||||
rv.append(" because {V}, wa-zur, appears without being subscribed to a consonant.");
|
||||
} else {
|
||||
rv.append("; also, {V}, wa-zur, appears without being subscribed to a consonant");
|
||||
}
|
||||
} else if ("A".equals(p.getLeft()) && (null == p.getRight() || "".equals(p.getRight()))) {
|
||||
if (first) {
|
||||
first = false;
|
||||
rv = new StringBuffer("Cannot convert ACIP ");
|
||||
rv.append(recoverACIP());
|
||||
rv.append(" because we would be required to assume that {A} is a consonant, when it is not clear if it is a consonant or a vowel.");
|
||||
} else {
|
||||
rv.append("; also, we would be required to assume that {A} is a consonant, when it is not clear if it is a consonant or a vowel.");
|
||||
}
|
||||
} else if ((null == p.getLeft() && !"-".equals(p.getRight()))
|
||||
|| (null != p.getLeft()
|
||||
&& !ACIPRules.isConsonant(p.getLeft())
|
||||
&& !p.isNumeric())) {
|
||||
if (first) {
|
||||
first = false;
|
||||
rv = new StringBuffer("Cannot convert ACIP ");
|
||||
rv.append(recoverACIP());
|
||||
rv.append(" because ");
|
||||
if (null == p.getLeft()) {
|
||||
rv.append(p.getRight());
|
||||
rv.append(" is a \"vowel\" without an associated consonant");
|
||||
} else {
|
||||
rv.append(p.getLeft());
|
||||
rv.append(" is not an ACIP consonant");
|
||||
}
|
||||
} else {
|
||||
if (null == p.getLeft()) {
|
||||
rv.append("; also, ");
|
||||
rv.append(p.getRight());
|
||||
rv.append(" is an ACIP \"vowel\" without an associated consonant");
|
||||
} else {
|
||||
rv.append("; also, ");
|
||||
rv.append(p.getLeft());
|
||||
rv.append(" is not an ACIP consonant");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if ("+".equals(get(sz - 1).getRight())) {
|
||||
if (first) {
|
||||
first = false;
|
||||
rv = new StringBuffer("Cannot convert ACIP ");
|
||||
rv.append(recoverACIP());
|
||||
rv.append(" because it ends with a {+}.");
|
||||
} else {
|
||||
rv.append("; also, it ends with a {+}.");
|
||||
}
|
||||
}
|
||||
|
||||
// DLC really this is a warning, not an error:
|
||||
if ("-".equals(get(sz - 1).getRight())) {
|
||||
if (first) {
|
||||
first = false;
|
||||
rv = new StringBuffer("Cannot convert ACIP ");
|
||||
rv.append(recoverACIP());
|
||||
rv.append(" because it ends with a {-}.");
|
||||
} else {
|
||||
rv.append("; also, it ends with a {-}.");
|
||||
}
|
||||
}
|
||||
|
||||
return (rv == null) ? null : rv.toString();
|
||||
}
|
||||
|
||||
/** Returns true if and only if either x is an TPairList object
|
||||
* representing the same TPairs in the same order or x is a
|
||||
* String that is equals to the result of {@link #toString()}. */
|
||||
public boolean equals(Object x) {
|
||||
if (x instanceof TPairList) {
|
||||
return al.equals(((TPairList)x).al);
|
||||
} else if (x instanceof String) {
|
||||
return toString().equals(x) || toString2().equals(x);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/** Returns true if and only if this list is empty. */
|
||||
public boolean isEmpty() { return al.isEmpty(); }
|
||||
|
||||
/** Returns a hashCode appropriate for use with our {@link
|
||||
* #equals(Object)} method. */
|
||||
public int hashCode() { return al.hashCode(); }
|
||||
|
||||
/** Entry of the blanks array in {@link #getParseTree()}: the
 *  current stack ends after this pair.  Plays the role of the digit
 *  "zero" when getParseTree() steps the array like a base-2
 *  counter. */
private static final int STOP_STACK = 0;
|
||||
/** Entry of the blanks array in {@link #getParseTree()}: the digit
 *  "one" of the base-2 counter; a pair marked this way does not end
 *  the current stack. */
private static final int KEEP_STACKING = 1;
|
||||
/** Fixed entry that the counter in {@link #getParseTree()} never
 *  modifies; assigned to pairs that have '+' on the right. */
private static final int ALWAYS_KEEP_STACKING = 2;
|
||||
/** Fixed entry that the counter in {@link #getParseTree()} never
 *  modifies; assigned to disambiguator pairs and to the last pair of
 *  an ambiguous stretch. */
private static final int ALWAYS_STOP_STACKING = 3;
|
||||
|
||||
// DLC TEST: BA'I has exactly two syntactically legal parses but just one TStackList.
|
||||
|
||||
/** Returns a set (as as ArrayList) of all possible
|
||||
* TStackLists. Uses knowledge of Tibetan spelling rules
|
||||
* (i.e., tsheg bar syntax) to do so. If this list of pairs has
|
||||
* something clearly illegal in it, or is empty, or is merely a
|
||||
* list of disambiguators etc., then this returns null. */
|
||||
public TParseTree getParseTree() {
|
||||
TParseTree pt = new TParseTree();
|
||||
int sz = size();
|
||||
int firstPair = 0;
|
||||
for (int i = 0; i < sz; i++) {
|
||||
|
||||
// We treat [(B . ), (G . +), (K . ), (T . A)] as if it
|
||||
// could be {B+G+K+T} or {B}{G+K}{T} or {B+G+K}{T} or
|
||||
// {B}{G+K+T} (modulo stack legality); we're conservative.
|
||||
// (Though some stacks won't be legal.)
|
||||
|
||||
|
||||
if (ddebug) System.out.println("i is " + i);
|
||||
TPair p = get(i);
|
||||
if (p.getRight() == null && firstPair + 1 < sz) {
|
||||
// Here's the ambiguity. Let's fill up sl. (B . ) (G
|
||||
// . +) (K . A) could be {B+G+KA} or {BA}{G+KA}, so we
|
||||
// go until we hit a vowel and then break into
|
||||
// TPairLists.
|
||||
int start = firstPair;
|
||||
int blanks[] = new int[sz - start]; // we may not use all of this.
|
||||
int j;
|
||||
for (j = start; j < sz; j++) {
|
||||
TPair pj = get(j);
|
||||
boolean isBlank;
|
||||
if (ddebug) System.out.println("right guy is " + pj.getRight());
|
||||
if (pj.isDisambiguator())
|
||||
blanks[j-start] = ALWAYS_STOP_STACKING;
|
||||
else {
|
||||
if (!(isBlank = (pj.getRight() == null)) && !"+".equals(pj.getRight())) {
|
||||
if (ddebug) System.out.println("breaker breaker at j=" + j);
|
||||
break;
|
||||
}
|
||||
blanks[j-start] = isBlank ? STOP_STACK : ALWAYS_KEEP_STACKING;
|
||||
}
|
||||
}
|
||||
if (j >= sz) j = sz - 1;
|
||||
|
||||
blanks[j-start] = ALWAYS_STOP_STACKING;
|
||||
|
||||
// get(j) [corresponding to blanks[j-i]] is
|
||||
// the last pair in the ambiguous stretch; get(i)
|
||||
// [corresponding to blanks[0]] is the first.
|
||||
|
||||
// We'll end up doing 2**(j-i+1) (i.e., (1 <<
|
||||
// (j-i+1))) iterations. If that's going to be too
|
||||
// many, let's just say there's no legal parse. FIXME:
|
||||
// give a nice error message in this case.
|
||||
if (ddebug) System.out.println("ddebug: we're going to do 2^" + (j-i+1) + " [or " + (1 << (j-i+1)) + "] wacky iterations!");
|
||||
if ((j-i+1) > 13) // if you don't use 13, then change PackageTest.testSlowestTshegBar().
|
||||
return new TParseTree();
|
||||
|
||||
boolean keepGoing = true;
|
||||
TStackListList sll = new TStackListList();
|
||||
do {
|
||||
// Add the stack list currently specified by
|
||||
// blanks if all the stacks in it are legal.
|
||||
// DLC DELETE {
|
||||
// ArrayList x = new ArrayList((j-start+1));
|
||||
// for (int ii = 0; ii < (j-start+1); ii++)
|
||||
// x.add(new Integer(blanks[ii]));
|
||||
// }
|
||||
TStackList sl = new TStackList(sz - start);
|
||||
boolean illegal = false;
|
||||
TPairList currentStack = new TPairList();
|
||||
for (int k = 0; k < j-start+1; k++) {
|
||||
TPair pk = get(start + k);
|
||||
if (!pk.isDisambiguator()) {
|
||||
currentStack.add(pk.insideStack());
|
||||
if (blanks[k] == STOP_STACK) {
|
||||
if (currentStack.isLegalTibetanOrSanskritStack())
|
||||
sl.add(currentStack.asStack());
|
||||
else {
|
||||
illegal = true;
|
||||
break;
|
||||
}
|
||||
currentStack = new TPairList();
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!illegal && !currentStack.isEmpty()) {
|
||||
if (currentStack.isLegalTibetanOrSanskritStack()) {
|
||||
TPairList stack = currentStack.asStack();
|
||||
if (ddebug) System.out.println("adding currentStack " + stack + " to sl " + sl);
|
||||
sl.add(stack);
|
||||
} else {
|
||||
illegal = true;
|
||||
}
|
||||
}
|
||||
if (!illegal) {
|
||||
if (ddebug) System.out.println("adding sl " + sl + " to sll " + sll);
|
||||
sll.add(sl);
|
||||
}
|
||||
|
||||
// Update blanks. Think of this as doing base 2
|
||||
// arithmetic where STOP_STACK is zero,
|
||||
// KEEP_STACKING is one, and ALWAYS_KEEP_STACKING
|
||||
// and ALWAYS_STOP_STACKING are digits we cannot
|
||||
// modify. We'll end up doing 2^M iterations,
|
||||
// where M is the number of fields in blanks that
|
||||
// are not equal to ALWAYS_KEEP_STACKING or
|
||||
// ALWAYS_STOP_STACKING.
|
||||
keepGoing = false;
|
||||
for (int k = j-start; k >= 0; k--) {
|
||||
if (blanks[k] == STOP_STACK) {
|
||||
keepGoing = true;
|
||||
blanks[k] = KEEP_STACKING;
|
||||
// reset all digits to the right of k to
|
||||
// "zero":
|
||||
for (int m = k + 1; m < j-start+1; m++) {
|
||||
if (blanks[m] == KEEP_STACKING)
|
||||
blanks[m] = STOP_STACK;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
} while (keepGoing);
|
||||
if (sll.isEmpty())
|
||||
return null; // STXAL or shT+ZNAGN, e.g.
|
||||
else {
|
||||
if (ddebug) System.out.println("adding sll " + sll + " to parse tree " + pt);
|
||||
pt.add(sll);
|
||||
}
|
||||
|
||||
if (ddebug) System.out.println("i is " + i + " and j is " + j + " and we are resetting so that i==j+1 next time.");
|
||||
i = j;
|
||||
firstPair = j + 1;
|
||||
} else if ("+".equals(p.getRight())) {
|
||||
// Keep firstPair where it is.
|
||||
} else {
|
||||
// Add all pairs in the range [firstPair, i]. Some
|
||||
// pairs are stacks all by themselves, some pairs have
|
||||
// '+' on the right and are thus just part of a stack.
|
||||
// We'll add a whole number of stacks, though.
|
||||
|
||||
// this is initialized to hold the max we might use:
|
||||
TStackListList sll
|
||||
= new TStackListList(i - firstPair + 1);
|
||||
|
||||
TPairList currentStack = new TPairList();
|
||||
for (int j = firstPair; j <= i; j++) {
|
||||
TPair pj = get(j);
|
||||
if (!pj.isDisambiguator()) {
|
||||
currentStack.add(pj.insideStack());
|
||||
if (!"+".equals(pj.getRight())) {
|
||||
if (currentStack.isLegalTibetanOrSanskritStack())
|
||||
sll.add(new TStackList(currentStack.asStack()));
|
||||
else {
|
||||
return null;
|
||||
}
|
||||
currentStack = new TPairList();
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!currentStack.isEmpty())
|
||||
throw new Error("how can this happen? currentStack is " + currentStack);
|
||||
|
||||
if (!sll.isEmpty()) {
|
||||
if (ddebug) System.out.println("adding sll " + sll + " to parse tree " + pt);
|
||||
pt.add(sll);
|
||||
firstPair = i + 1;
|
||||
} // else you probably have {G--YA} or something as
|
||||
// your tsheg bar.
|
||||
}
|
||||
}
|
||||
return pt;
|
||||
}
|
||||
|
||||
/** Returns true if and only if this list of TPairs can be
|
||||
* interpreted as a legal Tibetan stack or a legal Tibetanized
|
||||
* Sanskrit stack. This is private because a precondition is
|
||||
* that no vowels or disambiguators appear except possibly in the
|
||||
* final pair. */
|
||||
private boolean isLegalTibetanOrSanskritStack() {
|
||||
StringBuffer tibetan = new StringBuffer();
|
||||
StringBuffer sanskrit = new StringBuffer();
|
||||
int sz = size();
|
||||
|
||||
// Special case because otherwise wa-zur alone would be seen
|
||||
// as legal.
|
||||
if (sz == 1 && "V".equals(get(0).getLeft()))
|
||||
return false;
|
||||
|
||||
for (int i = 0; i < sz; i++) {
|
||||
TPair p = get(i);
|
||||
String ewts_form
|
||||
= ACIPRules.getWylieForACIPConsonant(p.getLeft());
|
||||
if (null == ewts_form) {
|
||||
if (p.isNumeric())
|
||||
ewts_form = p.getLeft();
|
||||
}
|
||||
if (null == ewts_form) {
|
||||
if (ddebug) System.out.println("testing " + toString2() + " for legality said false. numeric?" + p.isNumeric() + "[1]");
|
||||
return false;
|
||||
}
|
||||
tibetan.append(ewts_form);
|
||||
sanskrit.append(ewts_form);
|
||||
if (i + 1 < sz) {
|
||||
tibetan.append('-');
|
||||
sanskrit.append('+');
|
||||
}
|
||||
}
|
||||
boolean ans =
|
||||
(TibetanMachineWeb.hasGlyph(tibetan.toString())
|
||||
|| TibetanMachineWeb.hasGlyph(sanskrit.toString()));
|
||||
if (ddebug) System.out.println("testing " + toString2() + " for legality said " + ans + " [2]; san is " + sanskrit + " tib is " + tibetan + ".");
|
||||
return ans;
|
||||
}
|
||||
/** Compile-time switch: when true, the parsing code in this class
 *  prints trace messages to System.out. */
private static final boolean ddebug = false;
|
||||
|
||||
/** Mutates this TPairList object such that the last pair is
|
||||
* empty or is a vowel, but is never the stacking operator ('+')
|
||||
* or a disambiguator (i.e., a '-' on the right).
|
||||
* @return this instance */
|
||||
private TPairList asStack() {
|
||||
if (!isEmpty()) {
|
||||
TPair lastPair = get(size() - 1);
|
||||
if ("+".equals(lastPair.getRight()))
|
||||
al.set(size() - 1, new TPair(lastPair.getLeft(), null));
|
||||
else if ("-".equals(lastPair.getRight()))
|
||||
al.set(size() - 1, new TPair(lastPair.getLeft(), null));
|
||||
}
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Adds the TGCPairs corresponding to this list to the end of
|
||||
* pl. Some TPairs correspond to more than one TGCPair
|
||||
* ({AA:}); some TGCPairs correspond to more than one TPair
|
||||
* ({G+YA}). To keep track, indexList will be appended to in
|
||||
* lockstep with pl. index (wrapped as an {@link
|
||||
* java.lang#Integer}) will be appended to indexList once each
|
||||
* time we append to pl. This assumes that this TPairList
|
||||
* corresponds to exactly one Tibetan grapheme cluster (i.e.,
|
||||
* stack). Note that U+0F7F (ACIP {:}) is part of a stack, not a
|
||||
* stack all on its own. */
|
||||
void populateWithTGCPairs(ArrayList pl, ArrayList indexList, int index) {
|
||||
int sz = size();
|
||||
if (sz == 0) {
|
||||
return;
|
||||
} else {
|
||||
// drop the disambiguator, if there is one.
|
||||
|
||||
boolean isNumeric = false;
|
||||
StringBuffer lWylie = new StringBuffer();
|
||||
int i;
|
||||
// All pairs but the last:
|
||||
for (i = 0; i + 1 < sz; i++) {
|
||||
lWylie.append(get(i).getWylie());
|
||||
if (get(i).isNumeric())
|
||||
isNumeric = true;
|
||||
}
|
||||
|
||||
// The last pair:
|
||||
TPair p = get(i);
|
||||
ThdlDebug.verify(!"+".equals(p.getRight()));
|
||||
int where;
|
||||
boolean add_U0F7F = false;
|
||||
if (p.getRight() != null
|
||||
&& (where = p.getRight().indexOf(':')) >= 0) {
|
||||
// this ':' guy is his own TGCPair.
|
||||
add_U0F7F = true;
|
||||
StringBuffer rr = new StringBuffer(p.getRight());
|
||||
rr.deleteCharAt(where);
|
||||
p = new TPair(p.getLeft(), rr.toString());
|
||||
}
|
||||
boolean hasNonAVowel = (!"A".equals(p.getRight()) && null != p.getRight());
|
||||
String thislWylie = ACIPRules.getWylieForACIPConsonant(p.getLeft());
|
||||
if (thislWylie == null) {
|
||||
char ch;
|
||||
if (p.isNumeric()) {
|
||||
thislWylie = p.getLeft();
|
||||
isNumeric = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (null == thislWylie) throw new Error("BADNESS AT MAXIMUM: p is " + p + " and thislWylie is " + thislWylie);
|
||||
lWylie.append(thislWylie);
|
||||
StringBuffer ll = new StringBuffer(lWylie.toString());
|
||||
int ww;
|
||||
// DLC NOW: what about fixed-form RA on top??? test it.
|
||||
while ((ww = ll.indexOf("+")) >= 0)
|
||||
ll.deleteCharAt(ww);
|
||||
boolean isTibetan = TibetanMachineWeb.isWylieTibetanConsonantOrConsonantStack(ll.toString());
|
||||
boolean isSanskrit = TibetanMachineWeb.isWylieSanskritConsonantStack(lWylie.toString());
|
||||
if (!isTibetan && !isSanskrit && !isNumeric && true) {
|
||||
System.out.println("DLC: OTHER for " + lWylie + " with vowel " + ACIPRules.getWylieForACIPVowel(p.getRight()) + " and p.getRight()=" + p.getRight());
|
||||
}
|
||||
if (isTibetan && isSanskrit) isSanskrit = false; // RVA, e.g.
|
||||
if (true && hasNonAVowel && ACIPRules.getWylieForACIPVowel(p.getRight()) == null) {
|
||||
System.out.println("DLC: vowel " + ACIPRules.getWylieForACIPVowel(p.getRight()) + " and p.getRight()=" + p.getRight());
|
||||
}
|
||||
TGCPair tp;
|
||||
indexList.add(new Integer(index));
|
||||
tp = new TGCPair(lWylie.toString()
|
||||
+ (hasNonAVowel
|
||||
? ACIPRules.getWylieForACIPVowel(p.getRight())
|
||||
: ""),
|
||||
(isNumeric
|
||||
? TGCPair.OTHER
|
||||
: (hasNonAVowel
|
||||
? (isSanskrit
|
||||
? TGCPair.SANSKRIT_WITH_VOWEL
|
||||
: (isTibetan
|
||||
? TGCPair.CONSONANTAL_WITH_VOWEL
|
||||
: TGCPair.OTHER))
|
||||
: (isSanskrit
|
||||
? TGCPair.SANSKRIT_WITHOUT_VOWEL
|
||||
: (isTibetan
|
||||
? TGCPair.CONSONANTAL_WITHOUT_VOWEL
|
||||
: TGCPair.OTHER)))));
|
||||
pl.add(tp);
|
||||
if (add_U0F7F) {
|
||||
indexList.add(new Integer(index));
|
||||
pl.add(new TGCPair("H", TGCPair.OTHER));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// DLC FIXME: handle 'o' and 'x', e.g. KAo and NYAx.
|
167
source/org/thdl/tib/text/ttt/TPairListFactory.java
Normal file
167
source/org/thdl/tib/text/ttt/TPairListFactory.java
Normal file
|
@ -0,0 +1,167 @@
|
|||
/*
|
||||
The contents of this file are subject to the THDL Open Community License
|
||||
Version 1.0 (the "License"); you may not use this file except in compliance
|
||||
with the License. You may obtain a copy of the License on the THDL web site
|
||||
(http://www.thdl.org/).
|
||||
|
||||
Software distributed under the License is distributed on an "AS IS" basis,
|
||||
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
|
||||
License for the specific terms governing rights and limitations under the
|
||||
License.
|
||||
|
||||
The Initial Developer of this software is the Tibetan and Himalayan Digital
|
||||
Library (THDL). Portions created by the THDL are Copyright 2003 THDL.
|
||||
All Rights Reserved.
|
||||
|
||||
Contributor(s): ______________________________________.
|
||||
*/
|
||||
|
||||
package org.thdl.tib.text.ttt;
|
||||
|
||||
/** A factory for creating {@link TPairList TPairLists} from
|
||||
* Strings of ACIP.
|
||||
* @author David Chandler */
|
||||
class TPairListFactory {
|
||||
/** This class is not instantiable. */
|
||||
private TPairListFactory() { }
|
||||
|
||||
/** Returns a new TPairList instance. Breaks an ACIP tsheg bar
|
||||
* (roughly a "syllable") into chunks; this computes l'
|
||||
* (for you design doc enthusiasts).
|
||||
*
|
||||
* <p>Here's a rough sketch of the algorithm: run along getting
|
||||
* the current TPair as big as you can. If you get it very
|
||||
* big, but there's something illegal afterward that wouldn't
|
||||
* otherwise be illegal, undo as little as possible to correct.
|
||||
* For example, G'A'I becomes [(G . 'A), (' . I)], and TAA
|
||||
* becomes [(T . A)] in a first pass but then we see that the
|
||||
* rest would be suboptimal, so we backtrack to [(T . )] and then
|
||||
* finally become [(T . ), (A . A)]. We look for (A . ) and (
|
||||
* . <vowel>) in the rest in order to say "the rest would be
|
||||
* suboptimal", i.e. we use TPairList.hasSimpleError()
|
||||
* @param acip a string of ACIP with no punctuation in it */
|
||||
static TPairList breakACIPIntoChunks(String acip) {
|
||||
|
||||
// base case for our recursion:
|
||||
if ("".equals(acip))
|
||||
return new TPairList();
|
||||
|
||||
StringBuffer acipBuf = new StringBuffer(acip);
|
||||
int howMuchBuf[] = new int[1];
|
||||
TPair head = getFirstConsonantAndVowel(acipBuf, howMuchBuf);
|
||||
int howMuch = howMuchBuf[0];
|
||||
TPairList tail;
|
||||
if ((tail
|
||||
= breakACIPIntoChunks(acipBuf.substring(howMuch))).hasSimpleError()) {
|
||||
for (int i = 1; i < howMuch; i++) {
|
||||
// try giving i characters back if that leaves us with
|
||||
// a legal head and makes the rest free of simple
|
||||
// errors.
|
||||
TPairList newTail = null;
|
||||
TPair newHead;
|
||||
if ((newHead = head.minusNRightmostACIPCharacters(i)).isLegal()
|
||||
&& !(newTail
|
||||
= breakACIPIntoChunks(acipBuf.substring(howMuch - i))).hasSimpleError()) {
|
||||
newTail.prepend(newHead);
|
||||
return newTail;
|
||||
}
|
||||
}
|
||||
// It didn't work. Return the first thing we'd thought
|
||||
// of: head appended with tail. (I.e., fall through.)
|
||||
}
|
||||
tail.prepend(head);
|
||||
return tail;
|
||||
}
|
||||
|
||||
/** Returns the largest TPair we can make from the acip
|
||||
* starting from the left. This will return a size zero pair if
|
||||
* and only if acip is the empty string; otherwise, it may return
|
||||
* a pair with either the left or right component empty. This
|
||||
* mutates acip when we run into {NA+YA}; it mutates acip into
|
||||
* {N+YA}. For {NE+YA}, it doesn not mutate acip or behave
|
||||
* intelligently. A later phase will need to turn that into
|
||||
* {N+YE} (DLC). howMuch[0] will be set to the number of
|
||||
* characters of acip that this call has consumed. */
|
||||
private static TPair getFirstConsonantAndVowel(StringBuffer acip,
|
||||
int howMuch[]) {
|
||||
// Note that it is *not* the case that if acip.substring(0, N)
|
||||
// is legal (according to TPair.isLegal()), then
|
||||
// acip.substring(0, N-1) is legal for all N. For example,
|
||||
// think of {shA} and {KshA}. However, 's' is the only tricky
|
||||
// fellow, so it is true that acip.substring(0, N-1) is either
|
||||
// legal or ends with 's' if acip.substring(0, N) is legal.
|
||||
//
|
||||
// We don't, however, use this approach. We just try to find
|
||||
// a consonant of length 3, and then, failing that, of length
|
||||
// 2, etc. Likewise with vowels. This avoids the issue.
|
||||
|
||||
int i, xl = acip.length();
|
||||
if (0 == xl) {
|
||||
howMuch[0] = 0;
|
||||
return new TPair(null, null);
|
||||
}
|
||||
if (acip.charAt(0) == '-') {
|
||||
howMuch[0] = 1;
|
||||
return new TPair(null, "-");
|
||||
}
|
||||
char ch = acip.charAt(0);
|
||||
|
||||
// Numbers never appear in stacks, so if you see 1234, that's
|
||||
// like seeing 1-2-3-4.
|
||||
if (ch >= '0' && ch <= '9') {
|
||||
howMuch[0] = 1; // not 2...
|
||||
return new TPair(acip.substring(0, 1), (xl == 1) ? null : "-");
|
||||
}
|
||||
|
||||
String l = null, r = null;
|
||||
for (i = Math.min(ACIPRules.MAX_CONSONANT_LENGTH, xl); i >= 1; i--) {
|
||||
String t = null;
|
||||
if (ACIPRules.isConsonant(t = acip.substring(0, i))) {
|
||||
l = t;
|
||||
break;
|
||||
}
|
||||
}
|
||||
int ll = (null == l) ? 0 : l.length();
|
||||
if (null != l && xl > ll && acip.charAt(ll) == '-') {
|
||||
howMuch[0] = l.length() + 1;
|
||||
return new TPair(l, "-");
|
||||
}
|
||||
if (null != l && xl > ll && acip.charAt(ll) == '+') {
|
||||
howMuch[0] = l.length() + 1;
|
||||
return new TPair(l, "+");
|
||||
}
|
||||
for (i = Math.min(ACIPRules.MAX_VOWEL_LENGTH, xl - ll); i >= 1; i--) {
|
||||
String t = null;
|
||||
if (ACIPRules.isVowel(t = acip.substring(ll, ll + i))) {
|
||||
r = t;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Treat {BATA+SA'I} like {BAT+SA'I}:
|
||||
int z;
|
||||
if (null != l && "A".equals(r) && ((z = ll + "A".length()) < xl)
|
||||
&& acip.charAt(z) == '+') {
|
||||
acip.deleteCharAt(z-1);
|
||||
howMuch[0] = l.length() + 1;
|
||||
return new TPair(l, "+");
|
||||
}
|
||||
|
||||
// what if we see a character that's not part of any vowel or
|
||||
// consonant? We return it.
|
||||
if (null == l && null == r) {
|
||||
howMuch[0] = 1; // not 2...
|
||||
// add a '-' to avoid exponentials:
|
||||
return new TPair(acip.substring(0, 1), (xl == 1) ? null : "-");
|
||||
}
|
||||
|
||||
howMuch[0] = (((l == null) ? 0 : l.length())
|
||||
+ ((r == null) ? 0 : r.length()));
|
||||
return new TPair(l, r);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// DLC strip out [#...] comments; test for nested comments
|
||||
|
||||
// DLC see Translit directory on ACIP v4 CD-ROM
|
200
source/org/thdl/tib/text/ttt/TParseTree.java
Normal file
200
source/org/thdl/tib/text/ttt/TParseTree.java
Normal file
|
@ -0,0 +1,200 @@
|
|||
/*
|
||||
The contents of this file are subject to the THDL Open Community License
|
||||
Version 1.0 (the "License"); you may not use this file except in compliance
|
||||
with the License. You may obtain a copy of the License on the THDL web site
|
||||
(http://www.thdl.org/).
|
||||
|
||||
Software distributed under the License is distributed on an "AS IS" basis,
|
||||
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
|
||||
License for the specific terms governing rights and limitations under the
|
||||
License.
|
||||
|
||||
The Initial Developer of this software is the Tibetan and Himalayan Digital
|
||||
Library (THDL). Portions created by the THDL are Copyright 2003 THDL.
|
||||
All Rights Reserved.
|
||||
|
||||
Contributor(s): ______________________________________.
|
||||
*/
|
||||
|
||||
package org.thdl.tib.text.ttt;
|
||||
|
||||
import java.util.ArrayList;
|
||||
|
||||
/** A list of non-empty list of {@link #TStackListList
|
||||
* TStackListLists} representing all the ways you could break up a
|
||||
* tsheg bar of ACIP into stacks (i.e., grapheme clusters).
|
||||
*
|
||||
* @author David Chandler */
|
||||
class TParseTree {
|
||||
/** a fast, non-thread-safe, random-access list implementation: */
|
||||
private ArrayList al = new ArrayList();
|
||||
|
||||
/** Creates an empty list. */
|
||||
public TParseTree() { }
|
||||
|
||||
/** Returns the ith pair in this list. */
|
||||
public TStackListList get(int i) { return (TStackListList)al.get(i); }
|
||||
|
||||
/** Adds p to the end of this list. */
|
||||
public void add(TStackListList p)
|
||||
throws IllegalArgumentException
|
||||
{
|
||||
if (p.isEmpty())
|
||||
throw new IllegalArgumentException("p is empty");
|
||||
al.add(p);
|
||||
}
|
||||
|
||||
/** Returns the number of TStackListLists in this list. See
|
||||
* also {@link #numberOfParses()}, which gives a different
|
||||
* interpretation of the size of this tree. */
|
||||
public int size() { return al.size(); }
|
||||
|
||||
/** Returns the number of different parses one could make from
|
||||
* this parse tree. Returns zero if this list is empty. */
|
||||
public int numberOfParses() {
|
||||
if (al.isEmpty()) return 0;
|
||||
int k = 1;
|
||||
int sz = size();
|
||||
for (int i = 0; i < sz; i++) {
|
||||
k *= get(i).size();
|
||||
}
|
||||
return k;
|
||||
}
|
||||
|
||||
/** Returns the number of {@link #TPair pairs} that are in a
|
||||
* parse of this tree. */
|
||||
public int numberOfPairs() {
|
||||
if (al.isEmpty()) return 0;
|
||||
int k = 1;
|
||||
int sz = size();
|
||||
for (int i = 0; i < sz; i++) {
|
||||
// get(i).get(0) is the same size as get(i).get(1),
|
||||
// get(i).get(2), ...
|
||||
k += get(i).get(0).size();
|
||||
}
|
||||
return k;
|
||||
}
|
||||
|
||||
/** Returns an iterator that will iterate over the {@link
|
||||
* #numberOfParses} available. */
|
||||
public ParseIterator getParseIterator() {
|
||||
return new ParseIterator(al);
|
||||
}
|
||||
|
||||
/** Returns a list containing the legal parses of this parse tree.
|
||||
* By "legal", we mean a sequence of stacks that is
|
||||
* legal by the rules of Tibetan tsheg bar syntax (sometimes
|
||||
* called spelling). This will return the {G-YA} parse of {GYA}
|
||||
* as well as the {GYA} parse, so watch yourself. */
|
||||
public TStackListList getLegalParses() {
|
||||
TStackListList sll = new TStackListList(2); // save memory
|
||||
ParseIterator pi = getParseIterator();
|
||||
while (pi.hasNext()) {
|
||||
TStackList sl = pi.next();
|
||||
if (sl.isLegalTshegBar().isLegal) {
|
||||
sll.add(sl);
|
||||
}
|
||||
}
|
||||
return sll;
|
||||
}
|
||||
|
||||
/** Returns a list containing the parses of this parse tree that
|
||||
* are not clearly illegal. */
|
||||
public TStackListList getNonIllegalParses() {
|
||||
TStackListList sll = new TStackListList(2); // save memory
|
||||
ParseIterator pi = getParseIterator();
|
||||
while (pi.hasNext()) {
|
||||
TStackList sl = pi.next();
|
||||
if (!sl.isClearlyIllegal()) {
|
||||
sll.add(sl);
|
||||
}
|
||||
}
|
||||
return sll;
|
||||
}
|
||||
|
||||
/** Returns the best parse, if there is a unique parse that is
|
||||
* clearly preferred to other parses. Basically, if there's a
|
||||
* unique legal parse, you get it. If there's not, but there is
|
||||
* a unique non-illegal parse, you get it. If there's not a
|
||||
* unique answer, null is returned. */
|
||||
// {TZANDRA} is not solved by this, DLC NOW. Solve PADMA PROBLEM!
|
||||
|
||||
// DLC by using this we can get rid of single-sanskrit-gc, eh?
|
||||
public TStackList getBestParse() {
|
||||
TStackListList up = getUniqueParse();
|
||||
if (up.size() == 1)
|
||||
return up.get(0);
|
||||
else if (up.size() == 2) {
|
||||
}
|
||||
up = getNonIllegalParses();
|
||||
int sz = up.size();
|
||||
if (up.size() == 1) {
|
||||
return up.get(0);
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/** Returns a list containing the unique legal parse of this parse
|
||||
* tree if there is a unique legal parse. Note that {SRAS} has a
|
||||
* unique legal parse, though {SRS} has two equally good parses;
|
||||
* i.e., note that the {A} vowel is treated specially here
|
||||
* (unlike in {@link #getLegalParses()}). Returns an empty list
|
||||
* if there are no legal parses. Returns a list containing all
|
||||
* legal parses if there two or more equally good parses. By
|
||||
* "legal", we mean a sequence of stacks that is legal
|
||||
* by the rules of Tibetan tsheg bar syntax (sometimes called
|
||||
* spelling). */
|
||||
public TStackListList getUniqueParse() {
|
||||
TStackListList allLegalParses = new TStackListList(2); // save memory
|
||||
TStackListList legalParsesWithVowelOnRoot = new TStackListList(1);
|
||||
ParseIterator pi = getParseIterator();
|
||||
while (pi.hasNext()) {
|
||||
TStackList sl = pi.next();
|
||||
BoolPair bpa = sl.isLegalTshegBar();
|
||||
if (bpa.isLegal) {
|
||||
if (bpa.isLegalAndHasAVowelOnRoot)
|
||||
legalParsesWithVowelOnRoot.add(sl);
|
||||
allLegalParses.add(sl);
|
||||
}
|
||||
}
|
||||
if (legalParsesWithVowelOnRoot.size() == 1)
|
||||
return legalParsesWithVowelOnRoot;
|
||||
else {
|
||||
if (legalParsesWithVowelOnRoot.size() == 2) {
|
||||
// DLC is this even valid?
|
||||
if (legalParsesWithVowelOnRoot.get(0).size() != 1 + legalParsesWithVowelOnRoot.get(1).size())
|
||||
throw new Error("Something other than the G-YA vs. GYA case appeared. Sorry for your trouble! " + legalParsesWithVowelOnRoot.get(0) + " ;; " + legalParsesWithVowelOnRoot.get(1));
|
||||
return new TStackListList(legalParsesWithVowelOnRoot.get(1));
|
||||
}
|
||||
if (allLegalParses.size() == 2) {
|
||||
// DLC is this even valid?
|
||||
if (allLegalParses.get(0).size() != 1 + allLegalParses.get(1).size())
|
||||
throw new Error("Something other than the G-YA vs. GYA case appeared. Sorry for your trouble! " + allLegalParses.get(0) + " ;; " + allLegalParses.get(1));
|
||||
return new TStackListList(allLegalParses.get(1));
|
||||
}
|
||||
return allLegalParses;
|
||||
}
|
||||
}
|
||||
|
||||
/** Returns a human-readable representation. */
|
||||
public String toString() {
|
||||
return al.toString();
|
||||
}
|
||||
|
||||
/** Returns true if and only if either x is an TParseTree
|
||||
* object representing the same TPairLists in the same order
|
||||
* or x is a String that is equals to the result of {@link
|
||||
* #toString()}. */
|
||||
public boolean equals(Object x) {
|
||||
if (x instanceof TParseTree) {
|
||||
return al.equals(((TParseTree)x).al);
|
||||
} else if (x instanceof String) {
|
||||
return toString().equals(x);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/** Returns a hashCode appropriate for use with our {@link
|
||||
* #equals(Object)} method. */
|
||||
public int hashCode() { return al.hashCode(); }
|
||||
}
|
176
source/org/thdl/tib/text/ttt/TStackList.java
Normal file
176
source/org/thdl/tib/text/ttt/TStackList.java
Normal file
|
@ -0,0 +1,176 @@
|
|||
/*
|
||||
The contents of this file are subject to the THDL Open Community License
|
||||
Version 1.0 (the "License"); you may not use this file except in compliance
|
||||
with the License. You may obtain a copy of the License on the THDL web site
|
||||
(http://www.thdl.org/).
|
||||
|
||||
Software distributed under the License is distributed on an "AS IS" basis,
|
||||
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
|
||||
License for the specific terms governing rights and limitations under the
|
||||
License.
|
||||
|
||||
The Initial Developer of this software is the Tibetan and Himalayan Digital
|
||||
Library (THDL). Portions created by the THDL are Copyright 2003 THDL.
|
||||
All Rights Reserved.
|
||||
|
||||
Contributor(s): ______________________________________.
|
||||
*/
|
||||
|
||||
package org.thdl.tib.text.ttt;
|
||||
|
||||
import org.thdl.tib.text.TibTextUtils;
|
||||
import org.thdl.tib.text.TGCList;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.ListIterator;
|
||||
|
||||
/** A list of {@link TPairList TPairLists}, each of which is for
|
||||
* a stack (a grapheme cluster), typically corresponding to one tsheg
|
||||
* bar.
|
||||
*
|
||||
* @author David Chandler */
|
||||
class TStackList {
|
||||
/** FIXME: change me and see if performance improves. */
|
||||
private static final int INITIAL_SIZE = 1;
|
||||
|
||||
/** a fast, non-thread-safe, random-access list implementation: */
|
||||
private ArrayList al;
|
||||
|
||||
/** Creates an empty list whose backing store starts out at
 *  INITIAL_SIZE. */
public TStackList() {
    al = new ArrayList(INITIAL_SIZE);
}
|
||||
|
||||
/** Creates a one-element list.
 *  @param p the single stack this list starts out with */
public TStackList(TPairList p) {
    this.al = new ArrayList(1);
    this.add(p);
}
|
||||
|
||||
/** Creates an empty list presized for N stacks.
 *  @param N the initial capacity of the backing store */
public TStackList(int N) {
    this.al = new ArrayList(N);
}
|
||||
|
||||
/** Returns the ith pair in this list. */
|
||||
public TPairList get(int i) { return (TPairList)al.get(i); }
|
||||
|
||||
/** Adds p to the end of this list. */
|
||||
public void add(TPairList p) { al.add(p); }
|
||||
|
||||
/** Adds all the stacks in c to the end of this list. */
|
||||
public void addAll(TStackList c) { al.addAll(c.al); }
|
||||
|
||||
/** Adds all the stacks in c to this list, inserting them at
|
||||
* position k. */
|
||||
public void addAll(int k, TStackList c) { al.addAll(k, c.al); }
|
||||
|
||||
/** Returns the number of TPairLists in this list. */
|
||||
public int size() { return al.size(); }
|
||||
|
||||
/** Returns true if and only if this list is empty. */
|
||||
public boolean isEmpty() { return al.isEmpty(); }
|
||||
|
||||
/** Returns a human-readable representation like {G}{YA} or
|
||||
* {GYA}. */
|
||||
public String toString() {
|
||||
int sz = size();
|
||||
StringBuffer b = new StringBuffer();
|
||||
for (int i = 0; i < sz; i++) {
|
||||
b.append('{');
|
||||
b.append(get(i).recoverACIP());
|
||||
b.append('}');
|
||||
}
|
||||
return b.toString();
|
||||
}
|
||||
|
||||
/** Returns a human-readable representation.
|
||||
* @return something like [[(R . ), (D . O)], [(R . ), (J . E)]] */
|
||||
public String toString2() {
|
||||
return al.toString();
|
||||
}
|
||||
|
||||
/** Returns true if and only if either x is an TStackList
|
||||
* object representing the same TPairLists in the same
|
||||
* order or x is a String that is equals to the result of {@link
|
||||
* #toString()}. */
|
||||
public boolean equals(Object x) {
|
||||
if (x instanceof TStackList) {
|
||||
return al.equals(((TStackList)x).al);
|
||||
} else if (x instanceof String) {
|
||||
return toString().equals(x) || toString2().equals(x);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/** Returns a hashCode appropriate for use with our {@link
|
||||
* #equals(Object)} method. */
|
||||
public int hashCode() { return al.hashCode(); }
|
||||
|
||||
/** Returns an iterator for this list. Mutate this list while
|
||||
* iterating and you'll have to read the code to know what will
|
||||
* happen. */
|
||||
public ListIterator listIterator() { return al.listIterator(); }
|
||||
|
||||
/** Returns a pair with {@link BoolPair#isLegal} true if and only
|
||||
* if this list of stacks is a legal tsheg bar by the rules of
|
||||
* Tibetan syntax (sometimes called rules of spelling). If this
|
||||
* is legal, then {@link BoolPair#isLegalAndHasAVowelOnRoot} will
|
||||
* be true if and only if there is an explicit {A} vowel on the
|
||||
* root stack. */
|
||||
public BoolPair isLegalTshegBar() {
|
||||
// DLC handle PADMA and other Tibetanized Sanskrit fellows. Right now we only handle single-stack guys.
|
||||
|
||||
TTGCList tgcList = new TTGCList(this);
|
||||
StringBuffer warnings = new StringBuffer();
|
||||
String candidateType
|
||||
= TibTextUtils.getClassificationOfTshegBar(tgcList, warnings);
|
||||
// System.out.println("DLC: " + toString() + " has candidateType " + candidateType + " and warnings " + warnings);
|
||||
|
||||
// preliminary answer:
|
||||
boolean isLegal = (candidateType != "invalid");
|
||||
|
||||
if (isLegal) {
|
||||
if (isClearlyIllegal())
|
||||
isLegal = false;
|
||||
}
|
||||
|
||||
boolean isLegalAndHasAVowelOnRoot = false;
|
||||
if (isLegal) {
|
||||
int rootIndices[]
|
||||
= TibTextUtils.getIndicesOfRootForCandidateType(candidateType);
|
||||
for (int i = 0; i < 2; i++) {
|
||||
if (rootIndices[i] >= 0) {
|
||||
int pairListIndex = tgcList.getTPairListIndex(rootIndices[i]);
|
||||
TPairList pl = get(pairListIndex);
|
||||
TPair p = pl.get(pl.size() - 1);
|
||||
isLegalAndHasAVowelOnRoot
|
||||
= (p.getRight() != null && p.getRight().startsWith("A")); // could be {A:}, e.g.
|
||||
if (isLegalAndHasAVowelOnRoot)
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
return new BoolPair(isLegal, isLegalAndHasAVowelOnRoot);
|
||||
}
|
||||
|
||||
/** Returns true if and only if this stack list contains a clearly
|
||||
* illegal construct, such as an TPair (V . something). */
|
||||
boolean isClearlyIllegal() {
|
||||
// check for {D}{VA} sorts of things:
|
||||
for (int i = 0; i < size(); i++) {
|
||||
if (get(i).getACIPError() != null) {
|
||||
System.out.println("DLC: error is " + get(i).getACIPError());
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
class BoolPair {
|
||||
boolean isLegal;
|
||||
boolean isLegalAndHasAVowelOnRoot;
|
||||
BoolPair(boolean isLegal, boolean isLegalAndHasAVowelOnRoot) {
|
||||
this.isLegal = isLegal;
|
||||
this.isLegalAndHasAVowelOnRoot = isLegalAndHasAVowelOnRoot;
|
||||
}
|
||||
}
|
86
source/org/thdl/tib/text/ttt/TStackListList.java
Normal file
86
source/org/thdl/tib/text/ttt/TStackListList.java
Normal file
|
@ -0,0 +1,86 @@
|
|||
/*
|
||||
The contents of this file are subject to the THDL Open Community License
|
||||
Version 1.0 (the "License"); you may not use this file except in compliance
|
||||
with the License. You may obtain a copy of the License on the THDL web site
|
||||
(http://www.thdl.org/).
|
||||
|
||||
Software distributed under the License is distributed on an "AS IS" basis,
|
||||
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
|
||||
License for the specific terms governing rights and limitations under the
|
||||
License.
|
||||
|
||||
The Initial Developer of this software is the Tibetan and Himalayan Digital
|
||||
Library (THDL). Portions created by the THDL are Copyright 2003 THDL.
|
||||
All Rights Reserved.
|
||||
|
||||
Contributor(s): ______________________________________.
|
||||
*/
|
||||
|
||||
package org.thdl.tib.text.ttt;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.ListIterator;
|
||||
|
||||
/** A list of {@link #TStackList} objects, each of which is for a
|
||||
* stack (a grapheme cluster), typically corresponding to one
|
||||
* ambiguous section of a tsheg bar.
|
||||
*
|
||||
* @author David Chandler */
|
||||
class TStackListList {
|
||||
/** a fast, non-thread-safe, random-access list implementation: */
|
||||
private ArrayList al;
|
||||
|
||||
/** Creates an empty list. */
|
||||
public TStackListList() { al = new ArrayList(); }
|
||||
|
||||
/** Creates a list containing just p. */
|
||||
public TStackListList(TStackList p) {
|
||||
al = new ArrayList(1);
|
||||
add(p);
|
||||
}
|
||||
|
||||
/** Creates an empty list with the capacity to hold N items. */
|
||||
public TStackListList(int N) {
|
||||
al = new ArrayList(N);
|
||||
}
|
||||
|
||||
/** Returns the ith pair in this list. */
|
||||
public TStackList get(int i) { return (TStackList)al.get(i); }
|
||||
|
||||
/** Adds p to the end of this list. */
|
||||
public void add(TStackList p) { al.add(p); }
|
||||
|
||||
/** Returns the number of TStackList objects in this list. */
|
||||
public int size() { return al.size(); }
|
||||
|
||||
/** Returns true if and only if this list is empty. */
|
||||
public boolean isEmpty() { return al.isEmpty(); }
|
||||
|
||||
/** Returns a human-readable representation.
|
||||
* @return something like [[[(R . ), (D . O)], [(R . ), (J . E)]]] */
|
||||
public String toString() {
|
||||
return al.toString();
|
||||
}
|
||||
|
||||
/** Returns true if and only if either x is an TStackListList
|
||||
* object representing the same TStackList objects in the same
|
||||
* order or x is a String that is equals to the result of {@link
|
||||
* #toString()}. */
|
||||
public boolean equals(Object x) {
|
||||
if (x instanceof TStackListList) {
|
||||
return al.equals(((TStackListList)x).al);
|
||||
} else if (x instanceof String) {
|
||||
return toString().equals(x);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/** Returns a hashCode appropriate for use with our {@link
|
||||
* #equals(Object)} method. */
|
||||
public int hashCode() { return al.hashCode(); }
|
||||
|
||||
/** Returns an iterator for this list. Mutate this list while
|
||||
* iterating and you'll have to read the code to know what will
|
||||
* happen. */
|
||||
public ListIterator listIterator() { return al.listIterator(); }
|
||||
}
|
63
source/org/thdl/tib/text/ttt/TTGCList.java
Normal file
63
source/org/thdl/tib/text/ttt/TTGCList.java
Normal file
|
@ -0,0 +1,63 @@
|
|||
/*
|
||||
The contents of this file are subject to the THDL Open Community License
|
||||
Version 1.0 (the "License"); you may not use this file except in compliance
|
||||
with the License. You may obtain a copy of the License on the THDL web site
|
||||
(http://www.thdl.org/).
|
||||
|
||||
Software distributed under the License is distributed on an "AS IS" basis,
|
||||
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
|
||||
License for the specific terms governing rights and limitations under the
|
||||
License.
|
||||
|
||||
The Initial Developer of this software is the Tibetan and Himalayan Digital
|
||||
Library (THDL). Portions created by the THDL are Copyright 2003 THDL.
|
||||
All Rights Reserved.
|
||||
|
||||
Contributor(s): ______________________________________.
|
||||
*/
|
||||
|
||||
package org.thdl.tib.text.ttt;
|
||||
|
||||
import org.thdl.tib.text.TGCList;
|
||||
import org.thdl.tib.text.TGCPair;
|
||||
|
||||
import java.util.ArrayList;
|
||||
|
||||
/** A list of grapheme clusters.
|
||||
*
|
||||
* @author David Chandler */
|
||||
class TTGCList implements TGCList {
|
||||
// I could use one list of an ordered pair (TGCPair, int), but I
|
||||
// use two lists.
|
||||
private ArrayList al;
|
||||
private ArrayList stackIndices;
|
||||
|
||||
/** Don't use this. */
|
||||
private TTGCList() { }
|
||||
|
||||
/** Creates a TGCList. */
|
||||
public TTGCList(TStackList sl) {
|
||||
al = new ArrayList();
|
||||
stackIndices = new ArrayList();
|
||||
int sz = sl.size();
|
||||
for (int i = 0; i < sz; i++) {
|
||||
sl.get(i).populateWithTGCPairs(al, stackIndices, i);
|
||||
}
|
||||
}
|
||||
|
||||
/** Returns the ith pair in this list. */
|
||||
public TGCPair get(int i) {
|
||||
return (TGCPair)al.get(i);
|
||||
}
|
||||
|
||||
/** Returns the number of TGCPairs in this list. */
|
||||
public int size() { return al.size(); }
|
||||
|
||||
/** Returns a zero-based index of an TPairList inside the stack
|
||||
* list from which this list was constructed. This pair list is
|
||||
* the one that caused the TGCPair at index tgcPairIndex to come
|
||||
* into existence. */
|
||||
public int getTPairListIndex(int tgcPairIndex) {
|
||||
return ((Integer)stackIndices.get(tgcPairIndex)).intValue();
|
||||
}
|
||||
}
|
31
source/org/thdl/tib/text/ttt/package.html
Normal file
31
source/org/thdl/tib/text/ttt/package.html
Normal file
|
@ -0,0 +1,31 @@
|
|||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
|
||||
<html>
|
||||
<head>
|
||||
<!--
|
||||
|
||||
@(#)package.html
|
||||
|
||||
Copyright 2003 Tibetan and Himalayan Digital Library
|
||||
|
||||
This software is the confidential and proprietary information of
|
||||
the Tibetan and Himalayan Digital Library. You shall use such
|
||||
information only in accordance with the terms of the license
|
||||
agreement you entered into with the THDL.
|
||||
|
||||
-->
|
||||
</head>
|
||||
<body bgcolor="white">
|
||||
|
||||
Provides classes and methods for converting Latin transliteration of
|
||||
Tibetan text into Tibetan.
|
||||
<p>
|
||||
This package (whose name, ttt, stands for transliteration-to-Tibetan)
|
||||
contains methods for converting ACIP transliteration into Tibetan
|
||||
Machine Web and methods for converting EWTS transliteration into
|
||||
Tibetan Machine Web. It has extensive tests, though probably not
|
||||
mentioned in these Javadoc documents.
|
||||
</p>
|
||||
<h2>Related Documentation</h2>
|
||||
@see <a href="../package-summary.html">org.thdl.tib.text</a>
|
||||
</body>
|
||||
</html>
|
Loading…
Reference in a new issue