From 5aab4acc93c17e4c66f1c4fddf1c9de08003bf01 Mon Sep 17 00:00:00 2001
From: dchandler
Date: Sun, 19 Oct 2003 20:48:22 +0000
Subject: [PATCH] I've undone the SNYAM'AM == SNYAMA'AM hack. The only
occurrence of SNYAM'AM in the ACIP texts I've got is likely a typo, says
Robert Chilton.
The code would be cleaner if I could bear to delete my terrible hack. Maybe in a month, when I don't feel so dumb for coding it up in the first place.
The correct solution for such things is to give the ACIP->Tibetan converters a pre-filter mechanism. It would run before the lexer or as part of the lexer (maybe you only want to filter tsheg bars), and it would let the end user specify substitutions like "s/SNYAM'AM/S+NYAMA'AMA/g".
---
.../org/thdl/tib/text/ttt/ACIPConverter.java | 2 +-
source/org/thdl/tib/text/ttt/PackageTest.java | 6 +--
.../thdl/tib/text/ttt/TPairListFactory.java | 44 +++++++++++++------
3 files changed, 34 insertions(+), 18 deletions(-)
diff --git a/source/org/thdl/tib/text/ttt/ACIPConverter.java b/source/org/thdl/tib/text/ttt/ACIPConverter.java
index fe9e048..834b59d 100644
--- a/source/org/thdl/tib/text/ttt/ACIPConverter.java
+++ b/source/org/thdl/tib/text/ttt/ACIPConverter.java
@@ -371,7 +371,7 @@ public class ACIPConverter {
Object[] duff = null;
if (stype == TString.TIBETAN_NON_PUNCTUATION) {
lastGuyWasNonPunct = true;
- TPairList pls[] = TPairListFactory.breakACIPIntoChunks(s.getText());
+ TPairList pls[] = TPairListFactory.breakACIPIntoChunks(s.getText(), false);
String acipError;
if ((acipError = pls[0].getACIPError()) != null
diff --git a/source/org/thdl/tib/text/ttt/PackageTest.java b/source/org/thdl/tib/text/ttt/PackageTest.java
index df8d864..1f9e130 100644
--- a/source/org/thdl/tib/text/ttt/PackageTest.java
+++ b/source/org/thdl/tib/text/ttt/PackageTest.java
@@ -94,7 +94,7 @@ public class PackageTest extends TestCase {
String[] expectedLegalParses,
String expectedBestParse,
int pairListToUse) {
- TPairList[] la = TPairListFactory.breakACIPIntoChunks(acip);
+ TPairList[] la = TPairListFactory.breakACIPIntoChunks(acip, true);
TPairList l = la[(pairListToUse == -1) ? 0 : ((pairListToUse >= 1) ? 1 : pairListToUse)];
if (sdebug || debug)
System.out.println("ACIP=" + acip + " and l'=" + l);
@@ -268,8 +268,8 @@ public class PackageTest extends TestCase {
tstHelper("9012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678");
}
- /** Tests {@link TPairListFactory#breakACIPIntoChunks(String)},
- * {@link TPairList#getACIPError()}, and {@link
+ /** Tests {@link TPairListFactory#breakACIPIntoChunks(String,
+ * boolean)}, {@link TPairList#getACIPError()}, and {@link
* TPairList#recoverACIP()}. */
public void testBreakACIPIntoChunks() {
tstHelper("GASN"); // ambiguous with regard to prefix rules
diff --git a/source/org/thdl/tib/text/ttt/TPairListFactory.java b/source/org/thdl/tib/text/ttt/TPairListFactory.java
index c9d6a86..09b14e3 100644
--- a/source/org/thdl/tib/text/ttt/TPairListFactory.java
+++ b/source/org/thdl/tib/text/ttt/TPairListFactory.java
@@ -25,9 +25,9 @@ class TPairListFactory {
/** This class is not instantiable. */
private TPairListFactory() { }
- /** Returns a new TPairList instance. Breaks an ACIP tsheg bar
- * (roughly a "syllable") into chunks; this computes l'
- * (for you design doc enthusiasts).
+ /** Returns one or two new TPairList instances. Breaks an ACIP
+ * tsheg bar (roughly a "syllable") into chunks; this
+ * computes l' (for you design doc enthusiasts).
*
* Here's a rough sketch of the algorithm: run along getting
* the current TPair as big as you can. If you get it very
@@ -41,16 +41,27 @@ class TPairListFactory {
* suboptimal", i.e. we use TPairList.hasSimpleError().
*
* There is one case where we break things up into two pair
- * lists -- I found out about this case too late to do anything
- * clean about it. SNYAM'AM, e.g., breaks up into [(S . ), (NY
- * . A), (M . 'A), (M . )], which is incorrect -- [(S . ), (NY
- * . A), (M . ), (' . A), (M . )] is correct. But we don't know
- * which is correct without parsing, so both are returned. The
- * clean treatment (low-priority FIXME) would be to lex into a
- * form that didn't insist 'A was either a vowel or a consonant.
- * Then the parser would figure it out.
+ * lists if and only if specialHandlingForAppendages is true -- I
+ * thought the converter had a bug because I saw SNYAM'AM in
+ * KD0003I2.ACT. I asked Robert Chilton, though, and he said
+ * "SNYAM'AM " was likely a typo for "SNYAM 'AM", so leave
+ * specialHandlingForAppendages false.
+ *
+ * I found out about (OK, as it turns out, imagined) this case
+ * too late to do anything clean about it. SNYAM'AM, e.g.,
+ * breaks up into [(S . ), (NY . A), (M . 'A), (M . )], which is
+ * incorrect -- [(S . ), (NY . A), (M . ), (' . A), (M . )] is
+ * correct. But we don't know which is correct without parsing,
+ * so both are returned. The clean treatment would be to lex
+ * into a form that didn't insist 'A was either a vowel or a
+ * consonant. Then the parser would figure it out. But don't
+ * bother, because specialHandlingForAppendages should be false
+ * always.
*
* @param acip a string of ACIP with no punctuation in it
+ * @param specialHandlingForAppendages true if and only if you
+ * want SNYAM'AM to ultimately parse as {S+NYA}{M}{'A}{M} instead
+ * of {S+NYA}{M'A}{M}
* @return an array of one or two pair lists, if the former, then
* the second element will be null, if the latter, the second
* element will have (* . ), (' . *) instead of (* . '*) which
@@ -58,11 +69,16 @@ class TPairListFactory {
* large for us to break into chunks (we're recursive, not
* iterative, so the boundary can be increased a lot if you care,
* but you don't) */
- static TPairList[] breakACIPIntoChunks(String acip) throws IllegalArgumentException {
+ static TPairList[] breakACIPIntoChunks(String acip,
+ boolean specialHandlingForAppendages)
+ throws IllegalArgumentException
+ {
try {
TPairList a = breakHelper(acip, true, false);
- TPairList b = breakHelper(acip, false, false);
- if (a.equals(b))
+ TPairList b = null;
+ if (specialHandlingForAppendages)
+ b = breakHelper(acip, false, false);
+ if (null != b && a.equals(b))
return new TPairList[] { a, null };
else
return new TPairList[] { a, b };