I've undone the SNYAM'AM == SNYAMA'AM hack. The only occurrence of SNYAM'AM in the ACIP texts I've got is likely a typo, says Robert Chilton.
The code would be cleaner if I could bear to delete my terrible hack. Maybe in a month, when I don't feel so dumb for coding it up in the first place. The correct solution for such things is to give the ACIP->Tibetan converters a pre-filter mechanism. This would run before the lexer or as part of the lexer (maybe you only want to filter tsheg bars), and it would allow the end user to specify things like "s/SNYAM'AM/S+NYAMA'AMA/g".
This commit is contained in:
parent
4b1395e0ba
commit
5aab4acc93
3 changed files with 34 additions and 18 deletions
|
@ -371,7 +371,7 @@ public class ACIPConverter {
|
||||||
Object[] duff = null;
|
Object[] duff = null;
|
||||||
if (stype == TString.TIBETAN_NON_PUNCTUATION) {
|
if (stype == TString.TIBETAN_NON_PUNCTUATION) {
|
||||||
lastGuyWasNonPunct = true;
|
lastGuyWasNonPunct = true;
|
||||||
TPairList pls[] = TPairListFactory.breakACIPIntoChunks(s.getText());
|
TPairList pls[] = TPairListFactory.breakACIPIntoChunks(s.getText(), false);
|
||||||
String acipError;
|
String acipError;
|
||||||
|
|
||||||
if ((acipError = pls[0].getACIPError()) != null
|
if ((acipError = pls[0].getACIPError()) != null
|
||||||
|
|
|
@ -94,7 +94,7 @@ public class PackageTest extends TestCase {
|
||||||
String[] expectedLegalParses,
|
String[] expectedLegalParses,
|
||||||
String expectedBestParse,
|
String expectedBestParse,
|
||||||
int pairListToUse) {
|
int pairListToUse) {
|
||||||
TPairList[] la = TPairListFactory.breakACIPIntoChunks(acip);
|
TPairList[] la = TPairListFactory.breakACIPIntoChunks(acip, true);
|
||||||
TPairList l = la[(pairListToUse == -1) ? 0 : ((pairListToUse >= 1) ? 1 : pairListToUse)];
|
TPairList l = la[(pairListToUse == -1) ? 0 : ((pairListToUse >= 1) ? 1 : pairListToUse)];
|
||||||
if (sdebug || debug)
|
if (sdebug || debug)
|
||||||
System.out.println("ACIP=" + acip + " and l'=" + l);
|
System.out.println("ACIP=" + acip + " and l'=" + l);
|
||||||
|
@ -268,8 +268,8 @@ public class PackageTest extends TestCase {
|
||||||
tstHelper("9012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678");
|
tstHelper("9012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678");
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Tests {@link TPairListFactory#breakACIPIntoChunks(String)},
|
/** Tests {@link TPairListFactory#breakACIPIntoChunks(String,
|
||||||
* {@link TPairList#getACIPError()}, and {@link
|
* boolean)}, {@link TPairList#getACIPError()}, and {@link
|
||||||
* TPairList#recoverACIP()}. */
|
* TPairList#recoverACIP()}. */
|
||||||
public void testBreakACIPIntoChunks() {
|
public void testBreakACIPIntoChunks() {
|
||||||
tstHelper("GASN"); // ambiguous with regard to prefix rules
|
tstHelper("GASN"); // ambiguous with regard to prefix rules
|
||||||
|
|
|
@ -25,9 +25,9 @@ class TPairListFactory {
|
||||||
/** This class is not instantiable. */
|
/** This class is not instantiable. */
|
||||||
private TPairListFactory() { }
|
private TPairListFactory() { }
|
||||||
|
|
||||||
/** Returns a new TPairList instance. Breaks an ACIP tsheg bar
|
/** Returns one or two new TPairList instances. Breaks an ACIP
|
||||||
* (roughly a "syllable") into chunks; this computes l'
|
* tsheg bar (roughly a "syllable") into chunks; this
|
||||||
* (for you design doc enthusiasts).
|
* computes l' (for you design doc enthusiasts).
|
||||||
*
|
*
|
||||||
* <p>Here's a rough sketch of the algorithm: run along getting
|
* <p>Here's a rough sketch of the algorithm: run along getting
|
||||||
* the current TPair as big as you can. If you get it very
|
* the current TPair as big as you can. If you get it very
|
||||||
|
@ -41,16 +41,27 @@ class TPairListFactory {
|
||||||
* suboptimal", i.e. we use TPairList.hasSimpleError().</p>
|
* suboptimal", i.e. we use TPairList.hasSimpleError().</p>
|
||||||
*
|
*
|
||||||
* <p>There is one case where we break things up into two pair
|
* <p>There is one case where we break things up into two pair
|
||||||
* lists -- I found out about this case too late to do anything
|
* lists if and only if specialHandlingForAppendages is true -- I
|
||||||
* clean about it. SNYAM'AM, e.g., breaks up into [(S . ), (NY
|
* thought the converter had a bug because I saw SNYAM'AM in
|
||||||
* . A), (M . 'A), (M . )], which is incorrect -- [(S . ), (NY
|
* KD0003I2.ACT. I asked Robert Chilton, though, and he said
|
||||||
* . A), (M . ), (' . A), (M . )] is correct. But we don't know
|
* "SNYAM'AM " was likely a typo for "SNYAM 'AM", so leave
|
||||||
* which is correct without parsing, so both are returned. The
|
* specialHandlingForAppendages false.</p>
|
||||||
* clean treatment (low-priority FIXME) would be to lex into a
|
*
|
||||||
* form that didn't insist 'A was either a vowel or a consonant.
|
* <p>I found out about this case (OK, as it turns out, I imagined it)
|
||||||
* Then the parser would figure it out.</p>
|
* too late to do anything clean about it. SNYAM'AM, e.g.,
|
||||||
|
* breaks up into [(S . ), (NY . A), (M . 'A), (M . )], which is
|
||||||
|
* incorrect -- [(S . ), (NY . A), (M . ), (' . A), (M . )] is
|
||||||
|
* correct. But we don't know which is correct without parsing,
|
||||||
|
* so both are returned. The clean treatment would be to lex
|
||||||
|
* into a form that didn't insist 'A was either a vowel or a
|
||||||
|
* consonant. Then the parser would figure it out. But don't
|
||||||
|
* bother, because specialHandlingForAppendages should be false
|
||||||
|
* always.</p>
|
||||||
*
|
*
|
||||||
* @param acip a string of ACIP with no punctuation in it
|
* @param acip a string of ACIP with no punctuation in it
|
||||||
|
* @param specialHandlingForAppendages true if and only if you
|
||||||
|
* want SNYAM'AM to ultimately parse as {S+NYA}{M}{'A}{M} instead
|
||||||
|
* of {S+NYA}{M'A}{M}
|
||||||
* @return an array of one or two pair lists, if the former, then
|
* @return an array of one or two pair lists, if the former, then
|
||||||
* the second element will be null, if the latter, the second
|
* the second element will be null, if the latter, the second
|
||||||
* element will have (* . ), (' . *) instead of (* . '*) which
|
* element will have (* . ), (' . *) instead of (* . '*) which
|
||||||
|
@ -58,11 +69,16 @@ class TPairListFactory {
|
||||||
* large for us to break into chunks (we're recursive, not
|
* large for us to break into chunks (we're recursive, not
|
||||||
* iterative, so the boundary can be increased a lot if you care,
|
* iterative, so the boundary can be increased a lot if you care,
|
||||||
* but you don't) */
|
* but you don't) */
|
||||||
static TPairList[] breakACIPIntoChunks(String acip) throws IllegalArgumentException {
|
static TPairList[] breakACIPIntoChunks(String acip,
|
||||||
|
boolean specialHandlingForAppendages)
|
||||||
|
throws IllegalArgumentException
|
||||||
|
{
|
||||||
try {
|
try {
|
||||||
TPairList a = breakHelper(acip, true, false);
|
TPairList a = breakHelper(acip, true, false);
|
||||||
TPairList b = breakHelper(acip, false, false);
|
TPairList b = null;
|
||||||
if (a.equals(b))
|
if (specialHandlingForAppendages)
|
||||||
|
b = breakHelper(acip, false, false);
|
||||||
|
if (null != b && a.equals(b))
|
||||||
return new TPairList[] { a, null };
|
return new TPairList[] { a, null };
|
||||||
else
|
else
|
||||||
return new TPairList[] { a, b };
|
return new TPairList[] { a, b };
|
||||||
|
|
Loading…
Reference in a new issue