ACIP->Unicode, without going through TMW, is now possible, so long as

\, the Sanskrit virama, is not used.  Of the 1370-odd ACIP texts I've
got here, about 57% make it through the gauntlet (fewer if you demand
a vowel or disambiguator on every stack of a non-Tibetan tsheg bar).
This commit is contained in:
dchandler 2003-08-18 02:38:54 +00:00
parent 245aac4911
commit 1afb3a0fdd
12 changed files with 646 additions and 40 deletions

View file

@ -318,13 +318,18 @@ Contributor(s): ______________________________________.
<param name="my.included.source.file"
value="org/thdl/tib/text/TibetanHTML.java"/>
</antcall>
<!-- Put TibetanConverter in Jskad's jar for those who want
to use it. -->
<!-- Put TibetanConverter and ACIPConverter in Jskad's jar for
those who want to use them. -->
<antcall target="our-internal-javac-task">
<param name="mybin" value="${jskadbin}"/>
<param name="my.included.source.file"
value="org/thdl/tib/input/TibetanConverter.java"/>
</antcall>
<antcall target="our-internal-javac-task">
<param name="mybin" value="${jskadbin}"/>
<param name="my.included.source.file"
value="org/thdl/tib/text/ttt/ACIPConverter.java"/>
</antcall>
<antcall target="our-internal-javac-task">
<param name="mybin" value="${jskadbin}"/>
<param name="my.included.source.file"

View file

@ -341,7 +341,7 @@ public final class LegalTshegBar
EWC_ta, EWC_tha, EWC_da, EWC_na,
EWC_pa, EWC_pha, EWC_ba, EWC_ma,
EWC_tsa, EWC_tsha, EWC_dza, EWC_wa,
EWC_zha, EWC_za, EWC_achung, EWC_ya,
EWC_zha, EWC_za, EWC_achung, EWC_ya,
EWC_ra, EWC_la, EWC_sha, EWC_sa,
EWC_ha, EWC_a
});
@ -833,7 +833,7 @@ public final class LegalTshegBar
return internalThrowThing(throwIfIllegal,
errorBuf,
"Illegal suffix -- not one of the ten legal suffixes: "
+ UnicodeUtils.unicodeCodepointToString(suffix.charAt(0)));
+ UnicodeUtils.unicodeCodepointToString(suffix.charAt(0), false));
}
}
}

View file

@ -286,8 +286,41 @@ public class UnicodeUtils implements UnicodeConstants {
}
/** Returns a human-readable, ASCII form of the Unicode codepoint
cp. */
public static String unicodeCodepointToString(char cp) {
cp. If shortenIfPossible is true, then printable ASCII
characters will appear as themselves. */
public static String unicodeCodepointToString(char cp,
boolean shortenIfPossible) {
if (shortenIfPossible) {
if ((cp >= 'a' && cp <= 'z')
|| (cp >= 'A' && cp <= 'Z')
|| (cp >= '0' && cp <= '9')
|| cp == '.'
|| cp == ','
|| cp == ' '
|| cp == '\''
|| cp == '"'
|| cp == '+'
|| cp == '-'
|| cp == '='
|| cp == '_'
|| cp == '@'
|| cp == '!'
|| cp == '#'
|| cp == '$'
|| cp == '%'
|| cp == '^'
|| cp == '&'
|| cp == '*'
|| cp == '\t'
|| cp == ':'
|| cp == '['
|| cp == ']'
|| cp == '('
|| cp == ')'
|| cp == '{'
|| cp == '}')
return new String(new char[] { cp });
}
if (cp < '\u0010')
return "\\u000" + Integer.toHexString((int)cp);
else if (cp < '\u0100')
@ -304,7 +337,19 @@ public class UnicodeUtils implements UnicodeConstants {
public static String unicodeStringToString(String s) {
StringBuffer sb = new StringBuffer(s.length() * 6);
for (int i = 0; i < s.length(); i++) {
sb.append(unicodeCodepointToString(s.charAt(i)));
sb.append(unicodeCodepointToString(s.charAt(i), false));
}
return sb.toString();
}
/**
 * Renders the String s of Unicode codepoints in the shortest
 * human-readable, ASCII form available: each character is passed
 * to {@link #unicodeCodepointToString(char, boolean)} with
 * shortening enabled and the results are concatenated.
 * @param s the text to render; may be null
 * @return the rendering of s, or the four-character string "null"
 * if s is null */
public static String unicodeStringToPrettyString(String s) {
    if (null == s) {
        return "null";
    }
    final int length = s.length();
    // Six characters is the longest rendering of a single char
    // (backslash, 'u', and four hex digits).
    StringBuffer pretty = new StringBuffer(length * 6);
    int index = 0;
    while (index < length) {
        char codepoint = s.charAt(index);
        pretty.append(unicodeCodepointToString(codepoint, true));
        index++;
    }
    return pretty.toString();
}

View file

@ -321,15 +321,15 @@ public class UnicodeUtilsTest extends TestCase implements UnicodeConstants {
* Tests the {@link UnicodeUtils#unicodeCodepointToString(char)}
* method. */
public void testUnicodeCodepointToString() {
assertTrue(UnicodeUtils.unicodeCodepointToString('\u0000').equals("\\u0000"));
assertTrue(UnicodeUtils.unicodeCodepointToString('\u0001').equals("\\u0001"));
assertTrue(UnicodeUtils.unicodeCodepointToString('\u000F').equals("\\u000f"));
assertTrue(UnicodeUtils.unicodeCodepointToString('\u001F').equals("\\u001f"));
assertTrue(UnicodeUtils.unicodeCodepointToString('\u00fF').equals("\\u00ff"));
assertTrue(UnicodeUtils.unicodeCodepointToString('\u01fF').equals("\\u01ff"));
assertTrue(UnicodeUtils.unicodeCodepointToString('\u0ffF').equals("\\u0fff"));
assertTrue(UnicodeUtils.unicodeCodepointToString('\u1ffF').equals("\\u1fff"));
assertTrue(UnicodeUtils.unicodeCodepointToString('\ufffF').equals("\\uffff"));
assertTrue(UnicodeUtils.unicodeCodepointToString('\u0000', false).equals("\\u0000"));
assertTrue(UnicodeUtils.unicodeCodepointToString('\u0001', false).equals("\\u0001"));
assertTrue(UnicodeUtils.unicodeCodepointToString('\u000F', false).equals("\\u000f"));
assertTrue(UnicodeUtils.unicodeCodepointToString('\u001F', false).equals("\\u001f"));
assertTrue(UnicodeUtils.unicodeCodepointToString('\u00fF', false).equals("\\u00ff"));
assertTrue(UnicodeUtils.unicodeCodepointToString('\u01fF', false).equals("\\u01ff"));
assertTrue(UnicodeUtils.unicodeCodepointToString('\u0ffF', false).equals("\\u0fff"));
assertTrue(UnicodeUtils.unicodeCodepointToString('\u1ffF', false).equals("\\u1fff"));
assertTrue(UnicodeUtils.unicodeCodepointToString('\ufffF', false).equals("\\uffff"));
}
/**

View file

@ -0,0 +1,208 @@
/*
The contents of this file are subject to the THDL Open Community License
Version 1.0 (the "License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License on the THDL web site
(http://www.thdl.org/).
Software distributed under the License is distributed on an "AS IS" basis,
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
License for the specific terms governing rights and limitations under the
License.
The Initial Developer of this software is the Tibetan and Himalayan Digital
Library (THDL). Portions created by the THDL are Copyright 2003 THDL.
All Rights Reserved.
Contributor(s): ______________________________________.
*/
package org.thdl.tib.text.ttt;
import java.io.*;
import java.util.ArrayList;
import java.util.Stack;
import org.thdl.util.ThdlDebug;
import org.thdl.util.ThdlOptions;
/**
 * Converts ACIP transliteration to Unicode.  The conversion goes
 * directly from the scanned ACIP to Unicode; it does not pass
 * through Tibetan Machine Web (TMW).  Conversion to TMW,
 * {@link #convertToTMW}, is not implemented yet.
 * @author David Chandler
 */
public class ACIPConverter {
    static {
        // We don't want to load the TM or TMW font files ourselves:
        ThdlOptions.setUserPreference("thdl.rely.on.system.tmw.fonts", true);
        ThdlOptions.setUserPreference("thdl.rely.on.system.tm.fonts", true);
        ThdlOptions.setUserPreference("thdl.debug", true);
    }

    /** Command-line converter.  Expects exactly two arguments: either
     *  <code>--strict</code> or <code>--lenient</code>, followed by
     *  the name of the ACIP input file.  The converted Unicode (UTF-8)
     *  is written to standard output.  Diagnostics are written to
     *  standard error.  Exits with code 1 if the arguments are bad or
     *  if scanning the input produced errors, with code 2 if
     *  converting a scanned tsheg bar produced errors, and with code
     *  zero otherwise.  <p>FIXME: not so efficient; copies the whole
     *  file into memory first.
     *  @param args the command-line arguments described above
     *  @throws IOException if the input cannot be read or the output
     *  cannot be written */
    public static void main(String[] args)
        throws IOException // DLC FIXME: give nice error messages
    {
        boolean verbose = true;
        boolean argsAreGood
            = (args.length == 2
               && ("--strict".equals(args[0])
                   || "--lenient".equals(args[0])));
        if (!argsAreGood) {
            System.err.println("Bad args!  Need '--strict filename' or '--lenient filename'.");
            System.exit(1);
        }
        boolean strict = "--strict".equals(args[0]);

        StringBuffer errors = new StringBuffer();
        int maxErrors = 250;
        ArrayList al = ACIPTshegBarScanner.scanFile(args[1], errors, strict, maxErrors - 1);

        // A null scan means the scanner gave up after too many errors.
        if (null == al) {
            System.err.println(maxErrors + " or more errors occurred while scanning ACIP input file; is this");
            System.err.println("Tibetan or English input?");
            System.err.println("");
            System.err.println("First " + maxErrors + " errors scanning ACIP input file: ");
            System.err.println(errors);
            System.err.println("Exiting with " + maxErrors + " or more errors; please fix input file and try again.");
            System.exit(1);
        }

        if (errors.length() > 0) {
            System.err.println("Errors scanning ACIP input file: ");
            System.err.println(errors);
            System.err.println("Exiting; please fix input file and try again.");
            System.exit(1);
        }

        convertToUnicode(al, System.out, errors);
        if (errors.length() > 0) {
            System.err.println("Errors converting ACIP input file: ");
            System.err.println(errors);
            System.err.println("Exiting; please fix input file and try again.");
            System.exit(2);
        }
        if (verbose) System.err.println("Converted " + args[1] + " perfectly.");
        System.exit(0);
    }

    /** Writes TMW/Latin to out.  If errors occur in converting a
     *  tsheg bar, then they are appended to errors if errors is
     *  non-null.  Returns true upon perfect success, false if errors
     *  occurred.  <p>Not implemented yet: currently this always
     *  throws an Error.
     *  @throws IOException if we cannot write to out
     */
    public static boolean convertToTMW(ArrayList scan, String latinFont,
                                       OutputStream out, StringBuffer errors)
        throws IOException
    {
        throw new Error("DLC UNIMPLEMENTED");
    }

    /** Scans the ACIP text acip and returns its Unicode conversion.
     *  Internally the conversion is encoded to UTF-8 bytes and then
     *  decoded again, which is a bit indirect, so prefer the
     *  stream-based overload when performance is a concern.  If
     *  errors occur in scanning the ACIP or in converting a tsheg
     *  bar, then they are appended to errors if errors is non-null.
     *  @param acip the ACIP text to convert
     *  @param errors receives error messages; may be null
     *  @return the conversion upon perfect success, null if errors
     *  occurred
     */
    public static String convertToUnicode(String acip,
                                          StringBuffer errors) {
        ByteArrayOutputStream sw = new ByteArrayOutputStream();
        ArrayList al = ACIPTshegBarScanner.scan(acip, errors, true /* DLC FIXME */, -1);
        try {
            if (null != al && convertToUnicode(al, sw, errors)) {
                return sw.toString("UTF-8");
            } else {
                System.out.println("DLC al is " + al + " and convertToUnicode returned null.");
                return null;
            }
        } catch (Exception e) {
            // Keep the original exception as the cause so that the
            // stack trace of the real failure is not lost.
            throw new Error(e.toString(), e);
        }
    }

    /** Writes Unicode, encoded as UTF-8, to out.  If errors occur in
     *  converting a tsheg bar, then they are appended to errors if
     *  errors is non-null, and a bracketed error message is written
     *  to out in place of the tsheg bar.  Elements of scan that are
     *  already scanner errors are also written to out as bracketed
     *  error messages (but are not appended to errors here).  The
     *  stream out is flushed but not closed; closing it is the
     *  caller's job, since the caller opened it.
     *  @param scan the list of ACIPString objects produced by the
     *  scanner
     *  @param out where the UTF-8 output goes
     *  @param errors receives error messages; may be null
     *  @return true upon perfect success, false if errors occurred
     *  @throws IOException if we cannot write to out
     */
    public static boolean convertToUnicode(ArrayList scan,
                                           OutputStream out,
                                           StringBuffer errors)
        throws IOException
    {
        int sz = scan.size();
        boolean hasErrors = false;
        BufferedWriter writer
            = new BufferedWriter(new OutputStreamWriter(out, "UTF-8"));
        for (int i = 0; i < sz; i++) {
            ACIPString s = (ACIPString)scan.get(i);
            int stype = s.getType();
            if (stype == ACIPString.ERROR) {
                hasErrors = true;
                writer.write("[#ERROR CONVERTING ACIP DOCUMENT: ");
                writer.write(s.getText());
                writer.write("]");
            } else if (ACIPString.isLatin(stype)) {
                // DLC FIXME: what about 'no A on root stack' and 'no A on such-and-such stack' warnings?
                // Latin text passes through untouched, except that
                // folio markers are wrapped in curly braces.
                boolean isFolioMarker = (stype == ACIPString.FOLIO_MARKER);
                if (isFolioMarker)
                    writer.write("{");
                writer.write(s.getText());
                if (isFolioMarker)
                    writer.write("}");
            } else if (stype == ACIPString.TIBETAN_NON_PUNCTUATION) {
                String unicode = tshegBarToUnicode(s.getText(), writer, errors);
                if (null == unicode)
                    hasErrors = true;
                else
                    writer.write(unicode);
            } else {
                writer.write(punctuationToUnicode(s, stype));
            }
        }
        // Push everything through to out, but leave out open: it may
        // be System.out or another stream the caller still needs.
        writer.flush();
        return !hasErrors;
    }

    /** Converts the tsheg bar acipText to Unicode.  If the tsheg bar
     *  cannot be converted, a bracketed error message is written to
     *  writer and appended (if errors is non-null) to errors, and
     *  null is returned. */
    private static String tshegBarToUnicode(String acipText,
                                            Writer writer,
                                            StringBuffer errors)
        throws IOException
    {
        String messageStart
            = "[#ERROR CONVERTING ACIP DOCUMENT: THE TSHEG BAR (\"SYLLABLE\") "
            + acipText;
        TPairList pl = TPairListFactory.breakACIPIntoChunks(acipText);
        String acipError = pl.getACIPError();
        if (null != acipError) {
            reportError(writer, errors,
                        messageStart + " HAS THESE ERRORS: " + acipError + "]");
            return null;
        }
        TParseTree pt = pl.getParseTree();
        if (null == pt) {
            reportError(writer, errors,
                        messageStart + " IS ESSENTIALLY NOTHING.]");
            return null;
        }
        TStackList sl = pt.getBestParse();
        if (null == sl) {
            reportError(writer, errors,
                        messageStart + " HAS NO LEGAL PARSES.]");
            return null;
        }
        String unicode = sl.getUnicode();
        if (null == unicode) throw new Error("DLC: HOW?");
        return unicode;
    }

    /** Returns the Unicode for the Tibetan punctuation (or slash) s,
     *  whose type is stype.  Throws an Error if no Unicode is known
     *  for it. */
    private static String punctuationToUnicode(ACIPString s, int stype) {
        String unicode;
        if (stype == ACIPString.START_SLASH)
            unicode = "\u0F3C";
        else if (stype == ACIPString.END_SLASH)
            unicode = "\u0F3D";
        else
            unicode = ACIPRules.getUnicodeFor(s.getText(), false);
        if (null == unicode)
            throw new Error("DLC: HOW?  No Unicode is known for the ACIP punctuation {"
                            + s.getText() + "}");
        return unicode;
    }

    /** Writes errorMessage to writer and, if errors is non-null,
     *  appends it (newline-terminated) to errors. */
    private static void reportError(Writer writer, StringBuffer errors,
                                    String errorMessage)
        throws IOException
    {
        writer.write(errorMessage);
        if (null != errors)
            errors.append(errorMessage + "\n");
    }
}
// DLC FIXME: putting Tibetan in black, Sanskrit in green, and Latin
// in yellow would help you quickly decide if ZHIGN maybe should've
// been ZHING.

View file

@ -28,9 +28,9 @@ class ACIPRules {
* three. */
public static int MAX_CONSONANT_LENGTH = 3;
/** {'im:}, the longest "vowel", has 4 characters, so this is
* four. */
public static int MAX_VOWEL_LENGTH = 4;
/** {'EEm:}, the longest "vowel", has 5 characters, so this is
* five. */
public static int MAX_VOWEL_LENGTH = 5;
/** For O(1) {@link #isVowel(String)} calls. */
private static HashSet acipVowels = null;
@ -42,18 +42,9 @@ class ACIPRules {
{ "U", "u" },
{ "E", "e" },
{ "O", "o" },
{ "'I", "I" },
{ "'U", "U" },
{ "EE", "ai" },
{ "OO", "au" },
{ "i", "-i" },
{ "'i", "-I" },
{ "'A", "A" },
{ "'O", "Ao" },
{ "'E", "Ae" }
// DLC I'm on my own with 'O and 'E, but GANG'O appears
// and I wonder... so here are 'O and 'E. It's
// consistent with 'I and 'A and 'U, at least.
{ "i", "-i" }
};
/** Returns true if and only if s is an ACIP "vowel". You can't
@ -61,14 +52,24 @@ class ACIPRules {
* ACIP, so you have to call this in the right context. */
public static boolean isVowel(String s) {
if (null == acipVowels) {
acipVowels = new HashSet();
acipVowels = new HashSet(baseVowels.length * 8);
for (int i = 0; i < baseVowels.length; i++) {
acipVowels.add(baseVowels[i][0]);
acipVowels.add(baseVowels[i][0] + 'm');
acipVowels.add(baseVowels[i][0] + ':');
acipVowels.add(baseVowels[i][0] + "m:");
// DLC '\' for visarga? how shall we do \ the visarga? like a vowel or not?
// DLC I'm on my own with 'O and 'E and 'OO and 'EE, but
// GANG'O appears and I wonder... so here they are. It's
// consistent with 'I and 'A and 'U, at least: all the vowels
// may appear as K'vowel.
acipVowels.add(baseVowels[i][0]);
acipVowels.add('\'' + baseVowels[i][0]);
acipVowels.add(baseVowels[i][0] + 'm');
acipVowels.add('\'' + baseVowels[i][0] + 'm');
acipVowels.add(baseVowels[i][0] + ':');
acipVowels.add('\'' + baseVowels[i][0] + ':');
acipVowels.add(baseVowels[i][0] + "m:");
acipVowels.add('\'' + baseVowels[i][0] + "m:");
// DLC keep this code in sync with getUnicodeFor.
// DLC '\' for visarga? how shall we do \ the visarga? like a vowel or not?
}
}
return (acipVowels.contains(s));
@ -204,4 +205,212 @@ class ACIPRules {
}
return (String)acipVowel2wylie.get(acip);
}
/** Maps ACIP to the Unicode for the full-size (non-subscribed)
 *  form.  Null until the first call to getUnicodeFor. */
private static HashMap superACIP2unicode = null;
/** Maps ACIP consonants to the Unicode for the subjoined form.
 *  Null until the first call to getUnicodeFor. */
private static HashMap subACIP2unicode = null;

/** If acip is an ACIP consonant or vowel or punctuation mark,
 *  then this returns the Unicode for it.  The Unicode for the
 *  subscribed form of the glyph is returned if subscribed is
 *  true and a subscribed form exists; otherwise the ordinary form
 *  is returned (vowels, digits, and punctuation have only that
 *  one form).  Returns null if acip is unknown.
 *  <p>Vowel entries are generated as every combination of an
 *  optional leading ' (a-chung), a base vowel, and an optional
 *  trailing m, :, or m:, which mirrors how isVowel builds its
 *  set.  That includes "Am:", which isVowel accepts.
 *  @param acip the ACIP for a single consonant, vowel, digit, or
 *  punctuation mark
 *  @param subscribed true if the subjoined form is wanted
 *  @return the Unicode, or null if acip is unknown */
static String getUnicodeFor(String acip, boolean subscribed) {
    if (superACIP2unicode == null) {
        // Fill local maps and publish them only once complete, so
        // that the static fields never refer to half-built tables.
        HashMap sup = new HashMap(256);
        HashMap sub = new HashMap(64);

        // oddball: V has only a subscribed form.
        sub.put("V", "\u0FAD");

        // { ACIP, full-size form, subscribed form }
        String[][] consonants = {
            { "DH", "\u0F52", "\u0FA2" },
            { "BH", "\u0F57", "\u0FA7" },
            { "dH", "\u0F4D", "\u0F9D" },
            { "DZH", "\u0F5C", "\u0FAC" },
            { "Ksh", "\u0F69", "\u0FB9" },
            { "GH", "\u0F43", "\u0F93" },
            { "K", "\u0F40", "\u0F90" },
            { "KH", "\u0F41", "\u0F91" },
            { "G", "\u0F42", "\u0F92" },
            { "NG", "\u0F44", "\u0F94" },
            { "C", "\u0F45", "\u0F95" },
            { "CH", "\u0F46", "\u0F96" },
            { "J", "\u0F47", "\u0F97" },
            { "NY", "\u0F49", "\u0F99" },
            { "T", "\u0F4F", "\u0F9F" },
            { "TH", "\u0F50", "\u0FA0" },
            { "D", "\u0F51", "\u0FA1" },
            { "N", "\u0F53", "\u0FA3" },
            { "P", "\u0F54", "\u0FA4" },
            { "PH", "\u0F55", "\u0FA5" },
            { "B", "\u0F56", "\u0FA6" },
            { "M", "\u0F58", "\u0FA8" },
            { "TZ", "\u0F59", "\u0FA9" },
            { "TS", "\u0F5A", "\u0FAA" },
            { "DZ", "\u0F5B", "\u0FAB" },
            { "W", "\u0F5D", "\u0FBA" }, // oddball
            { "ZH", "\u0F5E", "\u0FAE" },
            { "Z", "\u0F5F", "\u0FAF" },
            { "'", "\u0F60", "\u0FB0" },
            { "Y", "\u0F61", "\u0FB1" },
            { "R", "\u0F62", "\u0FB2" },
            { "L", "\u0F63", "\u0FB3" },
            { "SH", "\u0F64", "\u0FB4" },
            { "S", "\u0F66", "\u0FB6" },
            { "H", "\u0F67", "\u0FB7" },
            { "A", "\u0F68", "\u0FB8" },
            { "t", "\u0F4A", "\u0F9A" },
            { "th", "\u0F4B", "\u0F9B" },
            { "d", "\u0F4C", "\u0F9C" },
            { "n", "\u0F4E", "\u0F9E" },
            { "sh", "\u0F65", "\u0FB5" }
        };
        for (int i = 0; i < consonants.length; i++) {
            sup.put(consonants[i][0], consonants[i][1]);
            sub.put(consonants[i][0], consonants[i][2]);
        }

        // { ACIP base vowel, vowel sign }.  A is the inherent vowel
        // and so contributes no sign of its own.
        String[][] vowels = {
            { "A", "" },
            { "I", "\u0F72" },
            { "E", "\u0F7A" },
            { "O", "\u0F7C" },
            { "U", "\u0F74" },
            { "OO", "\u0F7D" },
            { "EE", "\u0F7B" },
            { "i", "\u0F80" }
        };
        // { ACIP prefix, sign written before the vowel sign }
        String[][] lengtheners = {
            { "", "" },
            { "'", "\u0F71" }
        };
        // { ACIP suffix, signs written after the vowel sign }
        // :m does not appear, though you'd think it's as valid as m:.
        String[][] finals = {
            { "", "" },
            { "m", "\u0F7E" },
            { ":", "\u0F7F" },
            { "m:", "\u0F7E\u0F7F" }
        };
        // DLC keep this code in sync with isVowel.
        for (int v = 0; v < vowels.length; v++) {
            for (int l = 0; l < lengtheners.length; l++) {
                for (int f = 0; f < finals.length; f++) {
                    String key
                        = lengtheners[l][0] + vowels[v][0] + finals[f][0];
                    // A bare "A" is the consonant entered above,
                    // not a vowel sign; don't clobber it.
                    if ("A".equals(key))
                        continue;
                    sup.put(key,
                            lengtheners[l][1] + vowels[v][1] + finals[f][1]);
                }
            }
        }

        // I doubt these will occur alone:
        sup.put("m", "\u0F7E");
        sup.put(":", "\u0F7F");

        // The ten Tibetan digits are contiguous, starting at the
        // codepoint for zero.
        for (int d = 0; d <= 9; d++) {
            sup.put(String.valueOf(d),
                    String.valueOf((char)('\u0F20' + d)));
        }

        // DLC punctuation
        sup.put("&", "\u0F85");
        sup.put(",", "\u0F0D");
        sup.put(" ", "\u0F0B");
        sup.put(".", "\u0F0C");
        sup.put("`", "\u0F08");
        sup.put("*", "\u0F04\u0F05");
        sup.put("#", "\u0F04\u0F05\u0F05");
        sup.put("%", "\u0F35");
        sup.put(";", "\u0F11");
        sup.put("\r", "\r");
        sup.put("\t", "\t");
        sup.put("\n", "\n");

        sup.put("\\", "\u0F84"); // DLC FIXME: make this like a vowel
        // DLC FIXME: what's the Unicode for caret, ^?
        // DLC FIXME: what's the Unicode for o?
        // DLC FIXME: what's the Unicode for x?

        // Publish.  superACIP2unicode goes last because it is the
        // field tested above to decide whether to initialize.
        subACIP2unicode = sub;
        superACIP2unicode = sup;
    }
    if (subscribed) {
        String u = (String)subACIP2unicode.get(acip);
        if (null != u) return u;
    }
    return (String)superACIP2unicode.get(acip);
}
}

View file

@ -30,6 +30,15 @@ public class ACIPString {
private int type;
private String text;
/** Tells whether an ACIPString whose type is type should come out
 *  as Latin text rather than Tibetan text.  Every type is Latin
 *  except Tibetan non-punctuation, Tibetan punctuation, and the
 *  start and end slashes.
 *  @param type one of this class's type constants
 *  @return true if and only if type is to be converted to Latin */
public static boolean isLatin(int type) {
    boolean isTibetan = (type == TIBETAN_NON_PUNCTUATION
                         || type == TIBETAN_PUNCTUATION
                         || type == START_SLASH
                         || type == END_SLASH);
    return !isTibetan;
}
/** For [#COMMENTS] */
public static final int COMMENT = 0;
/** For Folio markers like @012B */

View file

@ -57,7 +57,6 @@ public class ACIPTshegBarScanner {
System.out.println(errors);
System.out.println("Exiting with " + maxErrors + " or more errors; please fix input file and try again.");
System.exit(1);
} else {
}
if (errors.length() > 0) {
System.out.println("Errors scanning ACIP input file: ");
@ -90,6 +89,7 @@ public class ACIPTshegBarScanner {
while (-1 != (amt = in.read(ch))) {
s.append(ch, 0, amt);
}
in.close();
return scan(s.toString(), errors, !strict, maxErrors);
}
@ -621,6 +621,18 @@ public class ACIPTshegBarScanner {
}
if (startSlashIndex >= 0) {
if (startSlashIndex + 1 == i) {
/* //NYA\\ appears in ACIP input, and I think
* it means /NYA/. We warn about // for this
* reason. \\ causes a tsheg-bar error (DLC
* FIXME: verify this is so). */
al.add(new ACIPString("//", ACIPString.ERROR));
if (errors != null) {
errors.append("Offset " + i + ": "
+ "Found //, which could be legal (the Unicode would be \\u0F3C\\u0F3D), but is likely in an illegal construct like //NYA\\\\.\n");
}
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
}
al.add(new ACIPString(s.substring(i, i+1),
ACIPString.END_SLASH));
startOfString = i+1;
@ -766,6 +778,9 @@ public class ACIPTshegBarScanner {
if ((int)ch == 65533) {
errors.append("Offset " + i + ": "
+ "Found an illegal, unprintable character.\n");
} else if ('\\' == ch) {
errors.append("Offset " + i + ": "
+ "Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.\n");
} else {
errors.append("Offset " + i + ": "
+ "Found an illegal character, " + ch + ", with ordinal " + (int)ch + ".\n");
@ -849,7 +864,7 @@ public class ACIPTshegBarScanner {
|| ch == 'x'
|| ch == ':'
|| ch == '^'
|| ch == '\\'
// DLC FIXME: we must treat this guy like a vowel, a special vowel that numerals can take on. Until then, warn. || ch == '\\'
|| ch == '-'
|| ch == '+'

View file

@ -292,6 +292,12 @@ public class PackageTest extends TestCase {
new String[] { "{SH}{LO}", "{SH+LO}" },
new String[] { "{SH+LO}" });
tstHelper("ZLUM", "{Z}{LU}{M}", new String[] { "{Z}{LU}{M}", "{Z+LU}{M}" }, new String[] { "{Z+LU}{M}" });
tstHelper("K'EE", "{K'EE}");
tstHelper("K'O", "{K'O}");
tstHelper("K'OO", "{K'OO}");
tstHelper("K'II", "{K'I}{I}");
tstHelper("K'i", "{K'i}");
tstHelper("K'A", "{K'A}");
tstHelper("B+DDZ", "{B+}{D}{DZ}",
new String[] { "{B+D}{DZ}",
"{B+D+DZ}" }); // we're conservative.
@ -6984,7 +6990,7 @@ tstHelper("ZUR");
shelp("DD]",
"Offset 2: Found a truly unmatched close bracket, ] or }.\nOffset 2: Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.\n");
shelp("///NYA", "Offset END: Slashes are supposed to occur in pairs, but the input had an unmatched '/' character.\n");
shelp("///NYA", "Offset 1: Found //, which could be legal (the Unicode would be \\u0F3C\\u0F3D), but is likely in an illegal construct like //NYA\\\\.\nOffset END: Slashes are supposed to occur in pairs, but the input had an unmatched '/' character.\n");
shelp("/NYA/", "");
shelp("[?][BP][LS][DD1][DD2][DDD][DR][# (<{A COMMENT)}>]", "");
shelp("[LS][# A [[[[[COMMENT][LS]",
@ -7029,14 +7035,26 @@ tstHelper("ZUR");
shelp("?", "", "[QUESTION:{?}]");
shelp("KHAN~ BAR ", "Offset 4: Found an illegal character, ~, with ordinal 126.\n");
shelp("[* Correction with []]",
"Offset 5: Found an illegal character, r, with ordinal 114.\nOffset 6: Found an illegal character, r, with ordinal 114.\nOffset 7: Found an illegal character, e, with ordinal 101.\nOffset 8: Found an illegal character, c, with ordinal 99.\nOffset 14: Found an illegal character, w, with ordinal 119.\nOffset 15: Found an illegal character, i, with ordinal 105.\nOffset 19: Found an illegal open bracket (in context, this is []]). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\nOffset 21: Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.\n");
"Offset 5: Found an illegal character, r, with ordinal 114.\nOffset 6: Found an illegal character, r, with ordinal 114.\nOffset 7: Found an illegal character, e, with ordinal 101.\nOffset 8: Found an illegal character, c, with ordinal 99.\nOffset 14: Found an illegal character, w, with ordinal 119.\nOffset 19: Found an illegal open bracket (in context, this is []]). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\nOffset 21: Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.\n");
// DLC FIXME: the line SDIG PA'I GROGS PO'I LAG TU SON PAR 'GYUR PA is followed by a blank line. Note that it's "PA", not "PA ", ending it. Autocorrect to the latter.
// DLC FIXME: @0B1 isn't handled correctly!
shelp(",NGES ? PA", "", "[TIBETAN_PUNCTUATION:{,}, TIBETAN_NON_PUNCTUATION:{NGES}, TIBETAN_PUNCTUATION:{ }, QUESTION:{?}, TIBETAN_PUNCTUATION:{ }, TIBETAN_NON_PUNCTUATION:{PA}]");
shelp("K\\,", "", "[TIBETAN_NON_PUNCTUATION:{K\\}, TIBETAN_PUNCTUATION:{,}]");
// FIXME: just until we treat viramas correctly:
if (false) {
uhelp("1\\", "\u0f21\u0f84");
uhelp(" 1\\ ", "\u0f0b\u0f21\u0f84\u0f0b");
}
shelp("K\\,",
"Offset 1: Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.\n",
"[TIBETAN_NON_PUNCTUATION:{K}, ERROR:{\\}, TIBETAN_PUNCTUATION:{,}]");
shelp("MTHAR%", "", "[TIBETAN_NON_PUNCTUATION:{MTHAR%}]");
shelp("PHYIR;", "", "[TIBETAN_NON_PUNCTUATION:{PHYIR}, TIBETAN_PUNCTUATION:{;}]");
shelp("......,DAM ",
@ -7078,8 +7096,70 @@ tstHelper("ZUR");
shelp("{ DD }", "", "[DD:{{ DD }}]"); // TD3790E2.ACT
shelp("{ BP }", "", "[BP:{{ BP }}]"); // TD3790E2.ACT
shelp("//NYA\\\\",
"Offset 1: Found //, which could be legal (the Unicode would be \\u0F3C\\u0F3D), but is likely in an illegal construct like //NYA\\\\.\nOffset 5: Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.\nOffset 6: Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.\n",
"[START_SLASH:{/}, ERROR:{//}, END_SLASH:{/}, TIBETAN_NON_PUNCTUATION:{NYA}, ERROR:{\\}, ERROR:{\\}]");
}
/** Converts acip to Unicode without checking the result against
 *  any expectation; delegates to the two-argument overload with a
 *  null expectation. */
private static void uhelp(String acip) {
    final String noExpectation = null;
    uhelp(acip, noExpectation);
}
/** Converts acip to Unicode via ACIPConverter.convertToUnicode and
 *  checks the outcome against expectedUnicode, failing the test on
 *  a mismatch.
 *  @param acip the ACIP text to convert
 *  @param expectedUnicode the exact Unicode expected; or the
 *  sentinel "none" if the conversion is expected to fail; or null
 *  if the outcome should not be checked at all */
private static void uhelp(String acip, String expectedUnicode) {
    StringBuffer errors = new StringBuffer();
    String unicode = ACIPConverter.convertToUnicode(acip, errors);
    if (null == unicode) {
        // Compare the sentinel by value, not by reference: reference
        // equality only works when both strings happen to be the
        // same interned literal.
        if (null != expectedUnicode && !"none".equals(expectedUnicode)) {
            System.out.println("No unicode exists for " + acip + " but you expected " + org.thdl.tib.text.tshegbar.UnicodeUtils.unicodeStringToPrettyString(expectedUnicode));
            assertTrue(false);
        }
        System.out.println("DLC: Unicode for " + acip + " can't be had; errors are " + errors);
    } else {
        // A non-null result never equals "none", so an unexpected
        // success fails here too.
        if (null != expectedUnicode && !expectedUnicode.equals(unicode)) {
            System.out.println("The unicode for " + acip + " is " + org.thdl.tib.text.tshegbar.UnicodeUtils.unicodeStringToPrettyString(unicode) + ", but you expected " + org.thdl.tib.text.tshegbar.UnicodeUtils.unicodeStringToPrettyString(expectedUnicode));
            assertTrue(false);
        }
    }
}
/** Tests ACIP-to-Unicode conversion on a handful of tsheg bars and
 *  short phrases.  Each uhelp(acip, expected) call converts acip
 *  and fails the test unless the result is exactly expected; an
 *  expectation of "none" means the conversion is expected to
 *  fail. */
public void testACIPConversion() {
// A consonant stack: the second consonant takes its subjoined form.
uhelp("G+DHA", "\u0f42\u0fa2");
uhelp("P'EE", "\u0f54\u0f71\u0f7b");
// The plain vowels on K.  A adds no vowel sign.
uhelp("KA", "\u0f40");
uhelp("KI", "\u0f40\u0f72");
uhelp("KO", "\u0f40\u0f7c");
uhelp("KE", "\u0f40\u0f7a");
uhelp("KU", "\u0f40\u0f74");
uhelp("KOO", "\u0f40\u0f7d");
uhelp("KEE", "\u0f40\u0f7b");
// Vowels followed by m, by :, or by both.
uhelp("KEEm", "\u0f40\u0f7b\u0f7e");
uhelp("KEEm:", "\u0f40\u0f7b\u0f7e\u0f7f");
uhelp("KEE:", "\u0f40\u0f7b\u0f7f");
// The same vowels with a leading ' before the vowel.
uhelp("K'I", "\u0f40\u0f71\u0f72");
uhelp("K'O", "\u0f40\u0f71\u0f7c");
uhelp("K'E", "\u0f40\u0f71\u0f7a");
uhelp("K'U", "\u0f40\u0f71\u0f74");
uhelp("K'OO", "\u0f40\u0f71\u0f7d");
uhelp("K'EE", "\u0f40\u0f71\u0f7b");
uhelp("K'EEm", "\u0f40\u0f71\u0f7b\u0f7e");
// NOTE(review): this presumably checks that the longest vowel,
// 'EEm:, stays in one chunk -- confirm against tstHelper's
// parameter list.
tstHelper("K'EEm:", "{K'EEm:}",
new String[] { "{K'EEm:}" },
new String[] { },
"{K'EEm:}");
uhelp("K'EEm:", "\u0f40\u0f71\u0f7b\u0f7e\u0f7f");
uhelp("K'EE:", "\u0f40\u0f71\u0f7b\u0f7f");
uhelp("K'A:", "\u0f40\u0f71\u0f7f");
// DLC FIXME: in ACIP RTF files, (PARENTHESES) seem to make
// text go from 24-point to 18-point. Thus, ACIP->Unicode.txt
// is fundamentally flawed, whereas ACIP->Unicode.rtf is OK.
// A tsheg bar enclosed in a pair of slashes.
uhelp("/NY'EE/", "\u0f3C\u0f49\u0F71\u0F7B\u0f3D");
// A longer run mixing punctuation with several tsheg bars.
uhelp("*#HUm: G+DHOO GRO`;.,", "\u0f04\u0f05\u0f04\u0f05\u0f05\u0f67\u0f74\u0f7e\u0f7f\u0f0b\u0f42\u0fa2\u0f7d\u0f0b\u0f42\u0fb2\u0f7c\u0f08\u0f11\u0f0c\u0f0d");
// "none": this input is expected not to convert at all.
uhelp("*#HUm: K+DHA GRO`;.,", "none");
}
/** Tests some more tsheg bars, these from Dr. Lacey's critical
edition of Mahavyutpatti.

View file

@ -167,4 +167,19 @@ class TPair {
if (null == rightWylie) rightWylie = "";
return leftWylie + rightWylie;
}
/** Appends to sb the Unicode for this pair: first the left half,
 *  then the right half, each looked up with
 *  ACIPRules.getUnicodeFor.  A half that is null, or for which no
 *  Unicode is known, contributes nothing; neither does a right
 *  half that is "-" or "A".  DLC FIXME: which normalization form,
 *  if any?
 *  @param sb the buffer to append to
 *  @param subscribed true if the subscribed form of this pair is
 *  wanted */
void getUnicode(StringBuffer sb, boolean subscribed) {
    String left = getLeft();
    if (left != null) {
        String leftUnicode = ACIPRules.getUnicodeFor(left, subscribed);
        if (leftUnicode != null) {
            sb.append(leftUnicode);
        }
    }

    String right = getRight();
    boolean rightAddsNothing
        = (right == null || "-".equals(right) || "A".equals(right));
    if (rightAddsNothing) {
        return;
    }
    String rightUnicode = ACIPRules.getUnicodeFor(right, subscribed);
    if (rightUnicode != null) {
        sb.append(rightUnicode);
    }
}
}

View file

@ -603,5 +603,16 @@ class TPairList {
}
}
}
/** Appends to sb the Unicode for this stack.  The first pair is
 *  asked for its ordinary form and every later pair for its
 *  subscribed form.  DLC FIXME: which normalization form, if
 *  any?
 *  @param sb the buffer to append to */
void getUnicode(StringBuffer sb) {
    int index = 0;
    while (index < size()) {
        boolean belowTheTop = (index != 0);
        get(index).getUnicode(sb, belowTheTop);
        index++;
    }
}
}
// DLC FIXME: handle 'o' and 'x', e.g. KAo and NYAx.

View file

@ -205,6 +205,15 @@ class TStackList {
throw new IllegalArgumentException("opl (" + opl + ") is bad for this stack list (" + toString() + ")");
return false;
}
/** Builds the Unicode for this tsheg bar by asking each stack, in
 *  order, to append its Unicode.  DLC FIXME: which normalization
 *  form, if any?
 *  @return the concatenated Unicode of all the stacks; never
 *  null */
String getUnicode() {
    StringBuffer unicode = new StringBuffer(size());
    int stackIndex = 0;
    while (stackIndex < size()) {
        get(stackIndex).getUnicode(unicode);
        stackIndex++;
    }
    return unicode.toString();
}
}
class BoolPair {