The ACIP->TMW and ACIP->Unicode conversions now accept Unicode escapes of the form \uXXXX, as in K\u0F84. This mitigates the lack of support for ACIP's backslash, '\\' (the Sanskrit virama), because you can rewrite ACIP {K\} as ACIP {K\u0F84}.
Support for U+F021-U+F0FF, the Private Use Area (PUA) range that the latest EWTS uses, is not provided.
This commit is contained in:
parent
946d8cbc72
commit
dfaae4be93
6 changed files with 845 additions and 16 deletions
|
@ -266,6 +266,8 @@ public class ACIPConverter {
|
|||
throws IOException
|
||||
{
|
||||
try {
|
||||
if (null != tdoc && (toUnicode && !toRTF))
|
||||
throw new Error("Doing both at once might work, but it's not been tested. I bet some 'continue;' statements will need to go.");
|
||||
if (toUnicode && toRTF)
|
||||
throw new Error("FIXME: support this ACIP->Unicode.rtf mode so that KA (GA) shows up in two different font sizes. See RFE 838591.");
|
||||
if (!toUnicode && !toRTF)
|
||||
|
@ -363,7 +365,7 @@ public class ACIPConverter {
|
|||
warnings.append('\n');
|
||||
}
|
||||
} else {
|
||||
if (s.isLatin(stype)) {
|
||||
if (s.isLatin()) {
|
||||
lastGuyWasNonPunct = false;
|
||||
lastGuy = null;
|
||||
String text
|
||||
|
@ -576,7 +578,7 @@ public class ACIPConverter {
|
|||
tdoc.appendRoman(tdocLocation[0], s.getText(),
|
||||
Color.BLACK);
|
||||
tdocLocation[0] += s.getText().length();
|
||||
continue;
|
||||
continue; // FIXME: this means the unicode above doesn't go into the output if null != writer && null != tdoc?
|
||||
} else {
|
||||
String wy = ACIPRules.getWylieForACIPOther(s.getText());
|
||||
if (null == wy) throw new Error("No wylie for ACIP " + s.getText());
|
||||
|
@ -594,6 +596,24 @@ public class ACIPConverter {
|
|||
tdoc.setTibetanFontSize(regularFontSize);
|
||||
}
|
||||
continue;
|
||||
} else if (stype == TString.UNICODE_CHARACTER) {
|
||||
if (null != writer) {
|
||||
unicode = s.getText();
|
||||
}
|
||||
if (null != tdoc) {
|
||||
duff = TibetanMachineWeb.mapUnicodeToTMW(s.getText().charAt(0));
|
||||
if (null == duff) {
|
||||
hasErrors = true;
|
||||
String errorMessage = "[#ERROR CONVERTING ACIP DOCUMENT: The Unicode escape with ordinal " + (int)s.getText().charAt(0) + " does not match up with any TibetanMachineWeb glyph.]";
|
||||
tdoc.appendRoman(tdocLocation[0],
|
||||
errorMessage,
|
||||
Color.RED);
|
||||
tdocLocation[0] += errorMessage.length();
|
||||
if (null != errors)
|
||||
errors.append(errorMessage + "\n");
|
||||
continue; // FIXME: if null != writer, we dropped some output.
|
||||
}
|
||||
}
|
||||
} else {
|
||||
throw new Error("forgot a case");
|
||||
}
|
||||
|
|
|
@ -23,6 +23,7 @@ import java.util.ArrayList;
|
|||
import java.util.Stack;
|
||||
|
||||
import org.thdl.util.ThdlDebug;
|
||||
import org.thdl.util.ThdlOptions;
|
||||
|
||||
/**
|
||||
* This class is able to break up Strings of ACIP text (for example, an
|
||||
|
@ -903,11 +904,31 @@ public class ACIPTshegBarScanner {
|
|||
errors.append("Offset " + i + ((numNewlines == 0) ? "" : (" or maybe " + (i-numNewlines))) + ": "
|
||||
+ "Found an illegal, unprintable character.\n");
|
||||
} else if ('\\' == ch) {
|
||||
al.add(new TString("Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.",
|
||||
TString.ERROR));
|
||||
if (null != errors)
|
||||
errors.append("Offset " + i + ((numNewlines == 0) ? "" : (" or maybe " + (i-numNewlines))) + ": "
|
||||
+ "Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.\n");
|
||||
int x = -1;
|
||||
if (!ThdlOptions.getBooleanOption("thdl.tib.text.disallow.unicode.character.escapes.in.acip")
|
||||
&& i + 5 < sl && 'u' == s.charAt(i+1)) {
|
||||
try {
|
||||
if (!((x = Integer.parseInt(s.substring(i+2, i+6), 16)) >= 0x0000 && x <= 0xFFFF))
|
||||
x = -1;
|
||||
} catch (NumberFormatException e) {
|
||||
// Though this is unlikely to be
|
||||
// legal, we allow it through.
|
||||
// (FIXME: warn.)
|
||||
}
|
||||
}
|
||||
if (x >= 0) {
|
||||
al.add(new TString(new String(new char[] { (char)x }),
|
||||
TString.UNICODE_CHARACTER));
|
||||
i += "uXXXX".length();
|
||||
startOfString = i+1;
|
||||
break;
|
||||
} else {
|
||||
al.add(new TString("Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.",
|
||||
TString.ERROR));
|
||||
if (null != errors)
|
||||
errors.append("Offset " + i + ((numNewlines == 0) ? "" : (" or maybe " + (i-numNewlines))) + ": "
|
||||
+ "Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.\n");
|
||||
}
|
||||
} else {
|
||||
al.add(new TString("Found an illegal character, " + ch + ", with ordinal " + (int)ch + ".",
|
||||
TString.ERROR));
|
||||
|
|
|
@ -19,6 +19,8 @@ Contributor(s): ______________________________________.
|
|||
package org.thdl.tib.text.ttt;
|
||||
|
||||
import org.thdl.util.ThdlOptions;
|
||||
import org.thdl.util.ThdlDebug;
|
||||
import org.thdl.tib.text.tshegbar.UnicodeUtils;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.io.*;
|
||||
|
@ -35,16 +37,19 @@ public class TString {
|
|||
private int type;
|
||||
private String text;
|
||||
|
||||
/** Returns true if and only if an TString with type type is to
|
||||
* be converted to Latin, not Tibetan, text. */
|
||||
public static boolean isLatin(int type) {
|
||||
/** Returns true if and only if a TString with type <i>type</i>
|
||||
* is to be converted to something other than Tibetan text.
|
||||
* (Chinese Unicode, Latin, etc. all qualify as non-Tibetan.) */
|
||||
public boolean isLatin() {
|
||||
return (type != TIBETAN_NON_PUNCTUATION
|
||||
&& type != TIBETAN_PUNCTUATION
|
||||
&& type != TSHEG_BAR_ADORNMENT
|
||||
&& type != START_PAREN
|
||||
&& type != END_PAREN
|
||||
&& type != START_SLASH
|
||||
&& type != END_SLASH);
|
||||
&& type != END_SLASH
|
||||
&& (type != UNICODE_CHARACTER
|
||||
|| !UnicodeUtils.isInTibetanRange(getText().charAt(0))));
|
||||
}
|
||||
|
||||
/** For ACIP [#COMMENTS] and EWTS (DLC FIXME: what are EWTS comments?) */
|
||||
|
@ -87,13 +92,15 @@ public class TString {
|
|||
public static final int WARNING = 17;
|
||||
/** For ACIP %, o, and x or EWTS (DLC FIXME: what are EWTS adornments?) */
|
||||
public static final int TSHEG_BAR_ADORNMENT = 18;
|
||||
/** For "\\uMNOP", this TString will contain the string that has
|
||||
just the sole character "\\uMNOP". */
|
||||
public static final int UNICODE_CHARACTER = 19;
|
||||
/** For things that are not legal syntax, such as a file that
|
||||
* contains just "[# HALF A COMMEN" */
|
||||
public static final int ERROR = 19;
|
||||
* contains just "[# HALF A COMMEN". THIS MUST COME LAST. */
|
||||
public static final int ERROR = 20;
|
||||
|
||||
/** Returns true if and only if this string is Latin (usually
|
||||
* English). Returns false if this string is transliteration of
|
||||
* Tibetan. */
|
||||
/** Returns the type of this string, which is one of the
|
||||
enumerated integer static final members of this class. */
|
||||
public int getType() {
|
||||
return type;
|
||||
}
|
||||
|
@ -126,6 +133,8 @@ public class TString {
|
|||
String ftext = (TIBETAN_NON_PUNCTUATION == type)
|
||||
? MidLexSubstitution.getFinalValueForTibetanNonPunctuationToken(text)
|
||||
: text;
|
||||
// FIXME: assert this
|
||||
ThdlDebug.verify(type != UNICODE_CHARACTER || text.length() == 1);
|
||||
setText(ftext);
|
||||
if ((outputAllTshegBars || outputUniqueTshegBars) && TIBETAN_NON_PUNCTUATION == type)
|
||||
outputTshegBar(ftext);
|
||||
|
@ -182,6 +191,7 @@ public class TString {
|
|||
if (type == END_PAREN) typeString = "END_PAREN";
|
||||
if (type == WARNING) typeString = "WARNING";
|
||||
if (type == TSHEG_BAR_ADORNMENT) typeString = "TSHEG_BAR_ADORNMENT";
|
||||
if (type == UNICODE_CHARACTER) typeString = "UNICODE_CHARACTER";
|
||||
if (type == ERROR) typeString = "ERROR";
|
||||
return typeString + ":{" + getText() + "}";
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue