Changed the converters so that the Unicode non-breaking tsheg (U+0F0C) is represented in Wylie by a non-breaking space (U+00A0) instead of "*".
This commit is contained in:
parent
ffb32b3207
commit
835e74c0cd
7 changed files with 56 additions and 23 deletions
|
@ -46,7 +46,6 @@ public class BasicTibetanTranscriptionConverter implements FontConverterConstant
|
||||||
private static final int WYLIE_TO_ACIP=2;
|
private static final int WYLIE_TO_ACIP=2;
|
||||||
private static final int UNICODE_TO_WYLIE=3;
|
private static final int UNICODE_TO_WYLIE=3;
|
||||||
private static final int WYLIE_TO_UNICODE=4;
|
private static final int WYLIE_TO_UNICODE=4;
|
||||||
private static final int TIBETAN_UNICODE_RANGE[] = {3840, 4095};
|
|
||||||
|
|
||||||
/** Converts from the Acip transliteration scheme to EWTS.*/
|
/** Converts from the Acip transliteration scheme to EWTS.*/
|
||||||
public static String acipToWylie(String acip)
|
public static String acipToWylie(String acip)
|
||||||
|
@ -253,19 +252,7 @@ public class BasicTibetanTranscriptionConverter implements FontConverterConstant
|
||||||
nuevaPalabra = Manipulate.fixWazur(nuevaPalabra);
|
nuevaPalabra = Manipulate.fixWazur(nuevaPalabra);
|
||||||
return nuevaPalabra;*/
|
return nuevaPalabra;*/
|
||||||
}
|
}
|
||||||
|
|
||||||
private static int getTibetanUnicodeStart(String unicode, int pos)
|
|
||||||
{
|
|
||||||
for(; pos < unicode.length(); pos++ ) if(unicode.codePointAt(pos)>=TIBETAN_UNICODE_RANGE[0] && unicode.codePointAt(pos)<=TIBETAN_UNICODE_RANGE[1]) return pos;
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
|
|
||||||
private static int getTibetanUnicodeEnd(String unicode, int pos)
|
|
||||||
{
|
|
||||||
for(; pos < unicode.length(); pos++ ) if(unicode.codePointAt(pos)<TIBETAN_UNICODE_RANGE[0] || unicode.codePointAt(pos)>TIBETAN_UNICODE_RANGE[1]) return pos;
|
|
||||||
return pos;
|
|
||||||
}
|
|
||||||
|
|
||||||
/** Converts Tibetan Unicode to EWTS. */
|
/** Converts Tibetan Unicode to EWTS. */
|
||||||
public static String unicodeToWylie(String unicode)
|
public static String unicodeToWylie(String unicode)
|
||||||
{
|
{
|
||||||
|
@ -274,9 +261,9 @@ public class BasicTibetanTranscriptionConverter implements FontConverterConstant
|
||||||
TibetanDocument tibDoc;
|
TibetanDocument tibDoc;
|
||||||
StringBuffer errors;
|
StringBuffer errors;
|
||||||
int posStart=0, posEnd;
|
int posStart=0, posEnd;
|
||||||
while((posStart = getTibetanUnicodeStart(unicode, posStart))>=0)
|
while((posStart = Manipulate.getTibetanUnicodeStart(unicode, posStart))>=0)
|
||||||
{
|
{
|
||||||
posEnd = getTibetanUnicodeEnd(unicode, posStart+1);
|
posEnd = Manipulate.getTibetanUnicodeEnd(unicode, posStart+1);
|
||||||
startString = unicode.substring(0, posStart);
|
startString = unicode.substring(0, posStart);
|
||||||
tibetanString = unicode.substring(posStart, posEnd);
|
tibetanString = unicode.substring(posStart, posEnd);
|
||||||
endString = unicode.substring(posEnd);
|
endString = unicode.substring(posEnd);
|
||||||
|
|
|
@ -28,6 +28,7 @@ public class Manipulate
|
||||||
private static String bracketMarks = "<>(){}[]";
|
private static String bracketMarks = "<>(){}[]";
|
||||||
private static String endOfSyllableMarks = " _\t";
|
private static String endOfSyllableMarks = " _\t";
|
||||||
private static String allStopMarkers = endOfSyllableMarks + endOfParagraphMarks + bracketMarks;
|
private static String allStopMarkers = endOfSyllableMarks + endOfParagraphMarks + bracketMarks;
|
||||||
|
private static final int TIBETAN_UNICODE_RANGE[] = {3840, 4095};
|
||||||
|
|
||||||
/* public static String[] parseFields (String s, char delimiter)
|
/* public static String[] parseFields (String s, char delimiter)
|
||||||
{
|
{
|
||||||
|
@ -204,6 +205,18 @@ public class Manipulate
|
||||||
return ch>=0xF00 && ch<=0xFFF;
|
return ch>=0xF00 && ch<=0xFFF;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static boolean isTibetanUnicodeLetter(char ch)
|
||||||
|
{
|
||||||
|
|
||||||
|
return ch>=0xF40 && ch<=0xFBC || ch>=0xF00 && ch<=0xF03;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static boolean isTibetanUnicodeDigit(char ch)
|
||||||
|
{
|
||||||
|
|
||||||
|
return ch>=0xF20 && ch<=0xF33;
|
||||||
|
}
|
||||||
|
|
||||||
public static boolean guessIfUnicode(String line)
|
public static boolean guessIfUnicode(String line)
|
||||||
{
|
{
|
||||||
char ch;
|
char ch;
|
||||||
|
@ -415,4 +428,36 @@ public class Manipulate
|
||||||
}
|
}
|
||||||
return ncr.toString();
|
return ncr.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static String unescape(String s) {
|
||||||
|
int i=0,len=s.length();
|
||||||
|
char c;
|
||||||
|
StringBuffer sb = new StringBuffer(len);
|
||||||
|
while (i<len) {
|
||||||
|
c = s.charAt(i++);
|
||||||
|
if (c=='\\') {
|
||||||
|
if (i<len) {
|
||||||
|
c = s.charAt(i++);
|
||||||
|
if (c=='u') {
|
||||||
|
c = (char) Integer.parseInt(s.substring(i,i+4),16);
|
||||||
|
i += 4;
|
||||||
|
} // add other cases here as desired...
|
||||||
|
}} // fall through: \ escapes itself, quotes any character but u
|
||||||
|
sb.append(c);
|
||||||
|
}
|
||||||
|
return sb.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
public static int getTibetanUnicodeStart(String unicode, int pos)
|
||||||
|
{
|
||||||
|
for(; pos < unicode.length(); pos++ ) if(unicode.codePointAt(pos)>=TIBETAN_UNICODE_RANGE[0] && unicode.codePointAt(pos)<=TIBETAN_UNICODE_RANGE[1]) return pos;
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static int getTibetanUnicodeEnd(String unicode, int pos)
|
||||||
|
{
|
||||||
|
for(; pos < unicode.length(); pos++ ) if(unicode.codePointAt(pos)<TIBETAN_UNICODE_RANGE[0] || unicode.codePointAt(pos)>TIBETAN_UNICODE_RANGE[1]) return pos;
|
||||||
|
return pos;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -40,6 +40,7 @@ import org.thdl.util.ThdlDebug;
|
||||||
import org.thdl.util.ThdlOptions;
|
import org.thdl.util.ThdlOptions;
|
||||||
import org.thdl.util.Trie;
|
import org.thdl.util.Trie;
|
||||||
import org.thdl.tib.scanner.BasicTibetanTranscriptionConverter;
|
import org.thdl.tib.scanner.BasicTibetanTranscriptionConverter;
|
||||||
|
import org.thdl.tib.scanner.Manipulate;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Interfaces between Extended Wylie and the TibetanMachineWeb fonts.
|
* Interfaces between Extended Wylie and the TibetanMachineWeb fonts.
|
||||||
|
@ -221,7 +222,7 @@ public class TibetanMachineWeb implements THDLWylieConstants {
|
||||||
/** comma-delimited list of supported punctuation and
|
/** comma-delimited list of supported punctuation and
|
||||||
miscellaneous characters: */
|
miscellaneous characters: */
|
||||||
private static final String others
|
private static final String others
|
||||||
= "_, ,/,|,!,:,;,@,#,$,%,(,),H,M,&,@#,?,=,{,},*,~X,X"; // FIXME: not yet supporting all these...
|
= "_, ,/,|,!,:,;,@,#,$,%,(,),H,M,&,@#,?,=,{,},\u00A0,~X,X"; // FIXME: not yet supporting all these...
|
||||||
|
|
||||||
/** comma-delimited list of supported vowels: */
|
/** comma-delimited list of supported vowels: */
|
||||||
private static final String vowels
|
private static final String vowels
|
||||||
|
@ -760,7 +761,7 @@ public class TibetanMachineWeb implements THDLWylieConstants {
|
||||||
+ DELIMITER
|
+ DELIMITER
|
||||||
+ " which means that no Wylie is assigned. That isn't supported.");
|
+ " which means that no Wylie is assigned. That isn't supported.");
|
||||||
if (hashOn) {
|
if (hashOn) {
|
||||||
tibHash.put(wylie, duffCodes);
|
tibHash.put(Manipulate.unescape(wylie), duffCodes);
|
||||||
}
|
}
|
||||||
if (isTibetan) {
|
if (isTibetan) {
|
||||||
// Delete the dashes:
|
// Delete the dashes:
|
||||||
|
@ -783,7 +784,7 @@ public class TibetanMachineWeb implements THDLWylieConstants {
|
||||||
+ " has a line with wylie " + wylie + " but no TMW; that's not allowed");
|
+ " has a line with wylie " + wylie + " but no TMW; that's not allowed");
|
||||||
int font = duffCodes[TMW].getFontNum();
|
int font = duffCodes[TMW].getFontNum();
|
||||||
int code = duffCodes[TMW].getCharNum()-32;
|
int code = duffCodes[TMW].getCharNum()-32;
|
||||||
toHashKey[font][code] = wylie;
|
toHashKey[font][code] = Manipulate.unescape(wylie);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -113,7 +113,7 @@ __TILDE__M`~242,1~~8,95~~~~~~~0F82
|
||||||
// dzud.rtags.me.long.can:
|
// dzud.rtags.me.long.can:
|
||||||
\u0F13~94,5~~9,92~~~~~~~0F13
|
\u0F13~94,5~~9,92~~~~~~~0F13
|
||||||
// hard tsheg:
|
// hard tsheg:
|
||||||
*~205,1~~1,108~~~~~~~0F0C
|
\u00A0~205,1~~1,108~~~~~~~0F0C
|
||||||
|
|
||||||
|
|
||||||
<?Input:Tibetan?>
|
<?Input:Tibetan?>
|
||||||
|
|
|
@ -85,7 +85,7 @@ public class UnicodeCodepointToThdlWylie {
|
||||||
case '\u0F09': return "\\u0F09";
|
case '\u0F09': return "\\u0F09";
|
||||||
case '\u0F0A': return "\\u0F0A";
|
case '\u0F0A': return "\\u0F0A";
|
||||||
case '\u0F0B': return " ";
|
case '\u0F0B': return " ";
|
||||||
case '\u0F0C': return "*"; // DLC NOW: Jskad does not support this!
|
case '\u0F0C': return "\\u00A0"; // AMP: Non-break space. Does Jskad support this?
|
||||||
case '\u0F0D': return "/";
|
case '\u0F0D': return "/";
|
||||||
case '\u0F0E': return "//"; // DLC FIXME: this is kind of a hack-- the Unicode standard says the spacing for this construct is different than the spacing for "\u0F0D\u0F0D"
|
case '\u0F0E': return "//"; // DLC FIXME: this is kind of a hack-- the Unicode standard says the spacing for this construct is different than the spacing for "\u0F0D\u0F0D"
|
||||||
case '\u0F0F': return ";";
|
case '\u0F0F': return ";";
|
||||||
|
|
|
@ -115,7 +115,7 @@ class EWTSTshegBarScanner extends TTshegBarScanner {
|
||||||
|| (sb.charAt(i) >= '\u0fcf' && sb.charAt(i) <= '\u0fd1')
|
|| (sb.charAt(i) >= '\u0fcf' && sb.charAt(i) <= '\u0fd1')
|
||||||
|| (THDLWylieConstants.SAUVASTIKA == sb.charAt(i))
|
|| (THDLWylieConstants.SAUVASTIKA == sb.charAt(i))
|
||||||
|| (THDLWylieConstants.SWASTIKA == sb.charAt(i))
|
|| (THDLWylieConstants.SWASTIKA == sb.charAt(i))
|
||||||
|| (" /;|!:=_@#$%<>(){}*&\r\n\t\u0f36\u0f38\u0f89\u0f8a\u0f8b".indexOf(sb.charAt(i))
|
|| (" /;|!:=_@#$%<>(){}*&\r\n\t\u0f36\u0f38\u0f89\u0f8a\u0f8b\u00a0".indexOf(sb.charAt(i))
|
||||||
>= 0)) {
|
>= 0)) {
|
||||||
al.add(new TString("EWTS", sb.substring(i, i+1),
|
al.add(new TString("EWTS", sb.substring(i, i+1),
|
||||||
TString.TIBETAN_PUNCTUATION));
|
TString.TIBETAN_PUNCTUATION));
|
||||||
|
|
|
@ -90,7 +90,7 @@ public class Trie
|
||||||
{
|
{
|
||||||
|
|
||||||
/** Size of the m_nextChar array. */
|
/** Size of the m_nextChar array. */
|
||||||
public static final int ALPHA_SIZE = 128;
|
public static final int ALPHA_SIZE = 161;
|
||||||
|
|
||||||
/** The root node of the tree. */
|
/** The root node of the tree. */
|
||||||
Node m_Root;
|
Node m_Root;
|
||||||
|
|
Loading…
Reference in a new issue