The ACIP {NYA%} is supported. {NYAo} and {NYAx} are confusing to me,
because I don't know which glyphs o and x correspond to. For that reason, they cause ERRORs. The proposed THDL Extended Wylie ~X and X are now used for U+0F35 and U+0F37 respectively.
This commit is contained in:
		
							parent
							
								
									f57cdda867
								
							
						
					
					
						commit
						07e360d9a8
					
				
					 8 changed files with 96 additions and 32 deletions
				
			
		|  | @ -144,7 +144,7 @@ public class TMW_RTF_TO_THDL_WYLIETest extends TestCase { | |||
|     /** Tests the --to-wylie converter mode of {@link | ||||
|      *  org.thdl.tib.input.TibetanConverter}. */ | ||||
|     public void testConverterMode() { | ||||
|         helper("--to-wylie", "Conversion", 44); | ||||
|         helper("--to-wylie", "Conversion", 0); | ||||
|     } | ||||
| 
 | ||||
|     /** Tests the --to-tibetan-machine converter mode of {@link | ||||
|  |  | |||
|  | @ -37,8 +37,8 @@ rgyal ba kun dngos mtsho skyes rdo rje bstan pa'i rtsa lag thams cad mkhyen pa z | |||
| bka' drin gzugs can dbyig 'dzin lto 'dir shong 'gyur min na kun mkhyen srang las gang gis gzhal//\par | ||||
| \par | ||||
| li khri'i lcug phran mkhyen pa'i snang ba can//\par | ||||
| 'jam mgon blo<<[[JSKAD_TMW_TO_WYLIE_ERROR_NO_SUCH_WYLIE: Cannot convert DuffCode <duffcode font=TibetanMachineWeb8 charNum=101 character=e/> to THDL Extended Wylie.  Please see the documentation for the TMW font and transcribe this yourself.]]>>. yi lang tsho baza<<[[JSKAD_TMW_TO_WYLIE_ERROR_NO_SUCH_WYLIE: Cannot convert DuffCode <duffcode font=TibetanMachineWeb8 charNum=101 character=e/> to THDL Extended Wylie.  Please see the documentation for the TMW font and transcribe this yourself.]]>>.nga po'i tshon//\par | ||||
| kha dog so sor bkra ba'i graga<<[[JSKAD_TMW_TO_WYLIE_ERROR_NO_SUCH_WYLIE: Cannot convert DuffCode <duffcode font=TibetanMachineWeb8 charNum=101 character=e/> to THDL Extended Wylie.  Please see the documentation for the TMW font and transcribe this yourself.]]>>.sa pa<<[[JSKAD_TMW_TO_WYLIE_ERROR_NO_SUCH_WYLIE: Cannot convert DuffCode <duffcode font=TibetanMachineWeb8 charNum=101 character=e/> to THDL Extended Wylie.  Please see the documentation for the TMW font and transcribe this yourself.]]>>.'i rgyan//\par | ||||
| 'jam mgon bloX. yi lang tsho bazaX.nga po'i tshon//\par | ||||
| kha dog so sor bkra ba'i gragaX.sa paX.'i rgyan//\par | ||||
| phyogs bral rna lung 'god mkhas rtag tu rgyal//\f2\fs44\i0\b0\ul0\cf0\par | ||||
| \par | ||||
| \f1\fs28\i0\b0\ul0 dpal ldan chos kyi rang bzhin ngos yangs par//\par | ||||
|  |  | |||
|  | @ -5,6 +5,10 @@ | |||
| //   - initial // marks a comment | ||||
| //   - blank lines should be ignored | ||||
| //   - <?x?> marks a command | ||||
| // | ||||
| // If you change the Wylie here, it can break the ACIP->TMW and | ||||
| // ACIP->Unicode conversion.  So keep ACIPRules in sync with this, and be | ||||
| // sure to run 'ant clean check' after your change. | ||||
| 
 | ||||
| <?Input:Punctuation?> | ||||
| //_~32,1~0,32 | ||||
|  | @ -34,6 +38,10 @@ $~38,5~~9,41~~~~~~~0F06 | |||
| // dbu.khang.g-yas: (If this changes, edit ACIPConverter) | ||||
| )~209,1~~9,94~~~~~~~0F3D | ||||
| H~239,1~~8,92~~~~~~~0F7F | ||||
| // mtshan.rtags: | ||||
| X~101,5~~9,101~~~~~~~0F37 | ||||
| // mtshan.rtags zhes.sa: | ||||
| __TILDE__X~102,5~~9,102~~~~~~~0F35 | ||||
| 
 | ||||
| // 8,91 is the small bindu.  We say that this, and not 8,90 (large | ||||
| // anusvara) is the glyph that M yields.  This is because [8,90] is | ||||
|  | @ -971,10 +979,6 @@ r~176,4~~8,71~~~~~~~0FB2 | |||
| // mchan rtags leading: | ||||
| \tmw8100~100,5~~9,100~~~~~~~none | ||||
| 
 | ||||
| // mtshan.rtags: | ||||
| \tmw8101~101,5~~9,101~~~~~~~0F37 | ||||
| // mtshan.rtags zhes.sa: | ||||
| \tmw8102~102,5~~9,102~~~~~~~0F35 | ||||
| // che.mgo: | ||||
| \tmw8103~103,5~~9,103~~~~~~~0F38 | ||||
| // kuruka: | ||||
|  |  | |||
|  | @ -186,10 +186,10 @@ public class ACIPConverter { | |||
|         ByteArrayOutputStream sw = new ByteArrayOutputStream(); | ||||
|         ArrayList al = ACIPTshegBarScanner.scan(acip, errors, -1); | ||||
|         try { | ||||
|             if (null != al | ||||
|                 && convertToUnicode(al, sw, errors, | ||||
|             if (null != al) { | ||||
|                 convertToUnicode(al, sw, errors, | ||||
|                                  warnings, writeWarningsToResult, | ||||
|                                     warningLevel)) { | ||||
|                                  warningLevel); | ||||
|                 return sw.toString("UTF-8"); | ||||
|             } else { | ||||
|                 return null; | ||||
|  | @ -282,6 +282,33 @@ public class ACIPConverter { | |||
|                 String text = "[#ERROR CONVERTING ACIP DOCUMENT: Lexical error: " + s.getText() + "]"; | ||||
|                 if (null != writer) writer.write(text); | ||||
|                 if (null != tdoc) tdoc.appendRoman(text, Color.RED); | ||||
|             } else if (stype == ACIPString.TSHEG_BAR_ADORNMENT) { | ||||
|                 if (lastGuyWasNonPunct) { | ||||
|                     String err = "[#ERROR CONVERTING ACIP DOCUMENT: This converter cannot yet convert " + s.getText() + " because the converter's author is unclear what the result should be.]"; | ||||
|                     if (null != writer) { | ||||
|                         String uni = ACIPRules.getUnicodeFor(s.getText(), false); | ||||
|                         if (null == uni) { | ||||
|                             hasErrors = true; | ||||
|                             uni = err; | ||||
|                         } | ||||
|                         if (null != writer) writer.write(uni); | ||||
|                     } | ||||
|                     if (null != tdoc) { | ||||
|                         String wylie | ||||
|                             = ACIPRules.getWylieForACIPOther(s.getText()); | ||||
|                         if (null == wylie) { | ||||
|                             hasErrors = true; | ||||
|                             tdoc.appendRoman(err, Color.RED); | ||||
|                         } else { | ||||
|                             tdoc.appendDuffCodes(new DuffCode[] { TibetanMachineWeb.getGlyph(wylie) }, | ||||
|                                                  Color.BLACK); | ||||
|                         } | ||||
|                     } | ||||
|                 } else { | ||||
|                     hasErrors = true; | ||||
|                 } | ||||
|                 lastGuyWasNonPunct = true; // this stuff is not really punctuation | ||||
|                 lastGuy = null; | ||||
|             } else if (stype == ACIPString.WARNING) { | ||||
|                 lastGuyWasNonPunct = false; | ||||
|                 lastGuy = null; | ||||
|  |  | |||
|  | @ -236,7 +236,7 @@ class ACIPRules { | |||
|             acipOther2wylie.put(";", ";"); | ||||
|             acipOther2wylie.put("*", "@"); | ||||
|             acipOther2wylie.put("#", "@#"); | ||||
|             acipOther2wylie.put("%", "%"); | ||||
|             acipOther2wylie.put("%", "~X"); | ||||
|             acipOther2wylie.put("&", "&"); | ||||
| 
 | ||||
|             acipOther2wylie.put("0", "0"); | ||||
|  |  | |||
|  | @ -77,9 +77,11 @@ public class ACIPString { | |||
|     public static final int END_PAREN = 16; | ||||
|     /** For things that may not be legal syntax, such as {KA . KHA} */ | ||||
|     public static final int WARNING = 17; | ||||
|     /** For ACIP %, o, and x */ | ||||
|     public static final int TSHEG_BAR_ADORNMENT = 18; | ||||
|     /** For things that are not legal syntax, such as a file that | ||||
|      * contains just "[# HALF A COMMEN" */ | ||||
|     public static final int ERROR = 18; | ||||
|     public static final int ERROR = 19; | ||||
| 
 | ||||
|     /** Returns true if and only if this string is Latin (usually | ||||
|      *  English).  Returns false if this string is transliteration of | ||||
|  | @ -135,6 +137,7 @@ public class ACIPString { | |||
|         if (type == START_PAREN) typeString = "START_PAREN"; | ||||
|         if (type == END_PAREN) typeString = "END_PAREN"; | ||||
|         if (type == WARNING) typeString = "WARNING"; | ||||
|         if (type == TSHEG_BAR_ADORNMENT) typeString = "TSHEG_BAR_ADORNMENT"; | ||||
|         if (type == ERROR) typeString = "ERROR"; | ||||
|         return typeString + ":{" + getText() + "}"; | ||||
|     } | ||||
|  |  | |||
|  | @ -767,9 +767,16 @@ public class ACIPTshegBarScanner { | |||
|             case ';': | ||||
|             case '`': | ||||
|             case '#': | ||||
|             case '%': | ||||
|             case 'x': | ||||
|             case 'o': | ||||
| 
 | ||||
|                 boolean legalTshegBarAdornment = false; | ||||
|                 // The tsheg bar ends here; new token. | ||||
|                 if (startOfString < i) { | ||||
|                     if (currentType == ACIPString.TIBETAN_NON_PUNCTUATION | ||||
|                         && isTshegBarAdornment(ch)) | ||||
|                         legalTshegBarAdornment = true; | ||||
|                     al.add(new ACIPString(s.substring(startOfString, i), | ||||
|                                           currentType)); | ||||
|                 } | ||||
|  | @ -780,7 +787,8 @@ public class ACIPTshegBarScanner { | |||
|                 if (('\r' == ch | ||||
|                      || ('\n' == ch && i > 0 && s.charAt(i - 1) != '\r')) | ||||
|                     && !al.isEmpty() | ||||
|                     && ((ACIPString)al.get(al.size() - 1)).getType() == ACIPString.TIBETAN_NON_PUNCTUATION) { | ||||
|                     && (((ACIPString)al.get(al.size() - 1)).getType() == ACIPString.TIBETAN_NON_PUNCTUATION | ||||
|                         || ((ACIPString)al.get(al.size() - 1)).getType() == ACIPString.TSHEG_BAR_ADORNMENT)) { | ||||
|                     al.add(new ACIPString(" ", ACIPString.TIBETAN_PUNCTUATION)); | ||||
|                 } | ||||
| 
 | ||||
|  | @ -788,7 +796,8 @@ public class ACIPTshegBarScanner { | |||
|                 if (('\r' == ch | ||||
|                      || ('\n' == ch && i > 0 && s.charAt(i - 1) != '\r')) | ||||
|                     && !al.isEmpty() | ||||
|                     && ((ACIPString)al.get(al.size() - 1)).getType() == ACIPString.TIBETAN_PUNCTUATION | ||||
|                     && (((ACIPString)al.get(al.size() - 1)).getType() == ACIPString.TIBETAN_PUNCTUATION | ||||
|                         || ((ACIPString)al.get(al.size() - 1)).getType() == ACIPString.TSHEG_BAR_ADORNMENT) | ||||
|                     && ((ACIPString)al.get(al.size() - 1)).getText().equals(",") | ||||
|                     && s.charAt(i-1) == ',' | ||||
|                     && (i + (('\r' == ch) ? 2 : 1) < sl | ||||
|  | @ -804,9 +813,17 @@ public class ACIPTshegBarScanner { | |||
|                     || (realNewline | ||||
|                         = ((rn = ('\n' == ch && i >= 3 && s.charAt(i-3) == '\r' && s.charAt(i-2) == '\n' && s.charAt(i-1) == '\r')) | ||||
|                            || ('\n' == ch && i >= 1 && s.charAt(i-1) == '\n')))) { | ||||
|                     for (int h = 0; h < (realNewline ? 2 : 1); h++) | ||||
|                     for (int h = 0; h < (realNewline ? 2 : 1); h++) { | ||||
|                         if (isTshegBarAdornment(ch) && !legalTshegBarAdornment) { | ||||
|                             al.add(new ACIPString("The ACIP " + ch + " must be glued to the end of a tsheg bar, but this one was not", | ||||
|                                                   ACIPString.ERROR)); | ||||
|                         } else { | ||||
|                             al.add(new ACIPString(rn ? s.substring(i - 1, i+1) : s.substring(i, i+1), | ||||
|                                               ACIPString.TIBETAN_PUNCTUATION)); | ||||
|                                                   (legalTshegBarAdornment | ||||
|                                                    ? ACIPString.TSHEG_BAR_ADORNMENT | ||||
|                                                    : ACIPString.TIBETAN_PUNCTUATION))); | ||||
|                         } | ||||
|                     } | ||||
|                 } | ||||
|                 startOfString = i+1; | ||||
|                 currentType = ACIPString.ERROR; | ||||
|  | @ -910,15 +927,17 @@ public class ACIPTshegBarScanner { | |||
|         return ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n'; | ||||
|     } | ||||
| 
 | ||||
|     /** Returns true if and only if ch is '%', 'o', or 'x', the | ||||
|      *  characters this scanner treats as tsheg bar adornments. */ | ||||
|     private static boolean isTshegBarAdornment(char ch) { | ||||
|         return (ch == '%' || ch == 'o' || ch == 'x'); | ||||
|     } | ||||
| 
 | ||||
|     /** See implementation. */ | ||||
|     private static boolean isAlpha(char ch) { | ||||
|         return ch == '\'' // 23rd consonant | ||||
| 
 | ||||
|             // combining punctuation, vowels: | ||||
|             || ch == '%' | ||||
|             || ch == 'o' | ||||
|             || ch == 'm' | ||||
|             || ch == 'x' | ||||
|             || ch == ':' | ||||
|             || ch == '^' | ||||
|             // DLC FIXME: we must treat this guy like a vowel, a special vowel that numerals can take on.  Until then, warn.            || ch == '\\' | ||||
|  |  | |||
|  | @ -7171,7 +7171,13 @@ tstHelper("ZUR"); | |||
|               "[TIBETAN_NON_PUNCTUATION:{K}, ERROR:{Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly.  Sorry!  Please do complain to the maintainers.}, TIBETAN_PUNCTUATION:{,}]"); | ||||
| 
 | ||||
| 
 | ||||
|         shelp("MTHAR%", "", "[TIBETAN_NON_PUNCTUATION:{MTHAR%}]"); | ||||
|         shelp("MTHAR%", "", "[TIBETAN_NON_PUNCTUATION:{MTHAR}, TSHEG_BAR_ADORNMENT:{%}]"); | ||||
|         shelp("MTHARo", "", "[TIBETAN_NON_PUNCTUATION:{MTHAR}, TSHEG_BAR_ADORNMENT:{o}]"); | ||||
|         shelp("MTHARx", "", "[TIBETAN_NON_PUNCTUATION:{MTHAR}, TSHEG_BAR_ADORNMENT:{x}]"); | ||||
| 
 | ||||
|         shelp("MTHAR\n%", "", "[TIBETAN_NON_PUNCTUATION:{MTHAR}, TIBETAN_PUNCTUATION:{ }, ERROR:{The ACIP % must be glued to the end of a tsheg bar, but this one was not}]"); | ||||
|         shelp("MTHAR x", "", "[TIBETAN_NON_PUNCTUATION:{MTHAR}, TIBETAN_PUNCTUATION:{ }, ERROR:{The ACIP x must be glued to the end of a tsheg bar, but this one was not}]"); | ||||
| 
 | ||||
|         shelp("PHYIR;", "", "[TIBETAN_NON_PUNCTUATION:{PHYIR}, TIBETAN_PUNCTUATION:{;}]"); | ||||
|         shelp("......,DAM ", | ||||
|               "", | ||||
|  | @ -7254,6 +7260,10 @@ tstHelper("ZUR"); | |||
|     } | ||||
| 
 | ||||
|     public void testACIPConversion() { | ||||
|         uhelp("KA%\nKHA", "\u0f40\u0f35\u0f0b\u0f41"); | ||||
|         uhelp("KA%", "\u0f40\u0f35"); | ||||
|         uhelp("KAo", "\u0f40[#ERROR CONVERTING ACIP DOCUMENT: This converter cannot yet convert o because the converter's author is unclear what the result should be.]"); | ||||
|         uhelp("KAx", "\u0f40[#ERROR CONVERTING ACIP DOCUMENT: This converter cannot yet convert x because the converter's author is unclear what the result should be.]"); | ||||
|         uhelp("G+DHA", "\u0f42\u0fa2"); | ||||
|         uhelp("P'EE", "\u0f54\u0f71\u0f7b"); | ||||
| 
 | ||||
|  | @ -7284,13 +7294,11 @@ tstHelper("ZUR"); | |||
| 
 | ||||
|         uhelp("K'A:", "\u0f40\u0f71\u0f7f"); | ||||
| 
 | ||||
|         // DLC FIXME: in ACIP RTF files, (PARENTHESES) seem to make | ||||
|         // text go from 24-point to 18-point.  Thus, ACIP->Unicode.txt | ||||
|         // is fundamentally flawed, whereas ACIP->Unicode.rtf is OK. | ||||
| 
 | ||||
|         uhelp("/NY'EE/", "\u0f3C\u0f49\u0F71\u0F7B\u0f3D"); | ||||
|         uhelp("*#HUm: G+DHOO GRO`;.,", "\u0f04\u0f05\u0f04\u0f05\u0f05\u0f67\u0f74\u0f7e\u0f7f\u0f0b\u0f42\u0fa2\u0f7d\u0f0b\u0f42\u0fb2\u0f7c\u0f08\u0f11\u0f0c\u0f0d"); | ||||
|         uhelp("*#HUm: K+DHA GRO`;.,", "none"); | ||||
|         uhelp("*#HUm: G+DHOO GRO`;.,", | ||||
|               "\u0f04\u0f05\u0f04\u0f05\u0f05\u0f67\u0f74\u0f7e\u0f7f\u0f0b\u0f42\u0fa2\u0f7d\u0f0b\u0f42\u0fb2\u0f7c\u0f08\u0f11\u0f0c\u0f0d"); | ||||
|         uhelp("*#HUm: K+DHA GRO`;.,", | ||||
|               "\u0f04\u0f05\u0f04\u0f05\u0f05\u0f67\u0f74\u0f7e\u0f7f\u0f0b[#ERROR CONVERTING ACIP DOCUMENT: THE TSHEG BAR (\"SYLLABLE\") K+DHA IS ESSENTIALLY NOTHING.]\u0f0b\u0f42\u0fb2\u0f7c\u0f08\u0f11\u0f0c\u0f0d"); | ||||
|     } | ||||
| 
 | ||||
|     /** Tests some more tsheg bars, these from Dr. Lacey's critical | ||||
|  | @ -8861,6 +8869,9 @@ tstHelper("shKA"); | |||
| } | ||||
| /* DLC FIXME: add test cases: from R0021F.ACE: ambiguous Tibetan/Sanskrit: | ||||
| 
 | ||||
| DLC NOW: warn, in "All" mode, about each occurrence of BD, DB, DG, | ||||
| DGR, DGY, DM, GD, GN, MN (but not B+D etc.) | ||||
| 
 | ||||
| BDA' | ||||
| B+DA | ||||
| DBANG | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue