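I'm now stricter about accepting alphabetic characters. The uppercase letters F, Q, and X, and most lowercase letters (a, b, c, e, ...),
do not belong in ACIP, so the scanner now rejects them; only the few lowercase letters that ACIP actually uses (such as i, o, m, and x) are still accepted. This should make it even easier to distinguish automatically between Tibetan and English texts.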
This commit is contained in:
parent
39451d8879
commit
245aac4911
2 changed files with 10 additions and 4 deletions
|
@ -845,6 +845,7 @@ public class ACIPTshegBarScanner {
|
||||||
// combining punctuation, vowels:
|
// combining punctuation, vowels:
|
||||||
|| ch == '%'
|
|| ch == '%'
|
||||||
|| ch == 'o'
|
|| ch == 'o'
|
||||||
|
|| ch == 'm'
|
||||||
|| ch == 'x'
|
|| ch == 'x'
|
||||||
|| ch == ':'
|
|| ch == ':'
|
||||||
|| ch == '^'
|
|| ch == '^'
|
||||||
|
@ -852,8 +853,13 @@ public class ACIPTshegBarScanner {
|
||||||
|
|
||||||
|| ch == '-'
|
|| ch == '-'
|
||||||
|| ch == '+'
|
|| ch == '+'
|
||||||
|
|| ((ch >= 'A' && ch <= 'Z') && ch != 'X' && ch != 'Q' && ch != 'F')
|
||||||
|| (ch >= 'A' && ch <= 'Z')
|
|| ch == 'i'
|
||||||
|| (ch >= 'a' && ch <= 'z');
|
|| ch == 't'
|
||||||
|
|| ch == 'h'
|
||||||
|
|| ch == 'd'
|
||||||
|
|| ch == 'n'
|
||||||
|
|| ch == 's'
|
||||||
|
|| ch == 'h';
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -7029,7 +7029,7 @@ tstHelper("ZUR");
|
||||||
shelp("?", "", "[QUESTION:{?}]");
|
shelp("?", "", "[QUESTION:{?}]");
|
||||||
shelp("KHAN~ BAR ", "Offset 4: Found an illegal character, ~, with ordinal 126.\n");
|
shelp("KHAN~ BAR ", "Offset 4: Found an illegal character, ~, with ordinal 126.\n");
|
||||||
shelp("[* Correction with []]",
|
shelp("[* Correction with []]",
|
||||||
"Offset 19: Found an illegal open bracket (in context, this is []]). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\nOffset 21: Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.\n");
|
"Offset 5: Found an illegal character, r, with ordinal 114.\nOffset 6: Found an illegal character, r, with ordinal 114.\nOffset 7: Found an illegal character, e, with ordinal 101.\nOffset 8: Found an illegal character, c, with ordinal 99.\nOffset 14: Found an illegal character, w, with ordinal 119.\nOffset 15: Found an illegal character, i, with ordinal 105.\nOffset 19: Found an illegal open bracket (in context, this is []]). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\nOffset 21: Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.\n");
|
||||||
|
|
||||||
// DLC FIXME: the line SDIG PA'I GROGS PO'I LAG TU SON PAR 'GYUR PA is followed by a blank line. Note that it's "PA", not "PA ", ending it. Autocorrect to the latter.
|
// DLC FIXME: the line SDIG PA'I GROGS PO'I LAG TU SON PAR 'GYUR PA is followed by a blank line. Note that it's "PA", not "PA ", ending it. Autocorrect to the latter.
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue