/*
The contents of this file are subject to the THDL Open Community License
Version 1.0 (the "License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License on the THDL web site
(http://www.thdl.org/).

Software distributed under the License is distributed on an "AS IS" basis,
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
License for the specific terms governing rights and limitations under the
License.

The Initial Developer of this software is the Tibetan and Himalayan Digital
Library (THDL). Portions created by the THDL are Copyright 2003 THDL.
All Rights Reserved.

Contributor(s): ______________________________________.
*/
|
|
|
|
|
|
|
|
package org.thdl.tib.text.ttt;
|
|
|
|
|
2003-10-26 02:42:06 +00:00
|
|
|
import java.util.HashSet;
|
2005-06-20 06:18:00 +00:00
|
|
|
|
|
|
|
import org.thdl.tib.text.tshegbar.UnicodeUtils;
|
|
|
|
import org.thdl.util.ThdlDebug;
|
|
|
|
import org.thdl.util.ThdlOptions;
|
2003-10-26 02:42:06 +00:00
|
|
|
|
2003-08-14 05:10:47 +00:00
|
|
|
/**
|
2003-10-04 01:22:59 +00:00
|
|
|
* An TString is some Latin text and a type, the type stating whether
|
|
|
|
* said text is Latin (usually English) or transliteration of Tibetan,
|
|
|
|
* which transliteration system (ACIP or EWTS), and which particular
|
|
|
|
* kind. Scanning errors are also encoded as TStrings using a special
|
|
|
|
* type.
|
2003-08-14 05:10:47 +00:00
|
|
|
*
|
2004-06-06 21:39:06 +00:00
|
|
|
* <p><em>Note well</em> that when parsing ACIP, certain types of
|
|
|
|
* TStrings (corrections, comments, question, dd, bp, etc.) will not be
|
|
|
|
* encountered if {@link
|
|
|
|
* ACIPTshegBarScanner#BRACKETED_SECTIONS_PASS_THROUGH_UNMODIFIED} is
|
|
|
|
* true.</p>
|
|
|
|
*
|
2003-10-04 01:22:59 +00:00
|
|
|
* @author David Chandler */
|
|
|
|
public class TString {
|
2003-08-14 05:10:47 +00:00
|
|
|
private int type;
|
|
|
|
private String text;
|
2004-01-17 19:28:54 +00:00
|
|
|
// "EWTS" or "ACIP", interned (for quick, '==' equality checking:
|
|
|
|
private String encoding;
|
|
|
|
|
|
|
|
/** Returns "EWTS" if this TString is encoded in EWTS, or,
|
|
|
|
otherwise, "ACIP" if this TString is encoded in ACIP. Returns
|
|
|
|
an interned string for quick equality checking via the
|
|
|
|
<code>==</code> operator. */
|
|
|
|
public String getEncoding() {
|
|
|
|
return encoding;
|
|
|
|
}
|
|
|
|
|
2005-02-21 01:16:10 +00:00
|
|
|
/** Returns true if and only if a TString with type <i>type</i>
|
2003-11-29 22:56:18 +00:00
|
|
|
* is to be converted to something other than Tibetan text.
|
|
|
|
* (Chinese Unicode, Latin, etc. all qualify as non-Tibetan.) */
|
|
|
|
public boolean isLatin() {
|
2003-12-08 07:15:27 +00:00
|
|
|
char ch;
|
2003-08-18 02:38:54 +00:00
|
|
|
return (type != TIBETAN_NON_PUNCTUATION
|
|
|
|
&& type != TIBETAN_PUNCTUATION
|
2003-09-07 18:30:59 +00:00
|
|
|
&& type != TSHEG_BAR_ADORNMENT
|
|
|
|
&& type != START_PAREN
|
|
|
|
&& type != END_PAREN
|
2003-08-18 02:38:54 +00:00
|
|
|
&& type != START_SLASH
|
2003-11-29 22:56:18 +00:00
|
|
|
&& type != END_SLASH
|
|
|
|
&& (type != UNICODE_CHARACTER
|
2003-12-08 07:15:27 +00:00
|
|
|
|| !(UnicodeUtils.isInTibetanRange(ch = getText().charAt(0))
|
|
|
|
// EWTS maps some TMW glyphs to this Unicode
|
|
|
|
// private-use area (PUA):
|
|
|
|
|| (ch >= '\uF021' && ch <= '\uF0FF'))));
|
2003-08-18 02:38:54 +00:00
|
|
|
}
|
|
|
|
|
2003-11-09 01:07:45 +00:00
|
|
|
/** For ACIP [#COMMENTS] and EWTS (DLC FIXME: what are EWTS comments?) */
|
2003-08-14 05:10:47 +00:00
|
|
|
public static final int COMMENT = 0;
|
2003-10-04 01:22:59 +00:00
|
|
|
/** For Folio markers like @012B in ACIP */
|
2003-08-14 05:10:47 +00:00
|
|
|
public static final int FOLIO_MARKER = 1;
|
Improved the ACIP scanner (the part of the converter that says, "This
is a correction, that's a comment, this is Tibetan, that's Latin
(English), that's Tibetan inter-tsheg-bar punctuation, etc.) It now
accepts more real-world ACIP files, i.e. it handles illegal
constructs. The error checking is more user-friendly. There are now
tests.
Added some tsheg bars that Peter E. Hauer of Linguasoft sent me to the
tests. Many thanks, Peter. I still need to implement rules that say,
"This is not Tibetan, it must be Sanskrit, because that letter doesn't
take a MA prefix."
2003-08-17 01:45:55 +00:00
|
|
|
/** For Latin letters and numbers etc. [*LINE BREAK?] uses this,
|
2003-10-04 01:22:59 +00:00
|
|
|
* for example. Or in EWTS, \f uses this. */
|
Improved the ACIP scanner (the part of the converter that says, "This
is a correction, that's a comment, this is Tibetan, that's Latin
(English), that's Tibetan inter-tsheg-bar punctuation, etc.) It now
accepts more real-world ACIP files, i.e. it handles illegal
constructs. The error checking is more user-friendly. There are now
tests.
Added some tsheg bars that Peter E. Hauer of Linguasoft sent me to the
tests. Many thanks, Peter. I still need to implement rules that say,
"This is not Tibetan, it must be Sanskrit, because that letter doesn't
take a MA prefix."
2003-08-17 01:45:55 +00:00
|
|
|
public static final int LATIN = 2;
|
2003-08-14 05:10:47 +00:00
|
|
|
/** For Tibetan letters and numbers etc. */
|
Improved the ACIP scanner (the part of the converter that says, "This
is a correction, that's a comment, this is Tibetan, that's Latin
(English), that's Tibetan inter-tsheg-bar punctuation, etc.) It now
accepts more real-world ACIP files, i.e. it handles illegal
constructs. The error checking is more user-friendly. There are now
tests.
Added some tsheg bars that Peter E. Hauer of Linguasoft sent me to the
tests. Many thanks, Peter. I still need to implement rules that say,
"This is not Tibetan, it must be Sanskrit, because that letter doesn't
take a MA prefix."
2003-08-17 01:45:55 +00:00
|
|
|
public static final int TIBETAN_NON_PUNCTUATION = 3;
|
2003-08-14 05:10:47 +00:00
|
|
|
/** For tshegs, whitespace and the like, but not combining
|
2003-10-04 01:22:59 +00:00
|
|
|
* punctutation like ACIP %, o, :, m, and x */
|
Improved the ACIP scanner (the part of the converter that says, "This
is a correction, that's a comment, this is Tibetan, that's Latin
(English), that's Tibetan inter-tsheg-bar punctuation, etc.) It now
accepts more real-world ACIP files, i.e. it handles illegal
constructs. The error checking is more user-friendly. There are now
tests.
Added some tsheg bars that Peter E. Hauer of Linguasoft sent me to the
tests. Many thanks, Peter. I still need to implement rules that say,
"This is not Tibetan, it must be Sanskrit, because that letter doesn't
take a MA prefix."
2003-08-17 01:45:55 +00:00
|
|
|
public static final int TIBETAN_PUNCTUATION = 4;
|
2003-10-04 01:22:59 +00:00
|
|
|
/** For the start of a [*probable correction] or [*possible correction?] in ACIP */
|
2003-08-14 05:10:47 +00:00
|
|
|
public static final int CORRECTION_START = 5;
|
2003-10-04 01:22:59 +00:00
|
|
|
/** Denotes the end of a [*probable correction] in ACIP */
|
2003-08-14 05:10:47 +00:00
|
|
|
public static final int PROBABLE_CORRECTION = 6;
|
2003-10-04 01:22:59 +00:00
|
|
|
/** Denotes the end of a [*possible correction?] in ACIP*/
|
2003-08-14 05:10:47 +00:00
|
|
|
public static final int POSSIBLE_CORRECTION = 7;
|
2003-10-04 01:22:59 +00:00
|
|
|
/** For [BP] -- blank page in ACIP*/
|
2003-08-14 05:10:47 +00:00
|
|
|
public static final int BP = 8;
|
2003-10-04 01:22:59 +00:00
|
|
|
/** For [LS] -- Lanycha script on page in ACIP*/
|
2003-08-14 05:10:47 +00:00
|
|
|
public static final int LS = 9;
|
2003-10-04 01:22:59 +00:00
|
|
|
/** For [DR] -- picture (without caption) on page in ACIP*/
|
2003-08-14 05:10:47 +00:00
|
|
|
public static final int DR = 10;
|
2003-10-04 01:22:59 +00:00
|
|
|
/** For [DD], [DDD], [DD1], [DD2], et cetera -- picture with caption on page in ACIP */
|
2003-08-14 05:10:47 +00:00
|
|
|
public static final int DD = 11;
|
2003-10-04 01:22:59 +00:00
|
|
|
/** For [?] in ACIP */
|
2003-08-14 05:10:47 +00:00
|
|
|
public static final int QUESTION = 12;
|
2003-10-04 01:22:59 +00:00
|
|
|
/** For the first / in /NYA/ in ACIP */
|
2003-08-14 05:10:47 +00:00
|
|
|
public static final int START_SLASH = 13;
|
2003-10-04 01:22:59 +00:00
|
|
|
/** For the last / in /NYA/ in ACIP */
|
2003-08-14 05:10:47 +00:00
|
|
|
public static final int END_SLASH = 14;
|
2003-10-04 01:22:59 +00:00
|
|
|
/** For the opening ( in (NYA) in ACIP */
|
2003-08-14 05:10:47 +00:00
|
|
|
public static final int START_PAREN = 15;
|
2003-10-04 01:22:59 +00:00
|
|
|
/** For the closing ) in (NYA) in ACIP */
|
2003-08-14 05:10:47 +00:00
|
|
|
public static final int END_PAREN = 16;
|
2003-08-24 06:40:53 +00:00
|
|
|
/** For things that may not be legal syntax, such as {KA . KHA} */
|
|
|
|
public static final int WARNING = 17;
|
2003-11-09 01:07:45 +00:00
|
|
|
/** For ACIP %, o, and x or EWTS (DLC FIXME: what are EWTS adornments?) */
|
2003-09-07 16:19:50 +00:00
|
|
|
public static final int TSHEG_BAR_ADORNMENT = 18;
|
2003-11-29 22:56:18 +00:00
|
|
|
/** For "\\uMNOP", this TString will contain the string that has
|
|
|
|
just the sole character "\\uMNOP". */
|
|
|
|
public static final int UNICODE_CHARACTER = 19;
|
2003-08-14 05:10:47 +00:00
|
|
|
/** For things that are not legal syntax, such as a file that
|
2003-11-29 22:56:18 +00:00
|
|
|
* contains just "[# HALF A COMMEN". THIS MUST COME LAST. */
|
|
|
|
public static final int ERROR = 20;
|
2003-08-14 05:10:47 +00:00
|
|
|
|
2003-11-29 22:56:18 +00:00
|
|
|
/** Returns the type of this string, which is one of the
|
|
|
|
enumerated integer static final members of this class. */
|
2003-08-14 05:10:47 +00:00
|
|
|
public int getType() {
|
|
|
|
return type;
|
|
|
|
}
|
|
|
|
|
|
|
|
/** Returns the non-null, non-empty String of text associated with
|
|
|
|
* this string. */
|
|
|
|
public String getText() {
|
|
|
|
return text;
|
|
|
|
}
|
|
|
|
|
|
|
|
private void setType(int t) {
|
|
|
|
if (t < COMMENT || t > ERROR)
|
|
|
|
throw new IllegalArgumentException("Bad type");
|
|
|
|
type = t;
|
|
|
|
}
|
|
|
|
|
|
|
|
private void setText(String t) {
|
|
|
|
if (t == null || "".equals(t))
|
|
|
|
throw new IllegalArgumentException("null or empty text, DD should have text [DD] e.g.");
|
|
|
|
text = t;
|
|
|
|
}
|
|
|
|
|
2003-10-26 02:17:19 +00:00
|
|
|
/** Don't instantiate using this constructor. */
|
2003-10-04 01:22:59 +00:00
|
|
|
private TString() { }
|
2003-08-14 05:10:47 +00:00
|
|
|
|
2004-01-17 19:28:54 +00:00
|
|
|
/** Creates a new TString with source text <i>text</i>, encoded
|
|
|
|
* using the Roman transliteration system specified by
|
2004-04-17 15:48:50 +00:00
|
|
|
* <i>encoding</i> (see {@link #getEncoding()}) and type
|
2003-09-10 01:19:05 +00:00
|
|
|
* <i>type</i> being a characterization like {@link #DD}. */
|
2004-01-17 19:28:54 +00:00
|
|
|
public TString(String encoding, String text, int type) {
|
|
|
|
this.encoding = encoding;
|
2003-08-14 05:10:47 +00:00
|
|
|
setType(type);
|
2003-10-26 02:42:06 +00:00
|
|
|
String ftext = (TIBETAN_NON_PUNCTUATION == type)
|
|
|
|
? MidLexSubstitution.getFinalValueForTibetanNonPunctuationToken(text)
|
|
|
|
: text;
|
2004-01-17 19:28:54 +00:00
|
|
|
// FIXME: assert these
|
2003-11-29 22:56:18 +00:00
|
|
|
ThdlDebug.verify(type != UNICODE_CHARACTER || text.length() == 1);
|
2004-01-17 19:28:54 +00:00
|
|
|
ThdlDebug.verify("EWTS" == encoding || "ACIP" == encoding);
|
2003-10-26 02:42:06 +00:00
|
|
|
setText(ftext);
|
|
|
|
if ((outputAllTshegBars || outputUniqueTshegBars) && TIBETAN_NON_PUNCTUATION == type)
|
|
|
|
outputTshegBar(ftext);
|
2003-08-14 05:10:47 +00:00
|
|
|
}
|
2003-10-26 02:42:06 +00:00
|
|
|
|
2003-10-26 06:02:48 +00:00
|
|
|
/** Prints x to standard error if and only if we have never
|
2003-10-26 02:42:06 +00:00
|
|
|
encountered x before. */
|
|
|
|
private static void outputTshegBar(String x) {
|
|
|
|
if (outputAllTshegBars) {
|
2003-10-26 06:02:48 +00:00
|
|
|
System.err.println(outputTshegBarsPrefix + x);
|
2003-10-26 02:42:06 +00:00
|
|
|
} else if (outputUniqueTshegBars) {
|
|
|
|
if (!tshegBars.contains(x)) {
|
|
|
|
tshegBars.add(x);
|
2003-10-26 06:02:48 +00:00
|
|
|
System.err.println(outputTshegBarsPrefix + x);
|
2003-10-26 02:42:06 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2003-11-09 01:07:45 +00:00
|
|
|
/** For generating frequency info: */
|
2003-10-26 02:42:06 +00:00
|
|
|
private static boolean outputAllTshegBars
|
2003-11-09 01:07:45 +00:00
|
|
|
= ThdlOptions.getBooleanOption("org.thdl.tib.text.ttt.OutputAllTshegBars");
|
2003-10-26 02:42:06 +00:00
|
|
|
|
2003-11-09 01:07:45 +00:00
|
|
|
/** For generating info about which tsheg bars were converted, but
|
|
|
|
not how many times: */
|
2003-10-26 02:42:06 +00:00
|
|
|
private static boolean outputUniqueTshegBars
|
2003-11-09 01:07:45 +00:00
|
|
|
= ThdlOptions.getBooleanOption("org.thdl.tib.text.ttt.OutputUniqueTshegBars");
|
2003-10-26 02:42:06 +00:00
|
|
|
|
2003-11-09 01:07:45 +00:00
|
|
|
/** Affects what appears on the console when either {@link
|
|
|
|
#outputUniqueTshegBars} or {@link #outputAllTshegBars} is in
|
|
|
|
use. */
|
2003-10-26 02:42:06 +00:00
|
|
|
private static String outputTshegBarsPrefix
|
2003-11-09 01:07:45 +00:00
|
|
|
= ThdlOptions.getStringOption("org.thdl.tib.text.ttt.PrefixForOutputTshegBars", "");
|
2003-10-26 02:42:06 +00:00
|
|
|
|
|
|
|
private static final HashSet tshegBars = new HashSet();
|
|
|
|
|
2003-08-14 05:10:47 +00:00
|
|
|
public String toString() {
|
|
|
|
String typeString = "HUH?????";
|
|
|
|
if (type == COMMENT) typeString = "COMMENT";
|
|
|
|
if (type == FOLIO_MARKER) typeString = "FOLIO_MARKER";
|
Improved the ACIP scanner (the part of the converter that says, "This
is a correction, that's a comment, this is Tibetan, that's Latin
(English), that's Tibetan inter-tsheg-bar punctuation, etc.) It now
accepts more real-world ACIP files, i.e. it handles illegal
constructs. The error checking is more user-friendly. There are now
tests.
Added some tsheg bars that Peter E. Hauer of Linguasoft sent me to the
tests. Many thanks, Peter. I still need to implement rules that say,
"This is not Tibetan, it must be Sanskrit, because that letter doesn't
take a MA prefix."
2003-08-17 01:45:55 +00:00
|
|
|
if (type == LATIN) typeString = "LATIN";
|
2003-08-14 05:10:47 +00:00
|
|
|
if (type == TIBETAN_NON_PUNCTUATION) typeString = "TIBETAN_NON_PUNCTUATION";
|
|
|
|
if (type == TIBETAN_PUNCTUATION) typeString = "TIBETAN_PUNCTUATION";
|
|
|
|
if (type == CORRECTION_START) typeString = "CORRECTION_START";
|
|
|
|
if (type == PROBABLE_CORRECTION) typeString = "PROBABLE_CORRECTION";
|
|
|
|
if (type == POSSIBLE_CORRECTION) typeString = "POSSIBLE_CORRECTION";
|
|
|
|
if (type == BP) typeString = "BP";
|
|
|
|
if (type == LS) typeString = "LS";
|
|
|
|
if (type == DR) typeString = "DR";
|
|
|
|
if (type == DD) typeString = "DD";
|
|
|
|
if (type == QUESTION) typeString = "QUESTION";
|
|
|
|
if (type == START_SLASH) typeString = "START_SLASH";
|
|
|
|
if (type == END_SLASH) typeString = "END_SLASH";
|
|
|
|
if (type == START_PAREN) typeString = "START_PAREN";
|
|
|
|
if (type == END_PAREN) typeString = "END_PAREN";
|
2003-08-24 06:40:53 +00:00
|
|
|
if (type == WARNING) typeString = "WARNING";
|
2003-09-07 16:19:50 +00:00
|
|
|
if (type == TSHEG_BAR_ADORNMENT) typeString = "TSHEG_BAR_ADORNMENT";
|
2003-11-29 22:56:18 +00:00
|
|
|
if (type == UNICODE_CHARACTER) typeString = "UNICODE_CHARACTER";
|
2003-08-14 05:10:47 +00:00
|
|
|
if (type == ERROR) typeString = "ERROR";
|
Improved the ACIP scanner (the part of the converter that says, "This
is a correction, that's a comment, this is Tibetan, that's Latin
(English), that's Tibetan inter-tsheg-bar punctuation, etc.) It now
accepts more real-world ACIP files, i.e. it handles illegal
constructs. The error checking is more user-friendly. There are now
tests.
Added some tsheg bars that Peter E. Hauer of Linguasoft sent me to the
tests. Many thanks, Peter. I still need to implement rules that say,
"This is not Tibetan, it must be Sanskrit, because that letter doesn't
take a MA prefix."
2003-08-17 01:45:55 +00:00
|
|
|
return typeString + ":{" + getText() + "}";
|
2003-08-14 05:10:47 +00:00
|
|
|
}
|
|
|
|
}
|