131 lines
4.4 KiB
Java
131 lines
4.4 KiB
Java
package org.thdl.lucene;
|
|
|
|
import org.apache.lucene.analysis.*;
|
|
import org.apache.lucene.analysis.Token;
|
|
import org.thdl.tib.scanner.*;
|
|
import java.io.*;
|
|
import java.util.*;
|
|
|
|
/**
|
|
* Takes stream of Unicode Tibetan text and tokenizes it
|
|
* into "syllables" or tsheg bars. Note that this is not
|
|
* equivalent to tokenizing into "words" since words frequently
|
|
* consist of more than one tsheg bar.
|
|
* <p>
|
|
* Non-Tibetan text and Tibetan punctuation is ignored by this
|
|
* class.
|
|
*
|
|
* @author Edward Garrett
|
|
*/
|
|
public class TshegBarTokenizer extends Tokenizer {
|
|
public int offset = 0;
|
|
|
|
public static void main(String[] args) {
|
|
try {
|
|
Writer out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream("out.txt"), "UTF-8"));
|
|
TshegBarTokenizer tok = new TshegBarTokenizer(new StringReader(args[0]));
|
|
Token next = tok.next();
|
|
while (next != null) {
|
|
out.write(next.termText() + "\n");
|
|
next = tok.next();
|
|
}
|
|
out.flush();
|
|
out.close();
|
|
} catch (IOException ioe) {
|
|
ioe.printStackTrace();
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Constructs a <code>TshegBarTokenizer</code> from a <code>Reader</code>
|
|
*/
|
|
public TshegBarTokenizer(Reader in) {
|
|
super(in);
|
|
}
|
|
|
|
/**
|
|
* Processes a stream of Tibetan text into a sequence of tsheg bars.
|
|
* Note that tsheg bars are returned in citation (dictionary) form,
|
|
* meaning in the normal case that final tshegs are added if not already
|
|
* present.
|
|
*
|
|
* @return the next tsheg bar in the stream
|
|
*/
|
|
public Token next() throws IOException {
|
|
int c;
|
|
do {
|
|
c = input.read();
|
|
offset++;
|
|
} while (c!=-1 && !isPartOfTshegBar((char)c));
|
|
if (c==-1) return null; //reached end of stream without finding token
|
|
|
|
//otherwise, this is the start of the tsheg bar
|
|
int start = offset-1;
|
|
StringBuffer buffy = new StringBuffer();
|
|
buffy.append((char)c);
|
|
|
|
c = input.read();
|
|
offset++;
|
|
while (c!=-1 && isPartOfTshegBar((char)c)) {
|
|
buffy.append((char)c);
|
|
c = input.read();
|
|
offset++;
|
|
}
|
|
buffy.append('\u0F0B'); //add tsheg to end of token
|
|
|
|
String token = buffy.toString();
|
|
if (c == '\u0F0B') {
|
|
return new Token(token.toString(), start, offset, "?"); //include tsheg for purposes of highlighting
|
|
} else {
|
|
return new Token(token.toString(), start, offset-1, "?"); //type "?" means not yet tagged
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Determines whether or not passed character belongs to the "inner" (contentful)
|
|
* part of a Tibetan tsheg bar.
|
|
*
|
|
* @return <code>true</code> if <code>c</code> is both Tibetan and
|
|
* not a <code>Character.NON_SPACING_MARK</code>; <code>false</code>
|
|
* otherwise
|
|
*/
|
|
public static boolean isPartOfTshegBar(char c) {
|
|
return (Character.UnicodeBlock.of(c) == Character.UnicodeBlock.TIBETAN &&
|
|
(Character.isLetterOrDigit(c) || Character.getType(c) == Character.NON_SPACING_MARK))
|
|
? true : false;
|
|
}
|
|
|
|
//returns tsheg bar if string represents single tsheg bar; otherwise returns null
|
|
/**
|
|
* Reduces a series of possibly multiple tsheg bars to a single tsheg bar,
|
|
* ignoring all tsheg bars after the first.
|
|
*
|
|
* @return the first tsheg bar within the passed
|
|
* <code>String</code>
|
|
*/
|
|
public static String getSingleTshegBar(String val) throws IOException {
|
|
TshegBarTokenizer tokenizer = new TshegBarTokenizer(new StringReader(val));
|
|
Token next = tokenizer.next();
|
|
if (next!=null) {
|
|
if (tokenizer.next()==null) {
|
|
return next.termText();
|
|
}
|
|
}
|
|
return null;
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @return array containing tsheg bars occurring in passed <code>String</String>
|
|
*/
|
|
public static String[] getTshegBars(String text_bo) throws IOException {
|
|
TshegBarTokenizer tok = new TshegBarTokenizer(new StringReader(text_bo));
|
|
List<String> tokens = new ArrayList<String>();
|
|
Token next = tok.next();
|
|
while (next != null) {
|
|
tokens.add(next.termText());
|
|
next = tok.next();
|
|
}
|
|
return (String[])tokens.toArray(new String[0]);
|
|
}
|
|
}
|