package org.thdl.lucene; import org.apache.lucene.analysis.*; import org.apache.lucene.analysis.Token; import org.thdl.tib.scanner.*; import java.io.*; import java.util.*; /** * Takes stream of Unicode Tibetan text and tokenizes it * into "syllables" or tsheg bars. Note that this is not * equivalent to tokenizing into "words" since words frequently * consist of more than one tsheg bar. *

* Non-Tibetan text and Tibetan punctuation is ignored by this * class. * * @author Edward Garrett */ public class TshegBarTokenizer extends Tokenizer { public int offset = 0; public static void main(String[] args) { try { Writer out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream("out.txt"), "UTF-8")); TshegBarTokenizer tok = new TshegBarTokenizer(new StringReader(args[0])); Token next = tok.next(); while (next != null) { out.write(next.termText() + "\n"); next = tok.next(); } out.flush(); out.close(); } catch (IOException ioe) { ioe.printStackTrace(); } } /** * Constructs a TshegBarTokenizer from a Reader */ public TshegBarTokenizer(Reader in) { super(in); } /** * Processes a stream of Tibetan text into a sequence of tsheg bars. * Note that tsheg bars are returned in citation (dictionary) form, * meaning in the normal case that final tshegs are added if not already * present. * * @return the next tsheg bar in the stream */ public Token next() throws IOException { int c; do { c = input.read(); offset++; } while (c!=-1 && !isPartOfTshegBar((char)c)); if (c==-1) return null; //reached end of stream without finding token //otherwise, this is the start of the tsheg bar int start = offset-1; StringBuffer buffy = new StringBuffer(); buffy.append((char)c); c = input.read(); offset++; while (c!=-1 && isPartOfTshegBar((char)c)) { buffy.append((char)c); c = input.read(); offset++; } buffy.append('\u0F0B'); //add tsheg to end of token String token = buffy.toString(); if (c == '\u0F0B') { return new Token(token.toString(), start, offset, "?"); //include tsheg for purposes of highlighting } else { return new Token(token.toString(), start, offset-1, "?"); //type "?" means not yet tagged } } /** * Determines whether or not passed character belongs to the "inner" (contentful) * part of a Tibetan tsheg bar. * * @return true if c is both Tibetan and * not a Character.NON_SPACING_MARK; false * otherwise */ public static boolean isPartOfTshegBar(char c) { return (Character.UnicodeBlock.of(c) == Character.UnicodeBlock.TIBETAN && (Character.isLetterOrDigit(c) || Character.getType(c) == Character.NON_SPACING_MARK)) ? true : false; } //returns tsheg bar if string represents single tsheg bar; otherwise returns null /** * Reduces a series of possibly multiple tsheg bars to a single tsheg bar, * ignoring all tsheg bars after the first. * * @return the first tsheg bar within the passed * String */ public static String getSingleTshegBar(String val) throws IOException { TshegBarTokenizer tokenizer = new TshegBarTokenizer(new StringReader(val)); Token next = tokenizer.next(); if (next!=null) { if (tokenizer.next()==null) { return next.termText(); } } return null; } /** * * @return array containing tsheg bars occurring in passed String */ public static String[] getTshegBars(String text_bo) throws IOException { TshegBarTokenizer tok = new TshegBarTokenizer(new StringReader(text_bo)); List tokens = new ArrayList(); Token next = tok.next(); while (next != null) { tokens.add(next.termText()); next = tok.next(); } return (String[])tokens.toArray(new String[0]); } }