Added Lucene and Solr libraries, initial Tibetan language processing code, and a new build file
This commit is contained in:
parent
3cd1f09087
commit
030f279e28
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -0,0 +1,27 @@
|
|||
<?xml version="1.0" encoding="utf-8"?>
<project name="thdl-concordancer" default="index-for-solr" basedir=".">

    <!-- Reuse shared properties and targets from the main build file. -->
    <import file="build.xml"/>

    <!-- Output directory for compiled Lucene/THDL classes. -->
    <property name="lucene-thdl.bin" location="${bin}/lucene-thdl"/>

    <!-- All Apache (Lucene/Solr) jars bundled under ${ext}/apache. -->
    <path id="lucene.classpath">
        <fileset id="lucene.extensions" dir="${ext}/apache">
            <include name="*.jar"/>
        </fileset>
    </path>

    <!-- concordance program -->
    <target name="lucene-thdl-compile" depends="init">
        <mkdir dir="${lucene-thdl.bin}"/>
        <javac srcdir="${source}" destdir="${lucene-thdl.bin}" includes="org/thdl/lucene/**.java" debug="on">
            <classpath refid="lucene.classpath"/>
        </javac>
    </target>

    <!-- Package the compiled classes into a distributable jar. -->
    <target name="lucene-thdl-jar" depends="lucene-thdl-compile">
        <jar destfile="${vanillalib}/lucene-thdl.jar" basedir="${lucene-thdl.bin}/"/>
    </target>

</project>
|
|
@ -0,0 +1,56 @@
|
|||
package org.thdl.lucene;
|
||||
|
||||
import org.apache.lucene.analysis.*;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import java.io.*;
|
||||
import java.net.*;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Trims excess tshegs and other punctuation from Tibetan
|
||||
* words, leaving them in their proper citation form.
|
||||
*
|
||||
* @author Edward Garrett
|
||||
*/
|
||||
public class EdgeTshegTrimmer extends TokenFilter {
|
||||
public EdgeTshegTrimmer(TokenStream input) {
|
||||
super(input);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return next token in TokenStream, stripped of superfluous
|
||||
* tshegs
|
||||
*/
|
||||
public Token next() throws IOException {
|
||||
while (true) {
|
||||
Token token = input.next();
|
||||
if (token == null)
|
||||
return null;
|
||||
int length=token.termText().length();
|
||||
int start=0;
|
||||
while (start<length && !TshegBarTokenizer.isPartOfTshegBar(token.termText().charAt(start))) start++;
|
||||
int end=length-1;
|
||||
while (end>-1 && !TshegBarTokenizer.isPartOfTshegBar(token.termText().charAt(end))) end--;
|
||||
if (start<=end) {
|
||||
return new Token(addFinalTshegIfNecessary(token.termText().substring(start,end+1)), token.startOffset(), token.endOffset());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds a tsheg to a <code>String</code> that doesn't
|
||||
* already end in one.
|
||||
*
|
||||
* @return original <code>String</code> with final tsheg
|
||||
* added if necessary
|
||||
*/
|
||||
public static String addFinalTshegIfNecessary(String s) {
|
||||
if (s.charAt(s.length()-1) == '\u0F0B')
|
||||
return s;
|
||||
else
|
||||
return s += "\u0F0B";
|
||||
// if (last == '\u0F42' || last == '\u0F0B')
|
||||
// return s;
|
||||
// else
|
||||
}
|
||||
}
|
|
@ -0,0 +1,12 @@
|
|||
package org.thdl.lucene;
|
||||
|
||||
import org.apache.lucene.analysis.*;
|
||||
import org.apache.solr.analysis.*;
|
||||
import java.io.*;
|
||||
|
||||
public class EdgeTshegTrimmerFactory extends BaseTokenFilterFactory {
|
||||
public TokenStream create(TokenStream input) {
|
||||
return new EdgeTshegTrimmer(input);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,32 @@
|
|||
package org.thdl.lucene;
|
||||
|
||||
import org.apache.lucene.analysis.*;
|
||||
import java.text.DecimalFormat;
|
||||
import java.io.*;
|
||||
|
||||
public class NumberPadder extends TokenFilter {
|
||||
public static final String NUMBER_TYPE = "Number";
|
||||
private static final DecimalFormat formatter = new DecimalFormat("0000000000");
|
||||
|
||||
public static String pad(int n) {
|
||||
return formatter.format(n);
|
||||
}
|
||||
|
||||
public NumberPadder(TokenStream input) {
|
||||
super(input);
|
||||
}
|
||||
|
||||
public Token next() throws IOException {
|
||||
Token token = input.next();
|
||||
if (token == null)
|
||||
return null;
|
||||
try {
|
||||
int i = Integer.parseInt(token.termText());
|
||||
Token replace = new Token(pad(i), token.startOffset(), token.endOffset(), NUMBER_TYPE);
|
||||
replace.setPositionIncrement(token.getPositionIncrement());
|
||||
return replace;
|
||||
} catch (NumberFormatException nfe) {
|
||||
return token;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,10 @@
|
|||
package org.thdl.lucene;
|
||||
|
||||
import org.apache.lucene.analysis.*;
|
||||
import org.apache.solr.analysis.*;
|
||||
|
||||
public class NumberPadderFactory extends BaseTokenFilterFactory {
|
||||
public TokenStream create(TokenStream input) {
|
||||
return new NumberPadder(input);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,131 @@
|
|||
package org.thdl.lucene;
|
||||
|
||||
import org.apache.lucene.analysis.*;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.thdl.tib.scanner.*;
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Takes stream of Unicode Tibetan text and tokenizes it
|
||||
* into "syllables" or tsheg bars. Note that this is not
|
||||
* equivalent to tokenizing into "words" since words frequently
|
||||
* consist of more than one tsheg bar.
|
||||
* <p>
|
||||
* Non-Tibetan text and Tibetan punctuation is ignored by this
|
||||
* class.
|
||||
*
|
||||
* @author Edward Garrett
|
||||
*/
|
||||
public class TshegBarTokenizer extends Tokenizer {
|
||||
public int offset = 0;
|
||||
|
||||
public static void main(String[] args) {
|
||||
try {
|
||||
Writer out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream("out.txt"), "UTF-8"));
|
||||
TshegBarTokenizer tok = new TshegBarTokenizer(new StringReader(args[0]));
|
||||
Token next = tok.next();
|
||||
while (next != null) {
|
||||
out.write(next.termText() + "\n");
|
||||
next = tok.next();
|
||||
}
|
||||
out.flush();
|
||||
out.close();
|
||||
} catch (IOException ioe) {
|
||||
ioe.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructs a <code>TshegBarTokenizer</code> from a <code>Reader</code>
|
||||
*/
|
||||
public TshegBarTokenizer(Reader in) {
|
||||
super(in);
|
||||
}
|
||||
|
||||
/**
|
||||
* Processes a stream of Tibetan text into a sequence of tsheg bars.
|
||||
* Note that tsheg bars are returned in citation (dictionary) form,
|
||||
* meaning in the normal case that final tshegs are added if not already
|
||||
* present.
|
||||
*
|
||||
* @return the next tsheg bar in the stream
|
||||
*/
|
||||
public Token next() throws IOException {
|
||||
int c;
|
||||
do {
|
||||
c = input.read();
|
||||
offset++;
|
||||
} while (c!=-1 && !isPartOfTshegBar((char)c));
|
||||
if (c==-1) return null; //reached end of stream without finding token
|
||||
|
||||
//otherwise, this is the start of the tsheg bar
|
||||
int start = offset-1;
|
||||
StringBuffer buffy = new StringBuffer();
|
||||
buffy.append((char)c);
|
||||
|
||||
c = input.read();
|
||||
offset++;
|
||||
while (c!=-1 && isPartOfTshegBar((char)c)) {
|
||||
buffy.append((char)c);
|
||||
c = input.read();
|
||||
offset++;
|
||||
}
|
||||
buffy.append('\u0F0B'); //add tsheg to end of token
|
||||
|
||||
String token = buffy.toString();
|
||||
if (c == '\u0F0B') {
|
||||
return new Token(token.toString(), start, offset, "?"); //include tsheg for purposes of highlighting
|
||||
} else {
|
||||
return new Token(token.toString(), start, offset-1, "?"); //type "?" means not yet tagged
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Determines whether or not passed character belongs to the "inner" (contentful)
|
||||
* part of a Tibetan tsheg bar.
|
||||
*
|
||||
* @return <code>true</code> if <code>c</code> is both Tibetan and
|
||||
* not a <code>Character.NON_SPACING_MARK</code>; <code>false</code>
|
||||
* otherwise
|
||||
*/
|
||||
public static boolean isPartOfTshegBar(char c) {
|
||||
return (Character.UnicodeBlock.of(c) == Character.UnicodeBlock.TIBETAN &&
|
||||
(Character.isLetterOrDigit(c) || Character.getType(c) == Character.NON_SPACING_MARK))
|
||||
? true : false;
|
||||
}
|
||||
|
||||
//returns tsheg bar if string represents single tsheg bar; otherwise returns null
|
||||
/**
|
||||
* Reduces a series of possibly multiple tsheg bars to a single tsheg bar,
|
||||
* ignoring all tsheg bars after the first.
|
||||
*
|
||||
* @return the first tsheg bar within the passed
|
||||
* <code>String</code>
|
||||
*/
|
||||
public static String getSingleTshegBar(String val) throws IOException {
|
||||
TshegBarTokenizer tokenizer = new TshegBarTokenizer(new StringReader(val));
|
||||
Token next = tokenizer.next();
|
||||
if (next!=null) {
|
||||
if (tokenizer.next()==null) {
|
||||
return next.termText();
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @return array containing tsheg bars occurring in passed <code>String</String>
|
||||
*/
|
||||
public static String[] getTshegBars(String text_bo) throws IOException {
|
||||
TshegBarTokenizer tok = new TshegBarTokenizer(new StringReader(text_bo));
|
||||
List<String> tokens = new ArrayList<String>();
|
||||
Token next = tok.next();
|
||||
while (next != null) {
|
||||
tokens.add(next.termText());
|
||||
next = tok.next();
|
||||
}
|
||||
return (String[])tokens.toArray(new String[0]);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,14 @@
|
|||
package org.thdl.lucene;
|
||||
|
||||
import org.apache.lucene.analysis.*;
|
||||
import org.apache.solr.analysis.*;
|
||||
import java.io.*;
|
||||
|
||||
public class TshegBarTokenizerFactory extends BaseTokenizerFactory {
|
||||
public TshegBarTokenizerFactory() {
|
||||
}
|
||||
public TokenStream create(Reader input) {
|
||||
return new TshegBarTokenizer(input);
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue