Added Lucene and Solr libraries, initial Tibetan language processing code, and a new build file
This commit is contained in:
parent
3cd1f09087
commit
030f279e28
14 changed files with 282 additions and 0 deletions
BIN
extensions/apache/apache-solr.jar
Normal file
BIN
extensions/apache/apache-solr.jar
Normal file
Binary file not shown.
BIN
extensions/apache/apache-solr.war
Normal file
BIN
extensions/apache/apache-solr.war
Normal file
Binary file not shown.
BIN
extensions/apache/lucene-core.jar
Normal file
BIN
extensions/apache/lucene-core.jar
Normal file
Binary file not shown.
BIN
extensions/apache/lucene-highlighter.jar
Normal file
BIN
extensions/apache/lucene-highlighter.jar
Normal file
Binary file not shown.
BIN
extensions/apache/lucene-snowball.jar
Normal file
BIN
extensions/apache/lucene-snowball.jar
Normal file
Binary file not shown.
BIN
extensions/apache/servlet-api.jar
Normal file
BIN
extensions/apache/servlet-api.jar
Normal file
Binary file not shown.
BIN
extensions/apache/xpp3.jar
Normal file
BIN
extensions/apache/xpp3.jar
Normal file
Binary file not shown.
27
lucene-thdl-build.xml
Normal file
27
lucene-thdl-build.xml
Normal file
|
@ -0,0 +1,27 @@
|
|||
<?xml version="1.0" encoding="utf-8"?>

<!-- Ant build for the THDL concordancer: compiles and jars the
     org.thdl.lucene sources against the Apache jars in ${ext}/apache. -->
<project name="thdl-concordancer" default="index-for-solr" basedir=".">

    <import file="build.xml"/>

    <property name="lucene-thdl.bin" location="${bin}/lucene-thdl"/>

    <!-- every Apache jar shipped under ${ext}/apache -->
    <path id="lucene.classpath">
        <fileset id="lucene.extensions" dir="${ext}/apache">
            <include name="*.jar"/>
        </fileset>
    </path>

    <!-- concordance program -->
    <target name="lucene-thdl-compile" depends="init">
        <mkdir dir="${lucene-thdl.bin}"/>
        <javac srcdir="${source}" destdir="${lucene-thdl.bin}" includes="org/thdl/lucene/**.java" debug="on">
            <classpath refid="lucene.classpath"/>
        </javac>
    </target>

    <target name="lucene-thdl-jar" depends="lucene-thdl-compile">
        <jar destfile="${vanillalib}/lucene-thdl.jar" basedir="${lucene-thdl.bin}/"/>
    </target>

</project>
|
56
source/org/thdl/lucene/EdgeTshegTrimmer.java
Normal file
56
source/org/thdl/lucene/EdgeTshegTrimmer.java
Normal file
|
@ -0,0 +1,56 @@
|
|||
package org.thdl.lucene;
|
||||
|
||||
import org.apache.lucene.analysis.*;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import java.io.*;
|
||||
import java.net.*;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Trims excess tshegs and other punctuation from Tibetan
|
||||
* words, leaving them in their proper citation form.
|
||||
*
|
||||
* @author Edward Garrett
|
||||
*/
|
||||
public class EdgeTshegTrimmer extends TokenFilter {
|
||||
public EdgeTshegTrimmer(TokenStream input) {
|
||||
super(input);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return next token in TokenStream, stripped of superfluous
|
||||
* tshegs
|
||||
*/
|
||||
public Token next() throws IOException {
|
||||
while (true) {
|
||||
Token token = input.next();
|
||||
if (token == null)
|
||||
return null;
|
||||
int length=token.termText().length();
|
||||
int start=0;
|
||||
while (start<length && !TshegBarTokenizer.isPartOfTshegBar(token.termText().charAt(start))) start++;
|
||||
int end=length-1;
|
||||
while (end>-1 && !TshegBarTokenizer.isPartOfTshegBar(token.termText().charAt(end))) end--;
|
||||
if (start<=end) {
|
||||
return new Token(addFinalTshegIfNecessary(token.termText().substring(start,end+1)), token.startOffset(), token.endOffset());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds a tsheg to a <code>String</code> that doesn't
|
||||
* already end in one.
|
||||
*
|
||||
* @return original <code>String</code> with final tsheg
|
||||
* added if necessary
|
||||
*/
|
||||
public static String addFinalTshegIfNecessary(String s) {
|
||||
if (s.charAt(s.length()-1) == '\u0F0B')
|
||||
return s;
|
||||
else
|
||||
return s += "\u0F0B";
|
||||
// if (last == '\u0F42' || last == '\u0F0B')
|
||||
// return s;
|
||||
// else
|
||||
}
|
||||
}
|
12
source/org/thdl/lucene/EdgeTshegTrimmerFactory.java
Normal file
12
source/org/thdl/lucene/EdgeTshegTrimmerFactory.java
Normal file
|
@ -0,0 +1,12 @@
|
|||
package org.thdl.lucene;
|
||||
|
||||
import org.apache.lucene.analysis.*;
|
||||
import org.apache.solr.analysis.*;
|
||||
import java.io.*;
|
||||
|
||||
public class EdgeTshegTrimmerFactory extends BaseTokenFilterFactory {
|
||||
public TokenStream create(TokenStream input) {
|
||||
return new EdgeTshegTrimmer(input);
|
||||
}
|
||||
}
|
||||
|
32
source/org/thdl/lucene/NumberPadder.java
Normal file
32
source/org/thdl/lucene/NumberPadder.java
Normal file
|
@ -0,0 +1,32 @@
|
|||
package org.thdl.lucene;
|
||||
|
||||
import org.apache.lucene.analysis.*;
|
||||
import java.text.DecimalFormat;
|
||||
import java.io.*;
|
||||
|
||||
public class NumberPadder extends TokenFilter {
|
||||
public static final String NUMBER_TYPE = "Number";
|
||||
private static final DecimalFormat formatter = new DecimalFormat("0000000000");
|
||||
|
||||
public static String pad(int n) {
|
||||
return formatter.format(n);
|
||||
}
|
||||
|
||||
public NumberPadder(TokenStream input) {
|
||||
super(input);
|
||||
}
|
||||
|
||||
public Token next() throws IOException {
|
||||
Token token = input.next();
|
||||
if (token == null)
|
||||
return null;
|
||||
try {
|
||||
int i = Integer.parseInt(token.termText());
|
||||
Token replace = new Token(pad(i), token.startOffset(), token.endOffset(), NUMBER_TYPE);
|
||||
replace.setPositionIncrement(token.getPositionIncrement());
|
||||
return replace;
|
||||
} catch (NumberFormatException nfe) {
|
||||
return token;
|
||||
}
|
||||
}
|
||||
}
|
10
source/org/thdl/lucene/NumberPadderFactory.java
Normal file
10
source/org/thdl/lucene/NumberPadderFactory.java
Normal file
|
@ -0,0 +1,10 @@
|
|||
package org.thdl.lucene;
|
||||
|
||||
import org.apache.lucene.analysis.*;
|
||||
import org.apache.solr.analysis.*;
|
||||
|
||||
public class NumberPadderFactory extends BaseTokenFilterFactory {
|
||||
public TokenStream create(TokenStream input) {
|
||||
return new NumberPadder(input);
|
||||
}
|
||||
}
|
131
source/org/thdl/lucene/TshegBarTokenizer.java
Normal file
131
source/org/thdl/lucene/TshegBarTokenizer.java
Normal file
|
@ -0,0 +1,131 @@
|
|||
package org.thdl.lucene;
|
||||
|
||||
import org.apache.lucene.analysis.*;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.thdl.tib.scanner.*;
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Takes stream of Unicode Tibetan text and tokenizes it
|
||||
* into "syllables" or tsheg bars. Note that this is not
|
||||
* equivalent to tokenizing into "words" since words frequently
|
||||
* consist of more than one tsheg bar.
|
||||
* <p>
|
||||
* Non-Tibetan text and Tibetan punctuation is ignored by this
|
||||
* class.
|
||||
*
|
||||
* @author Edward Garrett
|
||||
*/
|
||||
public class TshegBarTokenizer extends Tokenizer {
|
||||
public int offset = 0;
|
||||
|
||||
public static void main(String[] args) {
|
||||
try {
|
||||
Writer out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream("out.txt"), "UTF-8"));
|
||||
TshegBarTokenizer tok = new TshegBarTokenizer(new StringReader(args[0]));
|
||||
Token next = tok.next();
|
||||
while (next != null) {
|
||||
out.write(next.termText() + "\n");
|
||||
next = tok.next();
|
||||
}
|
||||
out.flush();
|
||||
out.close();
|
||||
} catch (IOException ioe) {
|
||||
ioe.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructs a <code>TshegBarTokenizer</code> from a <code>Reader</code>
|
||||
*/
|
||||
public TshegBarTokenizer(Reader in) {
|
||||
super(in);
|
||||
}
|
||||
|
||||
/**
|
||||
* Processes a stream of Tibetan text into a sequence of tsheg bars.
|
||||
* Note that tsheg bars are returned in citation (dictionary) form,
|
||||
* meaning in the normal case that final tshegs are added if not already
|
||||
* present.
|
||||
*
|
||||
* @return the next tsheg bar in the stream
|
||||
*/
|
||||
public Token next() throws IOException {
|
||||
int c;
|
||||
do {
|
||||
c = input.read();
|
||||
offset++;
|
||||
} while (c!=-1 && !isPartOfTshegBar((char)c));
|
||||
if (c==-1) return null; //reached end of stream without finding token
|
||||
|
||||
//otherwise, this is the start of the tsheg bar
|
||||
int start = offset-1;
|
||||
StringBuffer buffy = new StringBuffer();
|
||||
buffy.append((char)c);
|
||||
|
||||
c = input.read();
|
||||
offset++;
|
||||
while (c!=-1 && isPartOfTshegBar((char)c)) {
|
||||
buffy.append((char)c);
|
||||
c = input.read();
|
||||
offset++;
|
||||
}
|
||||
buffy.append('\u0F0B'); //add tsheg to end of token
|
||||
|
||||
String token = buffy.toString();
|
||||
if (c == '\u0F0B') {
|
||||
return new Token(token.toString(), start, offset, "?"); //include tsheg for purposes of highlighting
|
||||
} else {
|
||||
return new Token(token.toString(), start, offset-1, "?"); //type "?" means not yet tagged
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Determines whether or not passed character belongs to the "inner" (contentful)
|
||||
* part of a Tibetan tsheg bar.
|
||||
*
|
||||
* @return <code>true</code> if <code>c</code> is both Tibetan and
|
||||
* not a <code>Character.NON_SPACING_MARK</code>; <code>false</code>
|
||||
* otherwise
|
||||
*/
|
||||
public static boolean isPartOfTshegBar(char c) {
|
||||
return (Character.UnicodeBlock.of(c) == Character.UnicodeBlock.TIBETAN &&
|
||||
(Character.isLetterOrDigit(c) || Character.getType(c) == Character.NON_SPACING_MARK))
|
||||
? true : false;
|
||||
}
|
||||
|
||||
//returns tsheg bar if string represents single tsheg bar; otherwise returns null
|
||||
/**
|
||||
* Reduces a series of possibly multiple tsheg bars to a single tsheg bar,
|
||||
* ignoring all tsheg bars after the first.
|
||||
*
|
||||
* @return the first tsheg bar within the passed
|
||||
* <code>String</code>
|
||||
*/
|
||||
public static String getSingleTshegBar(String val) throws IOException {
|
||||
TshegBarTokenizer tokenizer = new TshegBarTokenizer(new StringReader(val));
|
||||
Token next = tokenizer.next();
|
||||
if (next!=null) {
|
||||
if (tokenizer.next()==null) {
|
||||
return next.termText();
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @return array containing tsheg bars occurring in passed <code>String</String>
|
||||
*/
|
||||
public static String[] getTshegBars(String text_bo) throws IOException {
|
||||
TshegBarTokenizer tok = new TshegBarTokenizer(new StringReader(text_bo));
|
||||
List<String> tokens = new ArrayList<String>();
|
||||
Token next = tok.next();
|
||||
while (next != null) {
|
||||
tokens.add(next.termText());
|
||||
next = tok.next();
|
||||
}
|
||||
return (String[])tokens.toArray(new String[0]);
|
||||
}
|
||||
}
|
14
source/org/thdl/lucene/TshegBarTokenizerFactory.java
Normal file
14
source/org/thdl/lucene/TshegBarTokenizerFactory.java
Normal file
|
@ -0,0 +1,14 @@
|
|||
package org.thdl.lucene;
|
||||
|
||||
import org.apache.lucene.analysis.*;
|
||||
import org.apache.solr.analysis.*;
|
||||
import java.io.*;
|
||||
|
||||
public class TshegBarTokenizerFactory extends BaseTokenizerFactory {
|
||||
public TshegBarTokenizerFactory() {
|
||||
}
|
||||
public TokenStream create(Reader input) {
|
||||
return new TshegBarTokenizer(input);
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in a new issue