Added Lucene and Solr libraries, initial Tibetan language processing code, and a new build file
This commit is contained in:
parent
3cd1f09087
commit
030f279e28
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -0,0 +1,27 @@
|
|||
<?xml version="1.0" encoding="utf-8"?>
<project name="thdl-concordancer" default="index-for-solr" basedir=".">

    <!-- Reuse shared properties and targets from the main build file. -->
    <import file="build.xml"/>

    <!-- Output directory for compiled Lucene/THDL classes. -->
    <property name="lucene-thdl.bin" location="${bin}/lucene-thdl"/>

    <!-- All Apache (Lucene/Solr) jars bundled under ${ext}/apache. -->
    <path id="lucene.classpath">
        <fileset id="lucene.extensions" dir="${ext}/apache">
            <include name="*.jar"/>
        </fileset>
    </path>

    <!-- concordance program -->
    <target name="lucene-thdl-compile" depends="init">
        <mkdir dir="${lucene-thdl.bin}"/>
        <javac srcdir="${source}" destdir="${lucene-thdl.bin}" includes="org/thdl/lucene/**.java" debug="on">
            <classpath refid="lucene.classpath"/>
        </javac>
    </target>

    <!-- Package the compiled classes into a distributable jar. -->
    <target name="lucene-thdl-jar" depends="lucene-thdl-compile">
        <jar destfile="${vanillalib}/lucene-thdl.jar" basedir="${lucene-thdl.bin}/"/>
    </target>

</project>
|
|
@ -0,0 +1,56 @@
|
|||
package org.thdl.lucene;
|
||||
|
||||
import org.apache.lucene.analysis.*;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import java.io.*;
|
||||
import java.net.*;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Trims excess tshegs and other punctuation from Tibetan
|
||||
* words, leaving them in their proper citation form.
|
||||
*
|
||||
* @author Edward Garrett
|
||||
*/
|
||||
public class EdgeTshegTrimmer extends TokenFilter {
|
||||
public EdgeTshegTrimmer(TokenStream input) {
|
||||
super(input);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return next token in TokenStream, stripped of superfluous
|
||||
* tshegs
|
||||
*/
|
||||
public Token next() throws IOException {
|
||||
while (true) {
|
||||
Token token = input.next();
|
||||
if (token == null)
|
||||
return null;
|
||||
int length=token.termText().length();
|
||||
int start=0;
|
||||
while (start<length && !TshegBarTokenizer.isPartOfTshegBar(token.termText().charAt(start))) start++;
|
||||
int end=length-1;
|
||||
while (end>-1 && !TshegBarTokenizer.isPartOfTshegBar(token.termText().charAt(end))) end--;
|
||||
if (start<=end) {
|
||||
return new Token(addFinalTshegIfNecessary(token.termText().substring(start,end+1)), token.startOffset(), token.endOffset());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds a tsheg to a <code>String</code> that doesn't
|
||||
* already end in one.
|
||||
*
|
||||
* @return original <code>String</code> with final tsheg
|
||||
* added if necessary
|
||||
*/
|
||||
public static String addFinalTshegIfNecessary(String s) {
|
||||
if (s.charAt(s.length()-1) == '\u0F0B')
|
||||
return s;
|
||||
else
|
||||
return s += "\u0F0B";
|
||||
// if (last == '\u0F42' || last == '\u0F0B')
|
||||
// return s;
|
||||
// else
|
||||
}
|
||||
}
|
|
@ -0,0 +1,12 @@
|
|||
package org.thdl.lucene;
|
||||
|
||||
import org.apache.lucene.analysis.*;
|
||||
import org.apache.solr.analysis.*;
|
||||
import java.io.*;
|
||||
|
||||
public class EdgeTshegTrimmerFactory extends BaseTokenFilterFactory {
|
||||
public TokenStream create(TokenStream input) {
|
||||
return new EdgeTshegTrimmer(input);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,32 @@
|
|||
package org.thdl.lucene;
|
||||
|
||||
import org.apache.lucene.analysis.*;
|
||||
import java.text.DecimalFormat;
|
||||
import java.io.*;
|
||||
|
||||
public class NumberPadder extends TokenFilter {
|
||||
public static final String NUMBER_TYPE = "Number";
|
||||
private static final DecimalFormat formatter = new DecimalFormat("0000000000");
|
||||
|
||||
public static String pad(int n) {
|
||||
return formatter.format(n);
|
||||
}
|
||||
|
||||
public NumberPadder(TokenStream input) {
|
||||
super(input);
|
||||
}
|
||||
|
||||
public Token next() throws IOException {
|
||||
Token token = input.next();
|
||||
if (token == null)
|
||||
return null;
|
||||
try {
|
||||
int i = Integer.parseInt(token.termText());
|
||||
Token replace = new Token(pad(i), token.startOffset(), token.endOffset(), NUMBER_TYPE);
|
||||
replace.setPositionIncrement(token.getPositionIncrement());
|
||||
return replace;
|
||||
} catch (NumberFormatException nfe) {
|
||||
return token;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,10 @@
|
|||
package org.thdl.lucene;
|
||||
|
||||
import org.apache.lucene.analysis.*;
|
||||
import org.apache.solr.analysis.*;
|
||||
|
||||
public class NumberPadderFactory extends BaseTokenFilterFactory {
|
||||
public TokenStream create(TokenStream input) {
|
||||
return new NumberPadder(input);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,131 @@
|
|||
package org.thdl.lucene;
|
||||
|
||||
import org.apache.lucene.analysis.*;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.thdl.tib.scanner.*;
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Takes stream of Unicode Tibetan text and tokenizes it
|
||||
* into "syllables" or tsheg bars. Note that this is not
|
||||
* equivalent to tokenizing into "words" since words frequently
|
||||
* consist of more than one tsheg bar.
|
||||
* <p>
|
||||
* Non-Tibetan text and Tibetan punctuation is ignored by this
|
||||
* class.
|
||||
*
|
||||
* @author Edward Garrett
|
||||
*/
|
||||
public class TshegBarTokenizer extends Tokenizer {
|
||||
public int offset = 0;
|
||||
|
||||
public static void main(String[] args) {
|
||||
try {
|
||||
Writer out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream("out.txt"), "UTF-8"));
|
||||
TshegBarTokenizer tok = new TshegBarTokenizer(new StringReader(args[0]));
|
||||
Token next = tok.next();
|
||||
while (next != null) {
|
||||
out.write(next.termText() + "\n");
|
||||
next = tok.next();
|
||||
}
|
||||
out.flush();
|
||||
out.close();
|
||||
} catch (IOException ioe) {
|
||||
ioe.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructs a <code>TshegBarTokenizer</code> from a <code>Reader</code>
|
||||
*/
|
||||
public TshegBarTokenizer(Reader in) {
|
||||
super(in);
|
||||
}
|
||||
|
||||
/**
|
||||
* Processes a stream of Tibetan text into a sequence of tsheg bars.
|
||||
* Note that tsheg bars are returned in citation (dictionary) form,
|
||||
* meaning in the normal case that final tshegs are added if not already
|
||||
* present.
|
||||
*
|
||||
* @return the next tsheg bar in the stream
|
||||
*/
|
||||
public Token next() throws IOException {
|
||||
int c;
|
||||
do {
|
||||
c = input.read();
|
||||
offset++;
|
||||
} while (c!=-1 && !isPartOfTshegBar((char)c));
|
||||
if (c==-1) return null; //reached end of stream without finding token
|
||||
|
||||
//otherwise, this is the start of the tsheg bar
|
||||
int start = offset-1;
|
||||
StringBuffer buffy = new StringBuffer();
|
||||
buffy.append((char)c);
|
||||
|
||||
c = input.read();
|
||||
offset++;
|
||||
while (c!=-1 && isPartOfTshegBar((char)c)) {
|
||||
buffy.append((char)c);
|
||||
c = input.read();
|
||||
offset++;
|
||||
}
|
||||
buffy.append('\u0F0B'); //add tsheg to end of token
|
||||
|
||||
String token = buffy.toString();
|
||||
if (c == '\u0F0B') {
|
||||
return new Token(token.toString(), start, offset, "?"); //include tsheg for purposes of highlighting
|
||||
} else {
|
||||
return new Token(token.toString(), start, offset-1, "?"); //type "?" means not yet tagged
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Determines whether or not passed character belongs to the "inner" (contentful)
|
||||
* part of a Tibetan tsheg bar.
|
||||
*
|
||||
* @return <code>true</code> if <code>c</code> is both Tibetan and
|
||||
* not a <code>Character.NON_SPACING_MARK</code>; <code>false</code>
|
||||
* otherwise
|
||||
*/
|
||||
public static boolean isPartOfTshegBar(char c) {
|
||||
return (Character.UnicodeBlock.of(c) == Character.UnicodeBlock.TIBETAN &&
|
||||
(Character.isLetterOrDigit(c) || Character.getType(c) == Character.NON_SPACING_MARK))
|
||||
? true : false;
|
||||
}
|
||||
|
||||
//returns tsheg bar if string represents single tsheg bar; otherwise returns null
|
||||
/**
|
||||
* Reduces a series of possibly multiple tsheg bars to a single tsheg bar,
|
||||
* ignoring all tsheg bars after the first.
|
||||
*
|
||||
* @return the first tsheg bar within the passed
|
||||
* <code>String</code>
|
||||
*/
|
||||
public static String getSingleTshegBar(String val) throws IOException {
|
||||
TshegBarTokenizer tokenizer = new TshegBarTokenizer(new StringReader(val));
|
||||
Token next = tokenizer.next();
|
||||
if (next!=null) {
|
||||
if (tokenizer.next()==null) {
|
||||
return next.termText();
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @return array containing tsheg bars occurring in passed <code>String</String>
|
||||
*/
|
||||
public static String[] getTshegBars(String text_bo) throws IOException {
|
||||
TshegBarTokenizer tok = new TshegBarTokenizer(new StringReader(text_bo));
|
||||
List<String> tokens = new ArrayList<String>();
|
||||
Token next = tok.next();
|
||||
while (next != null) {
|
||||
tokens.add(next.termText());
|
||||
next = tok.next();
|
||||
}
|
||||
return (String[])tokens.toArray(new String[0]);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,14 @@
|
|||
package org.thdl.lucene;
|
||||
|
||||
import org.apache.lucene.analysis.*;
|
||||
import org.apache.solr.analysis.*;
|
||||
import java.io.*;
|
||||
|
||||
public class TshegBarTokenizerFactory extends BaseTokenizerFactory {
|
||||
public TshegBarTokenizerFactory() {
|
||||
}
|
||||
public TokenStream create(Reader input) {
|
||||
return new TshegBarTokenizer(input);
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue