2007-05-14 11:40:24 +00:00
|
|
|
<?xml version="1.0" encoding="utf-8"?>
|
|
|
|
|
2007-05-15 17:48:15 +00:00
|
|
|
<project name="lucene-thdl" default="lucene-thdl-jar" basedir=".">
|
2007-05-14 11:40:24 +00:00
|
|
|
<import file="build.xml"/>
|
2007-05-15 18:56:04 +00:00
|
|
|
|
2007-05-17 18:20:12 +00:00
|
|
|
<!-- Registers the ant-contrib task definitions listed in antcontrib.properties,
     loading them from the ant-contrib jar under ${ext}/to-be-installed-with-ant.
     Targets in this file use its foreach, if and propertyregex tasks.
     NOTE(review): ${ext} is not defined in this file; presumably it comes from
     the imported build.xml. -->
<taskdef resource="net/sf/antcontrib/antcontrib.properties">
  <classpath location="${ext}/to-be-installed-with-ant/ant-contrib.jar"/>
</taskdef>
|
|
|
|
|
2007-05-14 11:40:24 +00:00
|
|
|
<!-- Directory that receives the compiled lucene-thdl classes. -->
<property name="lucene-thdl.bin" location="${bin}/lucene-thdl"/>

<!-- Local working tree used by the archive targets. "archive" must be
     declared first because the directories below are resolved against it. -->
<property name="archive" location="archive"/>
<!-- Downloaded transcripts are written here. -->
<property name="wylie"   location="${archive}/wylie"/>
<!-- Output of the transcript-to-Unicode transformations. -->
<property name="unicode" location="${archive}/unicode"/>
<property name="solr"    location="${archive}/solr"/>
<!-- XSL stylesheets applied by the archive targets. -->
<property name="styles"  location="${archive}/styles"/>

<!-- Remote THDL endpoints. -->
<property name="get.title.metadata" value="http://thdl.org/avarch/mediaflowcat/title_metadata.php"/>
<property name="url.to.transcripts" value="http://www.thdl.org/avarch/transcripts"/>
<property name="url.to.media.high"  value="http://www.thdl.org/media/high"/>
<property name="url.to.media.low"   value="http://www.thdl.org/media/low"/>
<property name="url.to.media.audio" value="http://www.thdl.org/media/audio"/>
|
2007-05-14 11:40:24 +00:00
|
|
|
|
|
|
|
<!-- Classpath made of every jar found directly inside ${ext}/apache;
     referenced by the javac call in lucene-thdl-compile. -->
<path id="lucene.classpath">
  <fileset id="lucene.extensions" dir="${ext}/apache" includes="*.jar"/>
</path>
|
|
|
|
|
2007-05-17 18:20:12 +00:00
|
|
|
<!-- Classpath made of every jar found directly inside ${ext}/saxon;
     referenced by the targets that run Saxon transformations. -->
<path id="saxon.classpath">
  <fileset id="saxon.extensions" dir="${ext}/saxon" includes="*.jar"/>
</path>
|
|
|
|
|
2007-05-15 12:47:43 +00:00
|
|
|
|
2007-05-15 12:28:43 +00:00
|
|
|
<!-- archive tasks -->
|
|
|
|
<!-- Fetches the title metadata document from ${get.title.metadata} and
     saves it as ${archive}/title_metadata.xml, creating ${archive} first. -->
<target name="archive-get-metadata">
  <mkdir dir="${archive}"/>
  <get dest="${archive}/title_metadata.xml"
       src="${get.title.metadata}"
       verbose="on"/>
</target>
|
2007-05-16 23:49:23 +00:00
|
|
|
|
|
|
|
<!-- Produces two text listings from ${archive}/title_metadata.xml:
       titles_as_list.txt   via get-list-of-transcripts.xsl
       titles_as_list2.txt  via get-list-of-transcripts2.xsl
     Both stylesheets are read from ${styles}. The metadata file itself is
     downloaded by archive-get-metadata, which is not declared as a dependency. -->
<target name="archive-get-list-of-transcripts">
  <!-- Disabled alternative that ran the first transformation by forking Saxon directly:
  <java classname="net.sf.saxon.Transform" fork="yes">
    <arg value="-s"/>
    <arg value="${archive}/title_metadata.xml"/>
    <arg value="-o"/>
    <arg value="${archive}/titles_as_list.txt"/>
    <arg value="${styles}/get-list-of-transcripts.xsl"/>
    <classpath>
      <path refid="saxon.classpath"/>
    </classpath>
  </java>
  -->
  <xslt style="${styles}/get-list-of-transcripts.xsl"
        in="${archive}/title_metadata.xml"
        out="${archive}/titles_as_list.txt"/>
  <xslt style="${styles}/get-list-of-transcripts2.xsl"
        in="${archive}/title_metadata.xml"
        out="${archive}/titles_as_list2.txt"/>
</target>
|
|
|
|
|
|
|
|
<!-- Downloads every transcript named in ${archive}/titles_as_list2.txt.
     The file content is loaded into the transcript-list property, split on
     single spaces, and archive-get-one-transcript is invoked once per entry
     with that entry passed as the "filename" parameter. -->
<target name="archive-get-transcripts">
  <mkdir dir="${wylie}"/>
  <!-- encoding="UTF-8" -->
  <loadfile srcFile="${archive}/titles_as_list2.txt"
            property="transcript-list"/>
  <foreach target="archive-get-one-transcript"
           param="filename"
           list="${transcript-list}"
           delimiter=" "/>
  <!-- Disabled variant that passed combined id/filename entries instead: -->
  <!--<foreach list="${transcript-list}" delimiter=" " param="id.plus.filename" target="archive-get-one-transcript"/>-->
</target>
|
|
|
|
|
|
|
|
<!-- Downloads a single transcript, ${url.to.transcripts}/${filename}, into
     ${wylie}. Expects the "filename" property to be supplied by the caller
     (archive-get-transcripts does so through foreach). Download failures are
     ignored (ignoreerrors="true"), so a missing remote file does not stop the build. -->
<target name="archive-get-one-transcript">
  <!-- Disabled: extraction of the file name from a combined id/filename value. -->
  <!--<propertyregex property="filename" input="${id.plus.filename}" regexp=".+/(.+)" select="\1"/>-->
  <get dest="${wylie}/${filename}"
       src="${url.to.transcripts}/${filename}"
       ignoreerrors="true"/>
</target>
|
|
|
|
|
2007-05-22 08:56:06 +00:00
|
|
|
<!-- Runs net.sf.saxon.Transform in a forked JVM against
     ${archive}/title_metadata.xml using ${styles}/mergeMetadataAndData.xsl.
     Two name=value arguments are handed to the stylesheet run: the remote
     transcript location and the output directory (${unicode}).
     The classpath is the Saxon jars followed by ${vanillalib}/Jskad.jar. -->
<target name="saxon-test">
  <mkdir dir="${unicode}"/>
  <java fork="yes" classname="net.sf.saxon.Transform">
    <!-- Arguments are positional; keep them in this order. -->
    <arg value="-s"/>
    <arg value="${archive}/title_metadata.xml"/>
    <arg value="${styles}/mergeMetadataAndData.xsl"/>
    <arg value="transcript.location=http://www.thdl.org/avarch/transcripts/"/>
    <arg value="transform.to.dir=${unicode}"/>
    <classpath>
      <path refid="saxon.classpath"/>
      <pathelement location="${vanillalib}/Jskad.jar"/>
    </classpath>
  </java>
</target>
|
2007-05-16 23:49:23 +00:00
|
|
|
|
2007-05-17 18:20:12 +00:00
|
|
|
<!-- Converts the downloaded transcripts to Unicode.
     Steps:
       1. create ${unicode};
       2. call the jskad-dist target with an empty my.jar.suffix
          (NOTE(review): jskad-dist is not defined in this file; presumably it
          comes from the imported build.xml);
       3. compile org/thdl/util/ant/SaxonLiaison.java into ${lucene-thdl.bin},
          which the per-file target uses as its XSLT processor;
       4. load ${archive}/titles_as_list.txt and invoke
          archive-one-transcript-to-unicode once per space-separated entry,
          passing the entry as "id.plus.filename".

     The list is loaded into its own property, transcript-id-list. Ant
     properties are immutable once set, and archive-get-transcripts loads a
     different file (titles_as_list2.txt) into transcript-list; sharing that
     name would make this target silently iterate the wrong list whenever both
     targets run in the same Ant invocation. -->
<target name="archive-transcripts-to-unicode">
  <mkdir dir="${unicode}"/>
  <antcall target="jskad-dist">
    <param name="my.jar.suffix" value=""/>
  </antcall>
  <mkdir dir="${lucene-thdl.bin}"/>
  <javac srcdir="${source}"
         destdir="${lucene-thdl.bin}"
         includes="org/thdl/util/ant/SaxonLiaison.java"
         debug="on"/>
  <!-- encoding="UTF-8" -->
  <loadfile property="transcript-id-list"
            srcFile="${archive}/titles_as_list.txt"/>
  <foreach list="${transcript-id-list}"
           delimiter=" "
           param="id.plus.filename"
           target="archive-one-transcript-to-unicode"/>
</target>
|
|
|
|
|
|
|
|
<!-- Transforms one transcript from ${wylie} into ${unicode} with qdToUnicode.xsl.
     Expects the "id.plus.filename" property in the form <title id>/<file name>;
     the part before the last slash becomes title.id and the part after it
     becomes filename. Nothing happens when ${wylie}/${filename} does not exist. -->
<target name="archive-one-transcript-to-unicode">
  <propertyregex property="title.id"
                 input="${id.plus.filename}"
                 regexp="(.+)/.+"
                 select="\1"/>
  <propertyregex property="filename"
                 input="${id.plus.filename}"
                 regexp=".+/(.+)"
                 select="\1"/>
  <if>
    <available file="${wylie}/${filename}" property="transcript.exists"/>
    <then>
      <!-- The org.thdl.util.ant.SaxonLiaison processor is a hack to work around
           Ant bug #41314: http://issues.apache.org/bugzilla/show_bug.cgi?id=41314
           See Trevor's bike shed:
           http://www.vocaro.com/trevor/blog/2007/01/08/how-to-use-saxon-with-ant/ -->
      <xslt processor="org.thdl.util.ant.SaxonLiaison"
            style="${styles}/qdToUnicode.xsl"
            in="${wylie}/${filename}"
            out="${unicode}/${filename}">
        <param name="title.id" expression="${title.id}"/>
        <classpath>
          <pathelement location="${lucene-thdl.bin}"/>
          <pathelement location="${vanillalib}/Jskad.jar"/>
          <path refid="saxon.classpath"/>
        </classpath>
      </xslt>
    </then>
  </if>
</target>
|
|
|
|
|
|
|
|
|
|
|
|
<!-- solr tasks -->
|
|
|
|
<!-- Placeholder: this target currently contains no tasks. -->
<target name="solr-prepare-transcripts"/>
|
|
|
|
|
|
|
|
|
|
|
|
<!-- solr stuff -->
|
|
|
|
<!-- <target name="solr-1:clean-local" depends="clean">
|
|
|
|
<delete dir="${solarized.transcript.dir.prefinal}"/>
|
|
|
|
<delete dir="${solarized.transcript.dir.final}"/>
|
|
|
|
<mkdir dir="${solarized.transcript.dir.prefinal}"/>
|
|
|
|
<mkdir dir="${solarized.transcript.dir.final}"/>
|
|
|
|
</target>-->
|
|
|
|
|
|
|
|
<!-- <target name="solr-2:prepare-documents" depends="init, dbxml-4:ngram-magic">-->
|
|
|
|
<!-- create xml data file used to assign tags to mono tsheg bars -->
|
|
|
|
<!--
|
|
|
|
<java classname="net.sf.saxon.Transform" fork="yes">
|
|
|
|
<arg value="-o"/>
|
|
|
|
<arg value="${build.dir}/tshegbartags.xml"/>
|
|
|
|
<arg value="${lexicon.dir}/lhasa-verbs.xml"/>
|
|
|
|
<arg value="${stylesheet.dir}/prepareTshegBarTagger.xsl"/>
|
|
|
|
<classpath>
|
|
|
|
<pathelement location="${bin.dir}"/>
|
|
|
|
<path refid="classpath"/>
|
|
|
|
</classpath>
|
|
|
|
</java>
|
|
|
|
-->
|
|
|
|
|
|
|
|
<!-- create xml file used to assign synonyms to certain mono tsheg bars -->
|
|
|
|
<!--
|
|
|
|
<java classname="net.sf.saxon.Transform" fork="yes">
|
|
|
|
<arg value="-o"/>
|
|
|
|
<arg value="${build.dir}/synonyms.xml"/>
|
|
|
|
<arg value="${lexicon.dir}/lhasa-verbs.xml"/>
|
|
|
|
<arg value="${stylesheet.dir}/makeSynonymFile.xsl"/>
|
|
|
|
<classpath>
|
|
|
|
<pathelement location="${bin.dir}"/>
|
|
|
|
<path refid="classpath"/>
|
|
|
|
</classpath>
|
|
|
|
</java>
|
|
|
|
-->
|
|
|
|
|
|
|
|
<!-- <loadfile property="xquery" srcfile="${stylesheet.dir}/solarizeTranscriptDatabase.xql" encoding="UTF-8"/>
|
|
|
|
<java classname="org.thdl.dbxml.QueryTools" fork="yes">
|
|
|
|
<arg value="${dbxml.environment.dir}"/>
|
|
|
|
<arg value="${dbxml.container}"/>
|
|
|
|
<arg value="${xquery}"/>
|
|
|
|
<arg value="${solarized.transcript.dir.prefinal}"/>
|
|
|
|
<classpath>
|
|
|
|
<pathelement location="${bin.dir}"/>
|
|
|
|
<path refid="classpath"/>
|
|
|
|
</classpath>
|
|
|
|
<jvmarg value="-Djava.library.path=${dbxml.lib}"/>
|
|
|
|
</java>-->
|
|
|
|
|
|
|
|
<!-- insert whether or not media exists for segment -->
|
|
|
|
<!-- <java classname="net.sf.saxon.Transform" fork="yes">
|
2007-05-17 18:20:12 +00:00
|
|
|
<arg value="-o"/>
|
2007-05-22 08:56:06 +00:00
|
|
|
<arg value="${solarized.transcript.dir.final}"/>
|
|
|
|
<arg value="${solarized.transcript.dir.prefinal}"/>
|
|
|
|
<arg value="${stylesheet.dir}/movieInfoToSolrAdd.xsl"/>
|
2007-05-17 18:20:12 +00:00
|
|
|
<classpath>
|
|
|
|
<pathelement location="${bin.dir}"/>
|
|
|
|
<path refid="classpath"/>
|
|
|
|
</classpath>
|
|
|
|
</java>
|
2007-05-22 08:56:06 +00:00
|
|
|
|
|
|
|
<copy todir="${solarized.transcript.dir.final}">
|
|
|
|
<fileset dir="." includes="*.sh"/>
|
|
|
|
</copy>
|
|
|
|
</target>-->
|
|
|
|
|
|
|
|
<!--
|
|
|
|
This is because the example schema.xml specifies a "uniqueKey" field called "id".
|
|
|
|
Whenever you POST instructions to Solr to add a document with the same value for
|
|
|
|
the uniqueKey as an existing document, it automatically replaces it for you.
|
|
|
|
|
|
|
|
However, for us, uniqueKeys combine the document id (which won't change) with
|
|
|
|
sentence ids (which may change), e.g. <field name="id">2291_d629e12</field>.
|
|
|
|
|
|
|
|
So, to replace, we'll find XML document by name (document id) in dbxml database,
|
|
|
|
then get all sentence ids for that document, then combine docId_sentenceId and
|
|
|
|
remove/replace from lucene.
|
|
|
|
-->
|
|
|
|
<!--
|
|
|
|
<target name="solr-3:commit-documents">
|
|
|
|
<exec executable="sh" dir="${solarized.transcript.dir.final}">
|
|
|
|
<arg value="post.sh"/>
|
|
|
|
<arg value="*.xml"/>
|
|
|
|
</exec>
|
2007-05-17 18:20:12 +00:00
|
|
|
</target>
|
2007-05-22 08:56:06 +00:00
|
|
|
|
|
|
|
<target name="solr-4:delete-documents" depends="solr-1:clean-local">
|
|
|
|
<exec executable="curl">
|
|
|
|
<arg value="${solr.update}"/>
|
|
|
|
<arg value="-data-binary"/> should have double dash at beginning
|
|
|
|
<arg value="<delete><query>id:[* TO *]</query></delete>"/>
|
|
|
|
</exec>
|
|
|
|
<exec executable="curl">
|
|
|
|
<arg value="${solr.update}"/>
|
|
|
|
<arg value="-data-binary"/> double dash again!
|
|
|
|
<arg value="<commit/>"/>
|
|
|
|
</exec>
|
|
|
|
</target>
|
|
|
|
-->
|
|
|
|
|
2007-05-14 11:40:24 +00:00
|
|
|
<!-- concordance program -->
|
|
|
|
<!-- Compiles the org.thdl.lucene sources from ${source} into ${lucene-thdl.bin}
     with debug information, using the jars in lucene.classpath.
     Runs after the "init" target (NOTE(review): init is not defined in this
     file; presumably it comes from the imported build.xml).

     The include pattern is org/thdl/lucene/**/*.java. In Ant patterns "**"
     matches directories only when it is a whole path segment, so the former
     "**.java" behaved like "*.java" and selected only the files directly inside
     org/thdl/lucene, never those in sub-packages. -->
<target name="lucene-thdl-compile" depends="init">
  <mkdir dir="${lucene-thdl.bin}"/>
  <javac srcdir="${source}"
         destdir="${lucene-thdl.bin}"
         includes="org/thdl/lucene/**/*.java"
         debug="on">
    <classpath refid="lucene.classpath"/>
  </javac>
</target>
|
|
|
|
|
|
|
|
<!-- Packages everything under ${lucene-thdl.bin} into
     ${vanillalib}/lucene-thdl.jar after lucene-thdl-compile has run.
     This is the project's default target. -->
<target name="lucene-thdl-jar" depends="lucene-thdl-compile">
  <jar basedir="${lucene-thdl.bin}/"
       destfile="${vanillalib}/lucene-thdl.jar"/>
</target>
|
|
|
|
|
|
|
|
</project>
|