Jskad/lucene-thdl-build.xml

239 lines
9.9 KiB
XML
Raw Normal View History

<?xml version="1.0" encoding="utf-8"?>
<project name="lucene-thdl" default="lucene-thdl-jar" basedir=".">
<import file="build.xml"/>
<!--
  Registers the ant-contrib tasks listed in antcontrib.properties.
  This file uses foreach, if and propertyregex from that set.
-->
<taskdef resource="net/sf/antcontrib/antcontrib.properties">
  <classpath location="${ext}/to-be-installed-with-ant/ant-contrib.jar"/>
</taskdef>
<!-- Output directory for classes compiled by this build file.
     ${bin} is not defined here; presumably it comes from the imported build.xml. -->
<property
    name="lucene-thdl.bin"
    location="${bin}/lucene-thdl"/>
2007-05-15 12:28:43 +00:00
<!-- Root working directory of the transcript archive (a relative location, resolved by Ant against basedir). -->
<property
    name="archive"
    location="archive"/>
<!-- Downloaded transcripts are written here (see archive-get-one-transcript). -->
<property
    name="wylie"
    location="${archive}/wylie"/>
<!-- Output directory of the qdToUnicode.xsl conversion. -->
<property
    name="unicode"
    location="${archive}/unicode"/>
<!-- Not referenced by any active target in this file. -->
<property
    name="solr"
    location="${archive}/solr"/>
<!-- Directory holding the XSLT stylesheets used by the archive targets. -->
<property
    name="styles"
    location="${archive}/styles"/>
2007-05-15 12:28:43 +00:00
<!-- Remote THDL endpoints. -->
<!-- Source of the title metadata XML fetched by archive-get-metadata. -->
<property
    name="get.title.metadata"
    value="http://thdl.org/avarch/mediaflowcat/title_metadata.php"/>
<!-- Base URL under which individual transcript files are fetched (no trailing slash). -->
<property
    name="url.to.transcripts"
    value="http://www.thdl.org/avarch/transcripts"/>
<!-- Media base URLs; not referenced by any active target in this file. -->
<property
    name="url.to.media.high"
    value="http://www.thdl.org/media/high"/>
<property
    name="url.to.media.low"
    value="http://www.thdl.org/media/low"/>
<property
    name="url.to.media.audio"
    value="http://www.thdl.org/media/audio"/>
<!-- Every jar directly inside ${ext}/apache; used as the compile classpath of lucene-thdl-compile. -->
<path id="lucene.classpath">
  <fileset id="lucene.extensions" dir="${ext}/apache" includes="*.jar"/>
</path>
<!-- Every jar directly inside ${ext}/saxon; used wherever Saxon is run. -->
<path id="saxon.classpath">
  <fileset id="saxon.extensions" dir="${ext}/saxon" includes="*.jar"/>
</path>
2007-05-15 12:47:43 +00:00
2007-05-15 12:28:43 +00:00
<!-- archive tasks -->
<!--
  Fetches the title metadata document from ${get.title.metadata} and stores it
  as ${archive}/title_metadata.xml, creating ${archive} first if needed.
-->
<target name="archive-get-metadata">
  <mkdir dir="${archive}"/>
  <get
      src="${get.title.metadata}"
      dest="${archive}/title_metadata.xml"
      verbose="on"/>
</target>
<!--
  Derives two transcript lists from ${archive}/title_metadata.xml:
    titles_as_list.txt   via get-list-of-transcripts.xsl  (read by archive-transcripts-to-unicode)
    titles_as_list2.txt  via get-list-of-transcripts2.xsl (read by archive-get-transcripts)
-->
<target name="archive-get-list-of-transcripts">
  <!-- Disabled alternative: run Saxon directly instead of Ant's xslt task.
  <java classname="net.sf.saxon.Transform" fork="yes">
    <arg value="-s"/>
    <arg value="${archive}/title_metadata.xml"/>
    <arg value="-o"/>
    <arg value="${archive}/titles_as_list.txt"/>
    <arg value="${styles}/get-list-of-transcripts.xsl"/>
    <classpath>
      <path refid="saxon.classpath"/>
    </classpath>
  </java>
  -->
  <xslt
      style="${styles}/get-list-of-transcripts.xsl"
      in="${archive}/title_metadata.xml"
      out="${archive}/titles_as_list.txt"/>
  <xslt
      style="${styles}/get-list-of-transcripts2.xsl"
      in="${archive}/title_metadata.xml"
      out="${archive}/titles_as_list2.txt"/>
</target>
<!--
  Downloads every transcript listed in ${archive}/titles_as_list2.txt.
  The file content is split on single spaces and archive-get-one-transcript is
  invoked once per entry, with the entry passed as the "filename" parameter.
-->
<target name="archive-get-transcripts">
  <mkdir dir="${wylie}"/>
  <!-- The file is read with the default encoding; an encoding="UTF-8" attribute was left disabled. -->
  <loadfile
      srcFile="${archive}/titles_as_list2.txt"
      property="transcript-list"/>
  <foreach
      target="archive-get-one-transcript"
      param="filename"
      list="${transcript-list}"
      delimiter=" "/>
  <!-- Disabled variant that passed each entry as "id.plus.filename" instead:
  <foreach list="${transcript-list}" delimiter=" " param="id.plus.filename" target="archive-get-one-transcript"/>
  -->
</target>
<!--
  Downloads a single transcript, ${url.to.transcripts}/${filename}, into ${wylie}.
  Expects the "filename" parameter from the caller's foreach. A failed download is
  ignored (ignoreerrors) so the remaining transcripts are still attempted.
-->
<target name="archive-get-one-transcript">
  <!-- Disabled: derive filename from an "id/filename" entry.
  <propertyregex property="filename" input="${id.plus.filename}" regexp=".+/(.+)" select="\1"/>
  -->
  <get
      src="${url.to.transcripts}/${filename}"
      dest="${wylie}/${filename}"
      ignoreerrors="true"/>
</target>
<!--
  Runs Saxon's command-line transformer (net.sf.saxon.Transform) in a forked JVM,
  applying mergeMetadataAndData.xsl to ${archive}/title_metadata.xml.
  The two trailing name=value arguments are passed to Saxon unchanged; they look
  like stylesheet parameters (confirm against mergeMetadataAndData.xsl).
-->
<target name="saxon-test">
  <mkdir dir="${unicode}"/>
  <java classname="net.sf.saxon.Transform" fork="yes">
    <arg value="-s"/>
    <arg value="${archive}/title_metadata.xml"/>
    <arg value="${styles}/mergeMetadataAndData.xsl"/>
    <!-- Built from url.to.transcripts rather than a second hard-coded copy of the URL,
         so overriding that property affects this target too. The trailing slash is kept. -->
    <arg value="transcript.location=${url.to.transcripts}/"/>
    <arg value="transform.to.dir=${unicode}"/>
    <classpath>
      <path refid="saxon.classpath"/>
      <pathelement location="${vanillalib}/Jskad.jar"/>
    </classpath>
  </java>
</target>
<!--
  Converts the downloaded transcripts in ${wylie} to Unicode files in ${unicode}.
  Steps:
    1. call jskad-dist with an empty my.jar.suffix (defined outside this file;
       presumably produces ${vanillalib}/Jskad.jar, which the per-file target uses);
    2. compile org.thdl.util.ant.SaxonLiaison into ${lucene-thdl.bin};
    3. split ${archive}/titles_as_list.txt on single spaces and invoke
       archive-one-transcript-to-unicode once per entry ("id.plus.filename").
-->
<target name="archive-transcripts-to-unicode">
  <mkdir dir="${unicode}"/>
  <antcall target="jskad-dist">
    <param name="my.jar.suffix" value=""/>
  </antcall>
  <mkdir dir="${lucene-thdl.bin}"/>
  <javac srcdir="${source}" destdir="${lucene-thdl.bin}" includes="org/thdl/util/ant/SaxonLiaison.java" debug="on"/>
  <!-- Uses its own property name. Ant properties are immutable, so reusing
       "transcript-list" (set by archive-get-transcripts from titles_as_list2.txt)
       would leave the old list in place when both targets run in one Ant invocation.
       The file is read with the default encoding; encoding="UTF-8" was left disabled. -->
  <loadfile property="transcript-id-list" srcFile="${archive}/titles_as_list.txt"/>
  <foreach list="${transcript-id-list}" delimiter=" " param="id.plus.filename" target="archive-one-transcript-to-unicode"/>
</target>
<!--
  Converts one transcript to Unicode with qdToUnicode.xsl.
  Expects the "id.plus.filename" parameter in the form <title id>/<filename>.
  The part before the last slash becomes title.id and the part after it becomes filename.
  Nothing happens when ${wylie}/${filename} does not exist.
-->
<target name="archive-one-transcript-to-unicode">
  <propertyregex
      property="title.id"
      input="${id.plus.filename}"
      regexp="(.+)/.+"
      select="\1"/>
  <propertyregex
      property="filename"
      input="${id.plus.filename}"
      regexp=".+/(.+)"
      select="\1"/>
  <!--
    The custom processor org.thdl.util.ant.SaxonLiaison is a workaround for Ant bug #41314:
      http://issues.apache.org/bugzilla/show_bug.cgi?id=41314
    Background: http://www.vocaro.com/trevor/blog/2007/01/08/how-to-use-saxon-with-ant/
  -->
  <if>
    <available file="${wylie}/${filename}" property="transcript.exists"/>
    <then>
      <xslt
          processor="org.thdl.util.ant.SaxonLiaison"
          style="${styles}/qdToUnicode.xsl"
          in="${wylie}/${filename}"
          out="${unicode}/${filename}">
        <param name="title.id" expression="${title.id}"/>
        <classpath>
          <pathelement location="${lucene-thdl.bin}"/>
          <pathelement location="${vanillalib}/Jskad.jar"/>
          <path refid="saxon.classpath"/>
        </classpath>
      </xslt>
    </then>
  </if>
</target>
<!-- solr tasks -->
<!-- Placeholder: this target currently contains no tasks. -->
<target name="solr-prepare-transcripts"/>
<!-- solr stuff -->
<!-- <target name="solr-1:clean-local" depends="clean">
<delete dir="${solarized.transcript.dir.prefinal}"/>
<delete dir="${solarized.transcript.dir.final}"/>
<mkdir dir="${solarized.transcript.dir.prefinal}"/>
<mkdir dir="${solarized.transcript.dir.final}"/>
</target>-->
<!-- <target name="solr-2:prepare-documents" depends="init, dbxml-4:ngram-magic">-->
<!-- create xml data file used to assign tags to mono tsheg bars -->
<!--
<java classname="net.sf.saxon.Transform" fork="yes">
<arg value="-o"/>
<arg value="${build.dir}/tshegbartags.xml"/>
<arg value="${lexicon.dir}/lhasa-verbs.xml"/>
<arg value="${stylesheet.dir}/prepareTshegBarTagger.xsl"/>
<classpath>
<pathelement location="${bin.dir}"/>
<path refid="classpath"/>
</classpath>
</java>
-->
<!-- create xml file used to assign synonyms to certain mono tsheg bars -->
<!--
<java classname="net.sf.saxon.Transform" fork="yes">
<arg value="-o"/>
<arg value="${build.dir}/synonyms.xml"/>
<arg value="${lexicon.dir}/lhasa-verbs.xml"/>
<arg value="${stylesheet.dir}/makeSynonymFile.xsl"/>
<classpath>
<pathelement location="${bin.dir}"/>
<path refid="classpath"/>
</classpath>
</java>
-->
<!-- <loadfile property="xquery" srcfile="${stylesheet.dir}/solarizeTranscriptDatabase.xql" encoding="UTF-8"/>
<java classname="org.thdl.dbxml.QueryTools" fork="yes">
<arg value="${dbxml.environment.dir}"/>
<arg value="${dbxml.container}"/>
<arg value="${xquery}"/>
<arg value="${solarized.transcript.dir.prefinal}"/>
<classpath>
<pathelement location="${bin.dir}"/>
<path refid="classpath"/>
</classpath>
<jvmarg value="-Djava.library.path=${dbxml.lib}"/>
</java>-->
<!-- insert whether or not media exists for segment -->
<!-- <java classname="net.sf.saxon.Transform" fork="yes">
<arg value="-o"/>
<arg value="${solarized.transcript.dir.final}"/>
<arg value="${solarized.transcript.dir.prefinal}"/>
<arg value="${stylesheet.dir}/movieInfoToSolrAdd.xsl"/>
<classpath>
<pathelement location="${bin.dir}"/>
<path refid="classpath"/>
</classpath>
</java>
<copy todir="${solarized.transcript.dir.final}">
<fileset dir="." includes="*.sh"/>
</copy>
</target>-->
<!--
Background: the example schema.xml specifies a "uniqueKey" field called "id".
Whenever you POST instructions to Solr to add a document with the same value for
the uniqueKey as an existing document, Solr automatically replaces it for you.
However, for us, uniqueKeys combine the document id (which won't change) with
sentence ids (which may change), e.g. <field name="id">2291_d629e12</field>.
So, to replace a document, we'll find the XML document by name (document id) in the
dbxml database, then get all sentence ids for that document, then combine them as
docId_sentenceId and remove/replace those entries in Lucene.
-->
<!--
<target name="solr-3:commit-documents">
<exec executable="sh" dir="${solarized.transcript.dir.final}">
<arg value="post.sh"/>
<arg value="*.xml"/>
</exec>
</target>
<target name="solr-4:delete-documents" depends="solr-1:clean-local">
<exec executable="curl">
<arg value="${solr.update}"/>
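<arg value="-data-binary"/> NOTE: the real option needs a double dash at the beginning (presumably written with one dash here because a double dash is not allowed inside an XML comment)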
<arg value="&lt;delete&gt;&lt;query&gt;id:[* TO *]&lt;/query&gt;&lt;/delete&gt;"/>
</exec>
<exec executable="curl">
<arg value="${solr.update}"/>
<arg value="-data-binary"/> NOTE: as above, the real option needs a double dash at the beginning
<arg value="&lt;commit/&gt;"/>
</exec>
</target>
-->
<!-- concordance program -->
<!--
  Compiles the org.thdl.lucene sources found under ${source} into ${lucene-thdl.bin},
  with the jars of lucene.classpath on the compile classpath. Runs after "init"
  (not defined in this file; presumably provided by the imported build.xml).
-->
<target name="lucene-thdl-compile" depends="init">
  <mkdir dir="${lucene-thdl.bin}"/>
  <!-- In Ant patterns "**" spans directory levels only when it is a whole path segment.
       The earlier "org/thdl/lucene/**.java" therefore matched like "*.java" and selected
       only files directly in that directory. "**/*.java" selects the whole package tree. -->
  <javac srcdir="${source}" destdir="${lucene-thdl.bin}" includes="org/thdl/lucene/**/*.java" debug="on">
    <classpath refid="lucene.classpath"/>
  </javac>
</target>
<!--
  Default target of this project: packages everything under ${lucene-thdl.bin}
  into ${vanillalib}/lucene-thdl.jar after lucene-thdl-compile has run.
-->
<target name="lucene-thdl-jar" depends="lucene-thdl-compile">
  <jar
      basedir="${lucene-thdl.bin}/"
      destfile="${vanillalib}/lucene-thdl.jar"/>
</target>
</project>