Jskad/lucene-thdl-build.xml

193 lines
8.2 KiB
XML

<?xml version="1.0" encoding="utf-8"?>
<!--
Some THDL titles are problematic:
* 691, A New Script, claims to have a transcript but there is no
transcript at the URL.
* 2069, Husked Barley, has a zero KB transcript.
* 2116, Nasal Congestion, also has a zero KB transcript.
Each of these is handled here as a title with metadata but no data,
just like any other title without a transcription.
-->
<project name="lucene-thdl" default="lucene-thdl-jar" basedir=".">
<import file="build.xml"/>
<!-- Register the tasks listed in ant-contrib's antcontrib.properties,
     loading them from the ant-contrib jar located under ${ext}. -->
<taskdef resource="net/sf/antcontrib/antcontrib.properties">
  <classpath location="${ext}/to-be-installed-with-ant/ant-contrib.jar"/>
</taskdef>
<!-- Remote metadata service and the query-string fragments appended to it
     by the archive-get-all-*-metadata targets. -->
<property name="get.avdb.metadata" value="http://thdl.org/avarch/mediaflowcat/title_metadata.php"/>
<property name="only.titles" value="excludeNonTitles=true"/>
<property name="only.nontitles" value="excludeTitles=true"/>

<!-- Base URLs for transcripts and media. -->
<property name="url.to.transcripts" value="http://www.thdl.org/avarch/transcripts"/>
<property name="url.to.media.high" value="http://www.thdl.org/media/high"/>
<property name="url.to.media.low" value="http://www.thdl.org/media/low"/>
<property name="url.to.media.audio" value="http://www.thdl.org/media/audio"/>

<!-- Local build output and working directories.
     ${bin} is not defined in this file (presumably by the imported build.xml). -->
<property name="lucene-thdl.bin" location="${bin}/lucene-thdl"/>
<property name="archive" location="archive"/>

<!-- Downloaded metadata files. Declared with "location" (not "value") so
     they are resolved and normalised as file paths like every other path
     property in this file. -->
<property name="title.metadata" location="${archive}/title_metadata.xml"/>
<property name="nontitle.metadata" location="${archive}/nontitle_metadata.xml"/>

<!-- Stylesheets, merged transcripts and Solr input/output locations. -->
<property name="styles" location="${archive}/styles"/>
<property name="transcripts" location="${archive}/transcripts-with-metadata"/>
<property name="solr" location="${archive}/solr"/>
<property name="solr.titles" location="${solr}/titles"/>
<property name="solr.nontitle.dir" location="${solr}/nontitles"/>
<property name="solr.nontitle.file" location="${solr.nontitle.dir}/add_nontitles.xml"/>
<property name="copy.to.solr.webapp" location="${archive}/copy-to-solr-webapp"/>

<!-- Solr server endpoints used by the solr-post-and-commit-* targets. -->
<property name="url.to.solr" value="http://localhost:8983/solr"/>
<property name="url.to.solr.update" value="${url.to.solr}/update"/>
<!-- Classpath made of every jar sitting directly in ${ext}/apache. -->
<path id="lucene.classpath">
  <fileset id="lucene.extensions" dir="${ext}/apache" includes="*.jar"/>
</path>
<!-- Classpath made of every jar sitting directly in ${ext}/saxon. -->
<path id="saxon.classpath">
  <fileset id="saxon.extensions" dir="${ext}/saxon" includes="*.jar"/>
</path>
<!-- Runs the jskad-dist target with an empty my.jar.suffix.
     jskad-dist is not defined in this file (presumably it comes from the
     imported build.xml). -->
<target name="compile-and-jar-libraries">
  <antcall target="jskad-dist">
    <param name="my.jar.suffix" value=""/>
  </antcall>
</target>
<!-- archive tasks -->
<!-- Downloads metadata from the AV database.
     Expects two properties from the caller:
       parameters      - query string appended after "?" to ${get.avdb.metadata}
       output.filename - file the response is saved to
     Creates ${archive} first so the destination directory exists. -->
<target name="archive-get-metadata">
  <mkdir dir="${archive}"/>
  <get dest="${output.filename}"
       src="${get.avdb.metadata}?${parameters}"
       verbose="on"/>
</target>
<!-- Fetches title metadata: calls archive-get-metadata with the
     ${only.titles} query and saves the result to ${title.metadata}. -->
<target name="archive-get-all-title-metadata">
  <antcall target="archive-get-metadata">
    <param name="output.filename" value="${title.metadata}"/>
    <param name="parameters" value="${only.titles}"/>
  </antcall>
</target>
<!-- Fetches non-title metadata: calls archive-get-metadata with the
     ${only.nontitles} query and saves the result to ${nontitle.metadata}. -->
<target name="archive-get-all-nontitle-metadata">
  <antcall target="archive-get-metadata">
    <param name="output.filename" value="${nontitle.metadata}"/>
    <param name="parameters" value="${only.nontitles}"/>
  </antcall>
</target>
<!-- FIXME: due to encoding issues, transcripts whose filenames need to be
URL-encoded are not retrieved correctly by this task; for example, the
umlaut in title 00007 breaks retrieval. -->
<!-- Runs Saxon over ${title.metadata} with mergeMetadataAndData.xsl.
     The -o argument points at a dummy file inside ${transcripts}; judging by
     its name it only supplies a base URI for the stylesheet's output
     (TODO confirm against mergeMetadataAndData.xsl).
     The stylesheet receives transcript.location=${url.to.transcripts}/ as a
     parameter.
     NOTE(review): this target does not depend on
     archive-get-all-title-metadata, so ${title.metadata} must already exist. -->
<target name="archive-get-and-transform-data" depends="compile-and-jar-libraries">
  <mkdir dir="${transcripts}"/>
  <java fork="yes" classname="net.sf.saxon.Transform">
    <!-- Saxon plus Jskad.jar, in that order. -->
    <classpath>
      <path refid="saxon.classpath"/>
      <pathelement location="${vanillalib}/Jskad.jar"/>
    </classpath>
    <!-- source document -->
    <arg value="-s"/>
    <arg value="${title.metadata}"/>
    <!-- output location -->
    <arg value="-o"/>
    <arg value="${transcripts}/DUMMY_FOR_BASE_URI"/>
    <!-- stylesheet and its parameter -->
    <arg value="${styles}/mergeMetadataAndData.xsl"/>
    <arg value="transcript.location=${url.to.transcripts}/"/>
  </java>
</target>
<!-- solr tasks -->
<!-- Transforms every *.xml file in ${transcripts} with solarizeTitles.xsl,
     writing the results (with an .xml extension) into ${solr.titles}. -->
<target name="solr-prepare-titles">
  <mkdir dir="${solr.titles}"/>
  <xslt style="${styles}/solarizeTitles.xsl"
        basedir="${transcripts}"
        includes="*.xml"
        destdir="${solr.titles}"
        extension=".xml"/>
</target>
<!-- Transforms ${nontitle.metadata} with solarizeNonTitles.xsl into the
     single file ${solr.nontitle.file}.
     The output is a file, not a directory, so it is written without a
     trailing slash. -->
<target name="solr-prepare-nontitles">
  <mkdir dir="${solr.nontitle.dir}"/>
  <xslt in="${nontitle.metadata}"
        out="${solr.nontitle.file}"
        style="${styles}/solarizeNonTitles.xsl"/>
</target>
<!--
SimplePostTool: version 1.2
This is a simple command line tool for POSTing raw XML to a Solr
port. XML data can be read from files specified as commandline
args; as raw commandline arg strings; or via STDIN.
Examples:
java -Ddata=files -jar post.jar *.xml
java -Ddata=args -jar post.jar '<delete><id>42</id></delete>'
java -Ddata=stdin -jar post.jar < hd.xml
Other options controlled by System Properties include the Solr
URL to POST to, and whether a commit should be executed. These
are the defaults for all System Properties...
-Ddata=files
-Durl=http://localhost:8983/solr/update
-Dcommit=yes
-->
<!-- Posts every *.xml file in ${solr.titles} to ${url.to.solr.update} using
     post.jar, then commits.
     The file list is flattened into one space-separated string and handed to
     post.jar as its argument line.
     NOTE(review): because the list is split on spaces, any path containing a
     space will be broken into several arguments. -->
<target name="solr-post-and-commit-titles">
  <fileset id="solr.add.fileset" dir="${solr.titles}" includes="*.xml"/>
  <pathconvert property="list.of.files.to.post"
               refid="solr.add.fileset"
               pathsep=" "/>
  <java fork="true" jar="${ext}/apache/post.jar">
    <arg line="${list.of.files.to.post}"/>
    <jvmarg value="-Durl=${url.to.solr.update}"/>
    <jvmarg value="-Dcommit=yes"/>
  </java>
</target>
<!-- Posts the single file ${solr.nontitle.file} to ${url.to.solr.update}
     using post.jar, then commits.
     The file is passed with arg "file" rather than arg "line" so that it
     always reaches post.jar as exactly one argument, even when the path
     contains spaces. -->
<target name="solr-post-and-commit-nontitles">
  <java jar="${ext}/apache/post.jar" fork="true">
    <arg file="${solr.nontitle.file}"/>
    <jvmarg value="-Durl=${url.to.solr.update}"/>
    <jvmarg value="-Dcommit=yes"/>
  </java>
</target>
<!-- Has no tasks of its own: all work is done by the lucene-thdl-jar
     dependency. The only body is a disabled copy of
     solarizeConstantsForImport.xsl, kept below for reference. -->
<target name="solr-prepare-for-copy-to-solr-webapp"
        depends="lucene-thdl-jar">
  <!--<copy file="${styles}/solarizeConstantsForImport.xsl" todir="${copy.to.solr.webapp}"/>-->
</target>
<!-- insert whether or not media exists for segment -->
<!-- <java classname="net.sf.saxon.Transform" fork="yes">
<arg value="-o"/>
<arg value="${solarized.transcript.dir.final}"/>
<arg value="${solarized.transcript.dir.prefinal}"/>
<arg value="${stylesheet.dir}/movieInfoToSolrAdd.xsl"/>
<classpath>
<pathelement location="${bin.dir}"/>
<path refid="classpath"/>
</classpath>
</java>
<copy todir="${solarized.transcript.dir.final}">
<fileset dir="." includes="*.sh"/>
</copy>
</target>-->
<!--
Note on replacing documents: the example schema.xml specifies a "uniqueKey"
field called "id".
Whenever you POST instructions to Solr to add a document with the same value for
the uniqueKey as an existing document, it automatically replaces it for you.
However, for us, uniqueKeys combine the document id (which won't change) with
sentence ids (which may change), e.g. <field name="id">2291_d629e12</field>.
So, to replace, we'll find the XML document by name (document id) in the dbxml
database, then get all sentence ids for that document, then combine
docId_sentenceId and remove/replace from lucene.
-->
<!-- concordance program -->
<!-- Compiles the org.thdl.lucene sources from ${source} into
     ${lucene-thdl.bin}, with debug information, against lucene.classpath.
     Depends on the init target, which is not defined in this file
     (presumably it comes from the imported build.xml).
     The include pattern is "**/*.java" so that sources in sub-packages of
     org/thdl/lucene are selected too; in an Ant pattern "**" only spans
     directories when it forms a whole path segment, so "**.java" matches
     just the files directly inside org/thdl/lucene. -->
<target name="lucene-thdl-compile" depends="init">
  <mkdir dir="${lucene-thdl.bin}"/>
  <javac srcdir="${source}"
         destdir="${lucene-thdl.bin}"
         includes="org/thdl/lucene/**/*.java"
         debug="on">
    <classpath refid="lucene.classpath"/>
  </javac>
</target>
<!-- Default target of this build file.
     Packages the classes compiled by lucene-thdl-compile into
     ${vanillalib}/lucene-thdl.jar and copies that jar into
     ${copy.to.solr.webapp}. -->
<target name="lucene-thdl-jar" depends="lucene-thdl-compile">
  <jar basedir="${lucene-thdl.bin}/"
       destfile="${vanillalib}/lucene-thdl.jar"/>
  <copy todir="${copy.to.solr.webapp}"
        file="${vanillalib}/lucene-thdl.jar"/>
</target>
</project>