<?xml version="1.0" encoding="utf-8"?>
|
|
|
|
<!--
|
|
Some THDL titles are problematic:
|
|
* 691, A New Script, claims to have a transcript but there is no
|
|
transcript at the URL.
|
|
* 2069, Husked Barley, has a zero KB transcript.
|
|
* 2116, Nasal Congestion, also has a zero KB transcript.
|
|
Each of these is handled here as a title with metadata but no data,
|
|
just like any other title without a transcription.
|
|
-->
|
|
|
|
<project name="lucene-thdl" default="lucene-thdl-jar" basedir=".">
|
|
<import file="build.xml"/>
|
|
|
|
<!-- Load the ant-contrib task definitions from the jar shipped under ${ext}. -->
<taskdef resource="net/sf/antcontrib/antcontrib.properties"
         classpath="${ext}/to-be-installed-with-ant/ant-contrib.jar"/>
|
|
|
|
<!-- Remote endpoints: metadata service, its query-string filters, and the
     base URLs for transcripts and media. -->
<property name="get.avdb.metadata" value="http://thdl.org/avarch/mediaflowcat/title_metadata.php"/>
<property name="only.titles" value="excludeNonTitles=true"/>
<property name="only.nontitles" value="excludeTitles=true"/>
<property name="url.to.transcripts" value="http://www.thdl.org/avarch/transcripts"/>
<property name="url.to.media.high" value="http://www.thdl.org/media/high"/>
<property name="url.to.media.low" value="http://www.thdl.org/media/low"/>
<property name="url.to.media.audio" value="http://www.thdl.org/media/audio"/>

<!-- Local build output for the lucene-thdl classes.
     NOTE(review): ${bin} is not defined in this file; presumably it comes
     from the imported build.xml. -->
<property name="lucene-thdl.bin" location="${bin}/lucene-thdl"/>

<!-- Working directory tree for downloaded metadata, stylesheets, merged
     transcripts and Solr input. File-system paths are declared with
     "location" so that Ant resolves and normalizes them consistently. -->
<property name="archive" location="archive"/>
<property name="title.metadata" location="${archive}/title_metadata.xml"/>
<property name="nontitle.metadata" location="${archive}/nontitle_metadata.xml"/>
<property name="styles" location="${archive}/styles"/>
<property name="transcripts" location="${archive}/transcripts-with-metadata"/>
<property name="solr" location="${archive}/solr"/>
<property name="solr.titles" location="${solr}/titles"/>
<property name="solr.nontitle.dir" location="${solr}/nontitles"/>
<property name="solr.nontitle.file" location="${solr.nontitle.dir}/add_nontitles.xml"/>
<property name="copy.to.solr.webapp" location="${archive}/copy-to-solr-webapp"/>

<!-- Solr server that the solr-post-and-commit targets post to. -->
<property name="url.to.solr" value="http://localhost:8983/solr"/>
<property name="url.to.solr.update" value="${url.to.solr}/update"/>
|
|
|
|
<!-- All jars directly under ${ext}/apache; used as the javac classpath by
     lucene-thdl-compile. -->
<path id="lucene.classpath">
  <fileset id="lucene.extensions" dir="${ext}/apache" includes="*.jar"/>
</path>
|
|
|
|
<!-- All jars directly under ${ext}/saxon; used to run net.sf.saxon.Transform
     in archive-get-and-transform-data. -->
<path id="saxon.classpath">
  <fileset id="saxon.extensions" dir="${ext}/saxon" includes="*.jar"/>
</path>
|
|
|
|
<target name="compile-and-jar-libraries">
  <!-- Delegates to the jskad-dist target with an empty jar suffix.
       NOTE(review): jskad-dist is not defined in this file; presumably it
       comes from the imported build.xml. -->
  <antcall target="jskad-dist">
    <param value="" name="my.jar.suffix"/>
  </antcall>
</target>
|
|
|
|
<!-- archive tasks -->
|
|
<target name="archive-get-metadata">
  <!-- Helper target: downloads one metadata document from the AV database.
       It is meant to be run through antcall (see
       archive-get-all-title-metadata and archive-get-all-nontitle-metadata)
       with two properties supplied by the caller:
         parameters       query string appended to ${get.avdb.metadata}
         output.filename  file the response is written to -->

  <!-- Fail fast when run directly without the caller-supplied properties;
       otherwise the unexpanded property references would be used literally
       as the query string and as the destination file name. -->
  <fail unless="parameters"
        message="archive-get-metadata requires the 'parameters' property; run it through one of the archive-get-all-* targets."/>
  <fail unless="output.filename"
        message="archive-get-metadata requires the 'output.filename' property; run it through one of the archive-get-all-* targets."/>

  <mkdir dir="${archive}"/>
  <get src="${get.avdb.metadata}?${parameters}" dest="${output.filename}" verbose="on"/>
</target>
|
|
|
|
<target name="archive-get-all-title-metadata">
  <!-- Fetch the metadata for titles only and store it in ${title.metadata}. -->
  <antcall target="archive-get-metadata">
    <param value="${only.titles}" name="parameters"/>
    <param value="${title.metadata}" name="output.filename"/>
  </antcall>
</target>
|
|
|
|
<target name="archive-get-all-nontitle-metadata">
  <!-- Fetch the metadata for non-titles only and store it in
       ${nontitle.metadata}. -->
  <antcall target="archive-get-metadata">
    <param value="${only.nontitles}" name="parameters"/>
    <param value="${nontitle.metadata}" name="output.filename"/>
  </antcall>
</target>
|
|
|
|
<!-- FIXME: due to encoding issues, transcripts whose filenames need to be
     URL-encoded are not retrieved correctly by this task; for example, the
     umlaut in title 00007 breaks retrieval. -->
|
|
<target name="archive-get-and-transform-data" depends="compile-and-jar-libraries">
  <!-- Runs Saxon over the downloaded title metadata with the
       mergeMetadataAndData.xsl stylesheet, passing the transcript base URL
       as the stylesheet parameter transcript.location. -->
  <mkdir dir="${transcripts}"/>

  <java fork="yes" classname="net.sf.saxon.Transform">
    <classpath>
      <path refid="saxon.classpath"/>
      <pathelement location="${vanillalib}/Jskad.jar"/>
    </classpath>

    <!-- source document -->
    <arg value="-s"/>
    <arg value="${title.metadata}"/>

    <!-- Output file; the name suggests it only serves as a base URI for the
         files the stylesheet writes (TODO confirm against the stylesheet). -->
    <arg value="-o"/>
    <arg value="${transcripts}/DUMMY_FOR_BASE_URI"/>

    <!-- stylesheet, followed by its parameter -->
    <arg value="${styles}/mergeMetadataAndData.xsl"/>
    <arg value="transcript.location=${url.to.transcripts}/"/>
  </java>
</target>
|
|
|
|
<!-- solr tasks -->
|
|
<target name="solr-prepare-titles">
  <!-- Transforms every *.xml file in ${transcripts} with solarizeTitles.xsl
       and writes the results, also as .xml, into ${solr.titles}. -->
  <mkdir dir="${solr.titles}"/>
  <xslt style="${styles}/solarizeTitles.xsl"
        basedir="${transcripts}"
        includes="*.xml"
        destdir="${solr.titles}"
        extension=".xml"/>
</target>
|
|
|
|
<target name="solr-prepare-nontitles">
  <!-- Transforms the non-title metadata with solarizeNonTitles.xsl into the
       single file ${solr.nontitle.file}. -->
  <mkdir dir="${solr.nontitle.dir}"/>
  <!-- "out" names a file, so it carries no trailing slash. -->
  <xslt in="${nontitle.metadata}"
        out="${solr.nontitle.file}"
        style="${styles}/solarizeNonTitles.xsl"/>
</target>
|
|
|
|
<!--
|
|
SimplePostTool: version 1.2
|
|
This is a simple command line tool for POSTing raw XML to a Solr
|
|
port. XML data can be read from files specified as commandline
|
|
args; as raw commandline arg strings; or via STDIN.
|
|
Examples:
|
|
java -Ddata=files -jar post.jar *.xml
|
|
java -Ddata=args -jar post.jar '<delete><id>42</id></delete>'
|
|
java -Ddata=stdin -jar post.jar < hd.xml
|
|
Other options controlled by System Properties include the Solr
|
|
URL to POST to, and whether a commit should be executed. These
|
|
are the defaults for all System Properties...
|
|
-Ddata=files
|
|
-Durl=http://localhost:8983/solr/update
|
|
-Dcommit=yes
|
|
-->
|
|
<target name="solr-post-and-commit-titles">
  <!-- Posts every *.xml file in ${solr.titles} to ${url.to.solr.update}
       using post.jar, then commits. -->
  <fileset id="solr.add.fileset" dir="${solr.titles}" includes="*.xml"/>

  <!-- Build a space-separated list in which each path is wrapped in double
       quotes, so that paths containing whitespace survive the tokenizing
       done by <arg line>. An empty fileset still yields an empty list.
       The nested mapper requires Ant 1.6.2 or later. -->
  <pathconvert property="list.of.files.to.post" refid="solr.add.fileset" pathsep=" ">
    <mapper type="glob" from="*" to="&quot;*&quot;"/>
  </pathconvert>

  <!-- NOTE(review): failonerror is not set, so a failing post does not fail
       the build; confirm whether that is intended. -->
  <java jar="${ext}/apache/post.jar" fork="true">
    <arg line="${list.of.files.to.post}"/>
    <jvmarg value="-Durl=${url.to.solr.update}"/>
    <jvmarg value="-Dcommit=yes"/>
  </java>
</target>
|
|
|
|
<target name="solr-post-and-commit-nontitles">
  <!-- Posts the single file ${solr.nontitle.file} to ${url.to.solr.update}
       using post.jar, then commits. -->
  <java jar="${ext}/apache/post.jar" fork="true">
    <!-- Pass the path as one argument; <arg line> would split it on
         whitespace and break for directories whose names contain spaces. -->
    <arg file="${solr.nontitle.file}"/>
    <jvmarg value="-Durl=${url.to.solr.update}"/>
    <jvmarg value="-Dcommit=yes"/>
  </java>
</target>
|
|
|
|
<target depends="lucene-thdl-jar" name="solr-prepare-for-copy-to-solr-webapp">
  <!-- Currently has no tasks of its own: it only triggers lucene-thdl-jar,
       which copies the jar into ${copy.to.solr.webapp}. The stylesheet copy
       below is disabled. -->
  <!--<copy file="${styles}/solarizeConstantsForImport.xsl" todir="${copy.to.solr.webapp}"/>-->
</target>
|
|
|
|
<!-- insert whether or not media exists for segment -->
|
|
<!-- <java classname="net.sf.saxon.Transform" fork="yes">
|
|
<arg value="-o"/>
|
|
<arg value="${solarized.transcript.dir.final}"/>
|
|
<arg value="${solarized.transcript.dir.prefinal}"/>
|
|
<arg value="${stylesheet.dir}/movieInfoToSolrAdd.xsl"/>
|
|
<classpath>
|
|
<pathelement location="${bin.dir}"/>
|
|
<path refid="classpath"/>
|
|
</classpath>
|
|
</java>
|
|
|
|
<copy todir="${solarized.transcript.dir.final}">
|
|
<fileset dir="." includes="*.sh"/>
|
|
</copy>
|
|
</target>-->
|
|
|
|
<!--
|
|
This is because the example schema.xml specifies a "uniqueKey" field called "id".
|
|
Whenever you POST instructions to Solr to add a document with the same value for
|
|
the uniqueKey as an existing document, it automatically replaces it for you.
|
|
|
|
However, for us, uniqueKeys combine the document id (which won't change) with
|
|
sentence ids (which may change), e.g. <field name="id">2291_d629e12</field>.
|
|
|
|
So, to replace, we'll find XML document by name (document id) in dbxml database,
|
|
then get all sentence ids for that document, then combine docId_sentenceId and
|
|
remove/replace from lucene.
|
|
-->
|
|
|
|
<!-- concordance program -->
|
|
<target name="lucene-thdl-compile" depends="init">
  <!-- Compiles the org.thdl.lucene sources from ${source} into
       ${lucene-thdl.bin}, using the jars in lucene.classpath. -->
  <mkdir dir="${lucene-thdl.bin}"/>
  <!-- In Ant patterns "**" is recursive only when it is a whole path
       segment; "**.java" behaves like "*.java" and selects just the top
       directory. "**/*.java" also selects sources in sub-packages. -->
  <javac srcdir="${source}"
         destdir="${lucene-thdl.bin}"
         includes="org/thdl/lucene/**/*.java"
         debug="on">
    <classpath refid="lucene.classpath"/>
  </javac>
</target>
|
|
|
|
<target depends="lucene-thdl-compile" name="lucene-thdl-jar">
  <!-- Default target of this project: packages the compiled classes as
       lucene-thdl.jar in ${vanillalib}, then copies that jar into
       ${copy.to.solr.webapp}. -->
  <jar basedir="${lucene-thdl.bin}/" destfile="${vanillalib}/lucene-thdl.jar"/>
  <copy todir="${copy.to.solr.webapp}" file="${vanillalib}/lucene-thdl.jar"/>
</target>
|
|
|
|
</project>
|