Jskad/lucene-thdl-build.xml

248 lines
9.8 KiB
XML
Raw Normal View History

<?xml version="1.0" encoding="utf-8"?>
<!--
Some THDL titles are problematic:
* 691, A New Script, claims to have a transcript but there is no
transcript at the URL.
* 2069, Husked Barley, has a zero KB transcript.
* 2116, Nasal Congestion, also has a zero KB transcript.
Each of these is handled here as a title with metadata but no data,
just like any other title without a transcription.
-->
<project name="lucene-thdl" default="lucene-thdl-jar" basedir=".">
<import file="build.xml"/>
<!-- Registers the ant-contrib tasks (for example the foreach task used by
     archive-get-transcripts). ${ext} is not defined in this file; presumably
     it comes from the imported build.xml (TODO confirm). -->
<taskdef resource="net/sf/antcontrib/antcontrib.properties">
  <classpath>
    <pathelement location="${ext}/to-be-installed-with-ant/ant-contrib.jar"/>
  </classpath>
</taskdef>
<!-- 2007-05-15 12:28:43 +00:00 -->
<!-- Remote THDL endpoints: the title metadata service, then the base URLs
     for transcripts and for media. -->
<property name="get.title.metadata" value="http://thdl.org/avarch/mediaflowcat/title_metadata.php"/>
<property name="url.to.transcripts" value="http://www.thdl.org/avarch/transcripts"/>
<property name="url.to.media.high" value="http://www.thdl.org/media/high"/>
<property name="url.to.media.low" value="http://www.thdl.org/media/low"/>
<property name="url.to.media.audio" value="http://www.thdl.org/media/audio"/>
<!-- Output directory for the compiled org.thdl.lucene classes. ${bin} is not
     defined in this file; presumably it comes from the imported build.xml
     (TODO confirm). -->
<property name="lucene-thdl.bin" location="${bin}/lucene-thdl"/>
<!-- Local working tree. The three directories below are derived from
     ${archive}, so it must be declared before them. -->
<property name="archive" location="archive"/>
<property name="styles" location="${archive}/styles"/>
<property name="transcripts" location="${archive}/transcripts-with-metadata"/>
<property name="solr" location="${archive}/solr"/>
<!-- All jars found directly under ${ext}/apache. -->
<path id="lucene.classpath">
  <fileset id="lucene.extensions" dir="${ext}/apache" includes="*.jar"/>
</path>
<!-- All jars found directly under ${ext}/saxon. -->
<path id="saxon.classpath">
  <fileset id="saxon.extensions" dir="${ext}/saxon" includes="*.jar"/>
</path>
<!-- Invokes the jskad-dist target with my.jar.suffix set to the empty string.
     jskad-dist is not defined in this file; presumably it lives in the
     imported build.xml (TODO confirm). -->
<target name="compile-and-jar-libraries">
  <antcall target="jskad-dist">
    <param name="my.jar.suffix" value=""/>
  </antcall>
</target>
<!-- 2007-05-15 12:47:43 +00:00 -->
<!-- 2007-05-15 12:28:43 +00:00 -->
<!-- archive tasks -->
<!-- Creates ${archive} if needed and downloads the title metadata document
     into ${archive}/title_metadata.xml. -->
<target name="archive-get-metadata">
  <mkdir dir="${archive}"/>
  <get dest="${archive}/title_metadata.xml"
       src="${get.title.metadata}"
       verbose="on"/>
</target>
<!-- FIXME: because of encoding issues, this task does not correctly retrieve any
transcript whose filename needs to be URL-encoded; for example, the umlaut in
title 00007 breaks retrieval. -->
<!-- Runs Saxon in a forked JVM over ${archive}/title_metadata.xml using
     mergeMetadataAndData.xsl, passing the transcript base URL as the
     transcript.location stylesheet parameter.
     The -o argument names a placeholder file inside ${transcripts}; judging
     by its name it only supplies a base URI for the stylesheet output
     (assumption, confirm against the stylesheet).
     Jskad.jar is placed on the classpath next to Saxon, which is why this
     target depends on compile-and-jar-libraries. -->
<target name="archive-get-and-transform-data" depends="compile-and-jar-libraries">
  <mkdir dir="${transcripts}"/>
  <java fork="yes" classname="net.sf.saxon.Transform">
    <!-- source document -->
    <arg value="-s"/>
    <arg value="${archive}/title_metadata.xml"/>
    <!-- principal output -->
    <arg value="-o"/>
    <arg value="${transcripts}/DUMMY_FOR_BASE_URI"/>
    <!-- stylesheet followed by its parameter -->
    <arg value="${styles}/mergeMetadataAndData.xsl"/>
    <arg value="transcript.location=${url.to.transcripts}/"/>
    <classpath>
      <path refid="saxon.classpath"/>
      <pathelement location="${vanillalib}/Jskad.jar"/>
    </classpath>
  </java>
</target>
<!-- solr tasks -->
<!-- Applies solarizeTranscript.xsl to the files under ${transcripts} and
     writes the results, with an .xml extension, into ${solr}. -->
<target name="solr-prepare-transcripts">
  <mkdir dir="${solr}"/>
  <xslt style="${styles}/solarizeTranscript.xsl"
        basedir="${transcripts}"
        destdir="${solr}"
        extension=".xml"/>
</target>
<!--
SimplePostTool: version 1.2
This is a simple command line tool for POSTing raw XML to a Solr
port. XML data can be read from files specified as commandline
args; as raw commandline arg strings; or via STDIN.
Examples:
java -Ddata=files -jar post.jar *.xml
java -Ddata=args -jar post.jar '<delete><id>42</id></delete>'
java -Ddata=stdin -jar post.jar < hd.xml
Other options controlled by System Properties include the Solr
URL to POST to, and whether a commit should be executed. These
are the defaults for all System Properties...
-Ddata=files
-Durl=http://localhost:8983/solr/update
-Dcommit=yes
-->
<!-- NOTE(review): as written, this target forks net.sf.saxon.Transform with no
     arguments and with only lucene.classpath on the classpath, so it does not
     post anything to Solr. It looks like an unfinished copy of a Saxon call;
     the target name suggests it was meant to run the Solr SimplePostTool.
     Confirm the intent and complete it before relying on this target. -->
<target name="solr-post-and-commit-transcripts">
  <java fork="yes" classname="net.sf.saxon.Transform">
    <classpath>
      <path refid="lucene.classpath"/>
    </classpath>
  </java>
</target>
<!-- Transforms ${archive}/title_metadata.xml into ${archive}/titles_as_list.txt
     with get-list-of-transcripts.xsl, using the built-in xslt task. -->
<target name="archive-get-list-of-transcripts">
  <!-- Alternative that invokes Saxon directly, kept for reference:
  <java classname="net.sf.saxon.Transform" fork="yes">
    <arg value="-s"/>
    <arg value="${archive}/title_metadata.xml"/>
    <arg value="-o"/>
    <arg value="${archive}/titles_as_list.txt"/>
    <arg value="${styles}/get-list-of-transcripts.xsl"/>
    <classpath>
      <path refid="saxon.classpath"/>
    </classpath>
  </java>-->
  <xslt style="${styles}/get-list-of-transcripts.xsl"
        in="${archive}/title_metadata.xml"
        out="${archive}/titles_as_list.txt"/>
</target>
<!-- Loads a space-delimited list of transcript file names and calls
     archive-get-one-transcript once per entry, passing the entry as the
     filename parameter.
     NOTE(review): the list is read from titles_as_list2.txt, whereas
     archive-get-list-of-transcripts writes titles_as_list.txt; confirm where
     the "2" file comes from.
     ${wylie} is not defined in this file; presumably it comes from the
     imported build.xml (TODO confirm). -->
<target name="archive-get-transcripts">
  <mkdir dir="${wylie}"/>
  <loadfile srcFile="${archive}/titles_as_list2.txt" property="transcript-list"/> <!-- encoding="UTF-8" -->
  <foreach target="archive-get-one-transcript"
           list="${transcript-list}"
           delimiter=" "
           param="filename"/>
  <!--<foreach list="${transcript-list}" delimiter=" " param="id.plus.filename" target="archive-get-one-transcript"/>-->
</target>
<!-- Downloads one transcript, named by the filename property, from
     ${url.to.transcripts} into ${wylie}. Download errors are ignored, so a
     missing transcript does not stop the build. -->
<target name="archive-get-one-transcript">
  <!--<propertyregex property="filename" input="${id.plus.filename}" regexp=".+/(.+)" select="\1"/>-->
  <get dest="${wylie}/${filename}"
       src="${url.to.transcripts}/${filename}"
       ignoreerrors="true"/>
</target>
<!-- solr stuff -->
<!-- <target name="solr-1:clean-local" depends="clean">
<delete dir="${solarized.transcript.dir.prefinal}"/>
<delete dir="${solarized.transcript.dir.final}"/>
<mkdir dir="${solarized.transcript.dir.prefinal}"/>
<mkdir dir="${solarized.transcript.dir.final}"/>
</target>-->
<!-- <target name="solr-2:prepare-documents" depends="init, dbxml-4:ngram-magic">-->
<!-- create xml data file used to assign tags to mono tsheg bars -->
<!--
<java classname="net.sf.saxon.Transform" fork="yes">
<arg value="-o"/>
<arg value="${build.dir}/tshegbartags.xml"/>
<arg value="${lexicon.dir}/lhasa-verbs.xml"/>
<arg value="${stylesheet.dir}/prepareTshegBarTagger.xsl"/>
<classpath>
<pathelement location="${bin.dir}"/>
<path refid="classpath"/>
</classpath>
</java>
-->
<!-- create xml file used to assign synonyms to certain mono tsheg bars -->
<!--
<java classname="net.sf.saxon.Transform" fork="yes">
<arg value="-o"/>
<arg value="${build.dir}/synonyms.xml"/>
<arg value="${lexicon.dir}/lhasa-verbs.xml"/>
<arg value="${stylesheet.dir}/makeSynonymFile.xsl"/>
<classpath>
<pathelement location="${bin.dir}"/>
<path refid="classpath"/>
</classpath>
</java>
-->
<!-- <loadfile property="xquery" srcfile="${stylesheet.dir}/solarizeTranscriptDatabase.xql" encoding="UTF-8"/>
<java classname="org.thdl.dbxml.QueryTools" fork="yes">
<arg value="${dbxml.environment.dir}"/>
<arg value="${dbxml.container}"/>
<arg value="${xquery}"/>
<arg value="${solarized.transcript.dir.prefinal}"/>
<classpath>
<pathelement location="${bin.dir}"/>
<path refid="classpath"/>
</classpath>
<jvmarg value="-Djava.library.path=${dbxml.lib}"/>
</java>-->
<!-- insert whether or not media exists for segment -->
<!-- <java classname="net.sf.saxon.Transform" fork="yes">
<arg value="-o"/>
<arg value="${solarized.transcript.dir.final}"/>
<arg value="${solarized.transcript.dir.prefinal}"/>
<arg value="${stylesheet.dir}/movieInfoToSolrAdd.xsl"/>
<classpath>
<pathelement location="${bin.dir}"/>
<path refid="classpath"/>
</classpath>
</java>
<copy todir="${solarized.transcript.dir.final}">
<fileset dir="." includes="*.sh"/>
</copy>
</target>-->
<!--
This is because the example schema.xml specifies a "uniqueKey" field called "id".
Whenever you POST instructions to Solr to add a document with the same value for
the uniqueKey as an existing document, it automatically replaces it for you.
However, for us, uniqueKeys combine the document id (which won't change) with
sentence ids (which may change), e.g. <field name="id">2291_d629e12</field>.
So, to replace, we'll find XML document by name (document id) in dbxml database,
then get all sentence ids for that document, then combine docId_sentenceId and
remove/replace from lucene.
-->
<!--
<target name="solr-3:commit-documents">
<exec executable="sh" dir="${solarized.transcript.dir.final}">
<arg value="post.sh"/>
<arg value="*.xml"/>
</exec>
</target>
<target name="solr-4:delete-documents" depends="solr-1:clean-local">
<exec executable="curl">
<arg value="${solr.update}"/>
<arg value="-data-binary"/> should have double dash at beginning
<arg value="&lt;delete&gt;&lt;query&gt;id:[* TO *]&lt;/query&gt;&lt;/delete&gt;"/>
</exec>
<exec executable="curl">
<arg value="${solr.update}"/>
<arg value="-data-binary"/> double dash again!
<arg value="&lt;commit/&gt;"/>
</exec>
</target>
-->
<!-- concordance program -->
<!-- Compiles the org.thdl.lucene sources from ${source} into ${lucene-thdl.bin},
     with debug information, against the jars in lucene.classpath.
     The init target and ${source} are not defined in this file; presumably
     they come from the imported build.xml (TODO confirm). -->
<target name="lucene-thdl-compile" depends="init">
  <mkdir dir="${lucene-thdl.bin}"/>
  <!-- In Ant patterns "**" spans directories only when it forms a whole path
       segment. The former pattern "org/thdl/lucene/**.java" therefore behaved
       like "org/thdl/lucene/*.java" and selected only the top-level package.
       "**/*.java" also selects sources in any sub-packages. -->
  <javac srcdir="${source}" destdir="${lucene-thdl.bin}" includes="org/thdl/lucene/**/*.java" debug="on">
    <classpath refid="lucene.classpath"/>
  </javac>
</target>
<!-- Default target of this project: packages everything under
     ${lucene-thdl.bin} into ${vanillalib}/lucene-thdl.jar after compiling. -->
<target name="lucene-thdl-jar" depends="lucene-thdl-compile">
  <jar basedir="${lucene-thdl.bin}/"
       destfile="${vanillalib}/lucene-thdl.jar"/>
</target>
</project>