streamlined lucene build file and added task to prepare transcript data and metadata for posting to a solr index

This commit is contained in:
eg3p 2007-05-22 16:07:08 +00:00
parent 73f429f9b4
commit a53bc5b126
3 changed files with 151 additions and 62 deletions

View file

@ -9,7 +9,7 @@
<xsl:import href="qdToUnicode.xsl"/> <xsl:import href="qdToUnicode.xsl"/>
<xsl:output method="xml" encoding="UTF-8" indent="yes" name="unicode.out"/> <xsl:output method="xml" encoding="UTF-8" indent="yes" name="unicode.transcript.with.metadata"/>
<xsl:param name="transcript.location" select="''"/> <xsl:param name="transcript.location" select="''"/>
<xsl:param name="transform.to.dir" select="''"/> <xsl:param name="transform.to.dir" select="''"/>
@ -18,7 +18,7 @@
<xsl:for-each select="//transcript"> <xsl:for-each select="//transcript">
<xsl:variable name="filename" select="."/> <xsl:variable name="filename" select="."/>
<!-- <xsl:variable name="filename" select="encoder:encode(.,'UTF-8')"/> --> <!-- <xsl:variable name="filename" select="encoder:encode(.,'UTF-8')"/> -->
<xsl:result-document href="{$transform.to.dir}/{$filename}" format="unicode.out"> <xsl:result-document href="{$filename}" format="unicode.transcript.with.metadata">
<xsl:element name="TITLE"> <xsl:element name="TITLE">
<xsl:attribute name="id"> <xsl:attribute name="id">
<xsl:value-of select="../@id"/> <xsl:value-of select="../@id"/>

View file

@ -0,0 +1,103 @@
<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet version="1.0"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<!-- Serialize the result as indented UTF-8 XML. -->
<xsl:output method="xml" encoding="UTF-8" indent="yes"/>
<!-- Value written to the thdl_type field of the per-title document. -->
<xsl:param name="TITLE_TYPE" select="'AVDB_TITLE'"/>
<!-- Value written to the thdl_type field of each per-sentence (S) document. -->
<xsl:param name="TRANSCRIPT_FRAGMENT_TYPE" select="'TRANSCRIPT_FRAGMENT'"/>
<!-- Entry point: process the top-level TITLE element of the input document. -->
<xsl:template match="/">
  <xsl:apply-templates select="child::TITLE"/>
</xsl:template>
<!--
  Wraps everything produced for one TITLE in a single <add> element.
  The title's id attribute is passed down as the title.id parameter, first to
  the METADATA template and then to the template for each TEXT/S element.
-->
<xsl:template match="TITLE">
  <add>
    <!-- The with-param selects are evaluated with TITLE as the context node. -->
    <xsl:apply-templates select="METADATA">
      <xsl:with-param name="title.id" select="@id"/>
    </xsl:apply-templates>
    <xsl:apply-templates select="TEXT/S">
      <xsl:with-param name="title.id" select="@id"/>
    </xsl:apply-templates>
  </add>
</xsl:template>
<!--
  Emits one <doc> describing the title itself.
  Parameter title.id: id of the enclosing TITLE, used as the document id
  (defaults to the empty string when not supplied).
  Each field is filled from the like-named child element of METADATA; the
  thdl_type field is set from the stylesheet parameter TITLE_TYPE.
-->
<xsl:template match="METADATA">
  <xsl:param name="title.id" select="''"/>
  <doc>
    <field name="id"><xsl:value-of select="$title.id"/></field>
    <field name="thdl_type"><xsl:value-of select="$TITLE_TYPE"/></field>
    <!-- speechType is emitted exactly once; a second copy would duplicate the value in the document. -->
    <field name="speechType"><xsl:value-of select="speechType"/></field>
    <field name="language"><xsl:value-of select="language"/></field>
    <field name="culturalRegion"><xsl:value-of select="culturalRegion"/></field>
    <!-- "name" selects only the direct child of METADATA, not the name elements nested in video. -->
    <field name="title_en"><xsl:value-of select="name"/></field>
    <field name="caption_en"><xsl:value-of select="caption"/></field>
    <field name="transcriptFilename"><xsl:value-of select="transcript"/></field>
    <!-- TODO: make a plan for indexing the video children, including preparing
         their numeric values so that ranges can be searched. -->
  </doc>
</xsl:template>
<!-- Here's what a chunk of metadata looks like:
<speechType>Conversation</speechType>
<language>Tibetan</language>
<culturalRegion>dbus</culturalRegion>
<name>A Lucky Dream: &lt;i&gt;Three's Company #01&lt;/i&gt;</name>
<caption>CAPTION HEAD</caption>
<transcript id="2502">00008_01-a-lucky-dream_09.xml</transcript>
<video id="6613">
<mediaDescription>Audio</mediaDescription>
<connectionSpeed>medium</connectionSpeed>
<size>3225818</size>
<duration>00:02:14</duration>
<name>00008_lucky-dream.mp3</name>
</video>
<video id="6612">
<mediaDescription>Video</mediaDescription>
<connectionSpeed>medium</connectionSpeed>
<size>5009798</size>
<duration>00:11:11</duration>
<name>00008_lucky-dream.mp4</name>
</video>
<video id="6611">
<mediaDescription>Video</mediaDescription>
<connectionSpeed>fast</connectionSpeed>
<size>13556712</size>
<duration>00:11:11</duration>
<name>00008_lucky-dream.mp4</name>
</video>
-->
<!--
  Emits one <doc> per S element of the transcript.
  Parameter title.id: id of the enclosing TITLE; the document id is built as
  title.id, an underscore, and the id attribute of the S element.
  The start_f and end_f fields are written only when the corresponding AUDIO
  attribute is present.
-->
<xsl:template match="S">
  <xsl:param name="title.id" select="''"/>
  <xsl:variable name="audio.start" select="AUDIO/@start"/>
  <xsl:variable name="audio.end" select="AUDIO/@end"/>
  <doc>
    <field name="id">
      <xsl:value-of select="$title.id"/>
      <xsl:text>_</xsl:text>
      <xsl:value-of select="@id"/>
    </field>
    <field name="thdl_type">
      <xsl:value-of select="$TRANSCRIPT_FRAGMENT_TYPE"/>
    </field>
    <!-- Forms and translations, keyed by their xml:lang value. -->
    <field name="form_bo">
      <xsl:value-of select="FORM[@xml:lang='bo']"/>
    </field>
    <field name="form_bo-Latn">
      <xsl:value-of select="FORM[@xml:lang='bo-Latn']"/>
    </field>
    <field name="transl_en">
      <xsl:value-of select="TRANSL[@xml:lang='en']"/>
    </field>
    <field name="transl_zh">
      <xsl:value-of select="TRANSL[@xml:lang='zh']"/>
    </field>
    <!-- Timing fields are optional. -->
    <xsl:if test="$audio.start">
      <field name="start_f">
        <xsl:value-of select="$audio.start"/>
      </field>
    </xsl:if>
    <xsl:if test="$audio.end">
      <field name="end_f">
        <xsl:value-of select="$audio.end"/>
      </field>
    </xsl:if>
  </doc>
</xsl:template>
<!-- Here's what a chunk of transcript looks like:
<S who="N400005" id="d148e29">
<FORM xml:lang="bo">དེ་རིང་གནམ་ཡག་པོ་ར་ཅིག་༿འདྲ་ཅིག༾མི་འདུག་གས།</FORM>
<FORM xml:lang="bo-Latn">de ring gnam yag po ra cig {'dra cig}mi 'dug gas/</FORM>
<TRANSL xml:lang="en">Isn't it a nice day today?</TRANSL>
<TRANSL xml:lang="zh">今天的天气多好啊, 是吧!</TRANSL>
<AUDIO end="8.925999997392298" start="7.63"/>
</S>
-->
</xsl:stylesheet>

View file

@ -1,5 +1,15 @@
<?xml version="1.0" encoding="utf-8"?> <?xml version="1.0" encoding="utf-8"?>
<!--
Some THDL titles are problematic:
* 691, A New Script, claims to have a transcript but there is no
transcript at the URL.
* 2069, Husked Barley, has a zero KB transcript.
* 2116, Nasal Congestion, also has a zero KB transcript.
Each of these is handled here as a title with metadata but no data,
just like any other title without a transcription.
-->
<project name="lucene-thdl" default="lucene-thdl-jar" basedir="."> <project name="lucene-thdl" default="lucene-thdl-jar" basedir=".">
<import file="build.xml"/> <import file="build.xml"/>
@ -9,17 +19,16 @@
</classpath> </classpath>
</taskdef> </taskdef>
<property name="lucene-thdl.bin" location="${bin}/lucene-thdl"/>
<property name="archive" location="archive"/>
<property name="wylie" location="${archive}/wylie"/>
<property name="unicode" location="${archive}/unicode"/>
<property name="solr" location="${archive}/solr"/>
<property name="styles" location="${archive}/styles"/>
<property name="get.title.metadata" value="http://thdl.org/avarch/mediaflowcat/title_metadata.php"/> <property name="get.title.metadata" value="http://thdl.org/avarch/mediaflowcat/title_metadata.php"/>
<property name="url.to.transcripts" value="http://www.thdl.org/avarch/transcripts"/> <property name="url.to.transcripts" value="http://www.thdl.org/avarch/transcripts"/>
<property name="url.to.media.high" value="http://www.thdl.org/media/high"/> <property name="url.to.media.high" value="http://www.thdl.org/media/high"/>
<property name="url.to.media.low" value="http://www.thdl.org/media/low"/> <property name="url.to.media.low" value="http://www.thdl.org/media/low"/>
<property name="url.to.media.audio" value="http://www.thdl.org/media/audio"/> <property name="url.to.media.audio" value="http://www.thdl.org/media/audio"/>
<property name="lucene-thdl.bin" location="${bin}/lucene-thdl"/>
<property name="archive" location="archive"/>
<property name="styles" location="${archive}/styles"/>
<property name="transcripts" location="${archive}/transcripts-with-metadata"/>
<property name="solr" location="${archive}/solr"/>
<path id="lucene.classpath"> <path id="lucene.classpath">
<fileset id="lucene.extensions" dir="${ext}/apache"> <fileset id="lucene.extensions" dir="${ext}/apache">
@ -33,13 +42,44 @@
</fileset> </fileset>
</path> </path>
<!-- Runs the jskad-dist target with my.jar.suffix set to the empty string. -->
<target name="compile-and-jar-libraries">
  <antcall target="jskad-dist">
    <param name="my.jar.suffix"
           value=""/>
  </antcall>
</target>
<!-- archive tasks --> <!-- archive tasks -->
<target name="archive-get-metadata"> <target name="archive-get-metadata">
<mkdir dir="${archive}"/> <mkdir dir="${archive}"/>
<get src="${get.title.metadata}" dest="${archive}/title_metadata.xml" verbose="on"/> <get src="${get.title.metadata}" dest="${archive}/title_metadata.xml" verbose="on"/>
</target> </target>
<!-- FIXME: due to encoding issues, none of the transcripts whose filenames need to be
URL-encoded are retrieved correctly by this task; for example, the umlaut in title 00007
breaks retrieval. -->
<!--
  Runs Saxon over ${archive}/title_metadata.xml with mergeMetadataAndData.xsl,
  passing the transcript URL prefix as the transcript.location parameter.
  NOTE(review): the -o file name suggests it exists only to give the transform
  a base output URI inside ${transcripts}; confirm against the stylesheet.
-->
<target name="archive-get-and-transform-data" depends="compile-and-jar-libraries">
  <mkdir dir="${transcripts}"/>
  <java classname="net.sf.saxon.Transform" fork="yes">
    <classpath>
      <path refid="saxon.classpath"/>
      <pathelement location="${vanillalib}/Jskad.jar"/>
    </classpath>
    <!-- source document -->
    <arg value="-s"/>
    <arg value="${archive}/title_metadata.xml"/>
    <!-- principal output file -->
    <arg value="-o"/>
    <arg value="${transcripts}/DUMMY_FOR_BASE_URI"/>
    <!-- stylesheet, followed by its parameter -->
    <arg value="${styles}/mergeMetadataAndData.xsl"/>
    <arg value="transcript.location=${url.to.transcripts}/"/>
  </java>
</target>
<!-- solr tasks -->
<!--
  Transforms the files under ${transcripts} with solarizeTranscript.xsl and
  writes the results, with an .xml extension, into ${solr}.
-->
<target name="solr-prepare-transcripts">
  <mkdir dir="${solr}"/>
  <xslt basedir="${transcripts}"
        destdir="${solr}"
        extension=".xml"
        style="${styles}/solarizeTranscript.xsl"/>
</target>
<target name="archive-get-list-of-transcripts"> <target name="archive-get-list-of-transcripts">
<!-- <java classname="net.sf.saxon.Transform" fork="yes"> <!-- <java classname="net.sf.saxon.Transform" fork="yes">
<arg value="-s"/> <arg value="-s"/>
@ -52,7 +92,6 @@
</classpath> </classpath>
</java>--> </java>-->
<xslt in="${archive}/title_metadata.xml" out="${archive}/titles_as_list.txt" style="${styles}/get-list-of-transcripts.xsl"/> <xslt in="${archive}/title_metadata.xml" out="${archive}/titles_as_list.txt" style="${styles}/get-list-of-transcripts.xsl"/>
<xslt in="${archive}/title_metadata.xml" out="${archive}/titles_as_list2.txt" style="${styles}/get-list-of-transcripts2.xsl"/>
</target> </target>
<target name="archive-get-transcripts"> <target name="archive-get-transcripts">
@ -67,59 +106,6 @@
<get src="${url.to.transcripts}/${filename}" dest="${wylie}/${filename}" ignoreerrors="true"/> <get src="${url.to.transcripts}/${filename}" dest="${wylie}/${filename}" ignoreerrors="true"/>
</target> </target>
<!--
  Runs Saxon over ${archive}/title_metadata.xml with mergeMetadataAndData.xsl,
  passing a fixed transcript URL prefix and ${unicode} as the target directory.
-->
<target name="saxon-test">
  <mkdir dir="${unicode}"/>
  <java classname="net.sf.saxon.Transform" fork="yes">
    <classpath>
      <path refid="saxon.classpath"/>
      <pathelement location="${vanillalib}/Jskad.jar"/>
    </classpath>
    <!-- source document -->
    <arg value="-s"/>
    <arg value="${archive}/title_metadata.xml"/>
    <!-- stylesheet, followed by its parameters -->
    <arg value="${styles}/mergeMetadataAndData.xsl"/>
    <arg value="transcript.location=http://www.thdl.org/avarch/transcripts/"/>
    <arg value="transform.to.dir=${unicode}"/>
  </java>
</target>
<!--
  Builds the Jskad jar, compiles the SaxonLiaison helper class, then calls
  archive-one-transcript-to-unicode once for every space-delimited entry in
  ${archive}/titles_as_list.txt.
-->
<target name="archive-transcripts-to-unicode">
  <mkdir dir="${unicode}"/>
  <antcall target="jskad-dist">
    <param name="my.jar.suffix" value=""/>
  </antcall>
  <mkdir dir="${lucene-thdl.bin}"/>
  <javac srcdir="${source}"
         destdir="${lucene-thdl.bin}"
         includes="org/thdl/util/ant/SaxonLiaison.java"
         debug="on"/>
  <loadfile property="transcript-list" srcFile="${archive}/titles_as_list.txt"/> <!-- encoding="UTF-8" -->
  <foreach list="${transcript-list}"
           delimiter=" "
           param="id.plus.filename"
           target="archive-one-transcript-to-unicode"/>
</target>
<!--
  Converts a single transcript with qdToUnicode.xsl.
  Expects id.plus.filename in the form "id/filename"; the part before the
  slash becomes title.id and the part after it becomes filename. Nothing is
  transformed when ${wylie}/${filename} does not exist.
-->
<target name="archive-one-transcript-to-unicode">
  <propertyregex property="title.id"
                 input="${id.plus.filename}"
                 regexp="(.+)/.+"
                 select="\1"/>
  <propertyregex property="filename"
                 input="${id.plus.filename}"
                 regexp=".+/(.+)"
                 select="\1"/>
  <!-- The SaxonLiaison processor used below is a hack to work around
       Ant bug #41314: http://issues.apache.org/bugzilla/show_bug.cgi?id=41314
       See Trevor's bike shed: http://www.vocaro.com/trevor/blog/2007/01/08/how-to-use-saxon-with-ant/
  -->
  <if>
    <available file="${wylie}/${filename}" property="transcript.exists"/>
    <then>
      <xslt in="${wylie}/${filename}"
            out="${unicode}/${filename}"
            style="${styles}/qdToUnicode.xsl"
            processor="org.thdl.util.ant.SaxonLiaison">
        <param name="title.id" expression="${title.id}"/>
        <classpath>
          <pathelement location="${lucene-thdl.bin}"/>
          <pathelement location="${vanillalib}/Jskad.jar"/>
          <path refid="saxon.classpath"/>
        </classpath>
      </xslt>
    </then>
  </if>
</target>
<!-- solr tasks -->
<!-- Placeholder target: it has no tasks. -->
<target name="solr-prepare-transcripts"/>
<!-- solr stuff --> <!-- solr stuff -->