streamlined lucene build file and added task to prepare transcript data and metadata for posting to a solr index
This commit is contained in:
parent
73f429f9b4
commit
a53bc5b126
3 changed files with 151 additions and 62 deletions
|
@ -9,7 +9,7 @@
|
||||||
|
|
||||||
<xsl:import href="qdToUnicode.xsl"/>
|
<xsl:import href="qdToUnicode.xsl"/>
|
||||||
|
|
||||||
<xsl:output method="xml" encoding="UTF-8" indent="yes" name="unicode.out"/>
|
<xsl:output method="xml" encoding="UTF-8" indent="yes" name="unicode.transcript.with.metadata"/>
|
||||||
|
|
||||||
<xsl:param name="transcript.location" select="''"/>
|
<xsl:param name="transcript.location" select="''"/>
|
||||||
<xsl:param name="transform.to.dir" select="''"/>
|
<xsl:param name="transform.to.dir" select="''"/>
|
||||||
|
@ -18,7 +18,7 @@
|
||||||
<xsl:for-each select="//transcript">
|
<xsl:for-each select="//transcript">
|
||||||
<xsl:variable name="filename" select="."/>
|
<xsl:variable name="filename" select="."/>
|
||||||
<!-- <xsl:variable name="filename" select="encoder:encode(.,'UTF-8')"/> -->
|
<!-- <xsl:variable name="filename" select="encoder:encode(.,'UTF-8')"/> -->
|
||||||
<xsl:result-document href="{$transform.to.dir}/{$filename}" format="unicode.out">
|
<xsl:result-document href="{$filename}" format="unicode.transcript.with.metadata">
|
||||||
<xsl:element name="TITLE">
|
<xsl:element name="TITLE">
|
||||||
<xsl:attribute name="id">
|
<xsl:attribute name="id">
|
||||||
<xsl:value-of select="../@id"/>
|
<xsl:value-of select="../@id"/>
|
||||||
|
|
103
archive/styles/solarizeTranscript.xsl
Normal file
103
archive/styles/solarizeTranscript.xsl
Normal file
|
@ -0,0 +1,103 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
|
||||||
|
<xsl:stylesheet version="1.0"
|
||||||
|
xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
|
||||||
|
|
||||||
|
<xsl:output method="xml" encoding="UTF-8" indent="yes"/>
|
||||||
|
|
||||||
|
<xsl:param name="TITLE_TYPE" select="'AVDB_TITLE'"/>
|
||||||
|
<xsl:param name="TRANSCRIPT_FRAGMENT_TYPE" select="'TRANSCRIPT_FRAGMENT'"/>
|
||||||
|
|
||||||
|
<!-- Entry point: process the TITLE element found directly under the document root. -->
<xsl:template match="/">
  <xsl:apply-templates select="child::TITLE"/>
</xsl:template>
|
||||||
|
|
||||||
|
<!--
  Wraps one title in an <add> element. The title's METADATA becomes one
  <doc>; every TEXT/S sentence becomes a further <doc>. The title's id is
  passed down so each generated doc can build its own id from it.
-->
<xsl:template match="TITLE">
  <add>
    <!-- with-param is evaluated with TITLE as the context node, so @id is this title's id -->
    <xsl:apply-templates select="METADATA">
      <xsl:with-param name="title.id" select="@id"/>
    </xsl:apply-templates>
    <xsl:apply-templates select="TEXT/S">
      <xsl:with-param name="title.id" select="@id"/>
    </xsl:apply-templates>
  </add>
</xsl:template>
|
||||||
|
|
||||||
|
<!--
  Emits the title-level <doc>: one field per piece of title metadata.
  Parameter title.id is the id of the enclosing TITLE (defaults to '').
  Each metadata element is written exactly once; speechType used to be
  emitted twice, which put a duplicate value into every title doc.
-->
<xsl:template match="METADATA">
  <xsl:param name="title.id" select="''"/>
  <doc>
    <field name="id"><xsl:value-of select="$title.id"/></field>
    <field name="thdl_type"><xsl:value-of select="$TITLE_TYPE"/></field>
    <field name="speechType"><xsl:value-of select="speechType"/></field>
    <field name="language"><xsl:value-of select="language"/></field>
    <field name="culturalRegion"><xsl:value-of select="culturalRegion"/></field>
    <!-- value-of takes the string value, so inline markup inside name/caption is flattened to text -->
    <field name="title_en"><xsl:value-of select="name"/></field>
    <field name="caption_en"><xsl:value-of select="caption"/></field>
    <field name="transcriptFilename"><xsl:value-of select="transcript"/></field>
    <!-- TODO: make plan for videos; incl. massaging the #s so that you can search for ranges -->
  </doc>
</xsl:template>
|
||||||
|
|
||||||
|
<!-- Here's what a chunk of metadata looks like:
|
||||||
|
|
||||||
|
<speechType>Conversation</speechType>
|
||||||
|
<language>Tibetan</language>
|
||||||
|
<culturalRegion>dbus</culturalRegion>
|
||||||
|
<name>A Lucky Dream: <i>Three's Company #01</i></name>
|
||||||
|
<caption>CAPTION HEAD</caption>
|
||||||
|
<transcript id="2502">00008_01-a-lucky-dream_09.xml</transcript>
|
||||||
|
<video id="6613">
|
||||||
|
<mediaDescription>Audio</mediaDescription>
|
||||||
|
<connectionSpeed>medium</connectionSpeed>
|
||||||
|
<size>3225818</size>
|
||||||
|
<duration>00:02:14</duration>
|
||||||
|
<name>00008_lucky-dream.mp3</name>
|
||||||
|
</video>
|
||||||
|
<video id="6612">
|
||||||
|
<mediaDescription>Video</mediaDescription>
|
||||||
|
<connectionSpeed>medium</connectionSpeed>
|
||||||
|
<size>5009798</size>
|
||||||
|
<duration>00:11:11</duration>
|
||||||
|
<name>00008_lucky-dream.mp4</name>
|
||||||
|
</video>
|
||||||
|
<video id="6611">
|
||||||
|
<mediaDescription>Video</mediaDescription>
|
||||||
|
<connectionSpeed>fast</connectionSpeed>
|
||||||
|
<size>13556712</size>
|
||||||
|
<duration>00:11:11</duration>
|
||||||
|
<name>00008_lucky-dream.mp4</name>
|
||||||
|
</video>
|
||||||
|
-->
|
||||||
|
|
||||||
|
<!--
  Emits one <doc> per transcript sentence (S element).
  Parameter title.id is the id of the enclosing TITLE (defaults to '');
  the doc id is that value, an underscore, and the sentence's own @id.
-->
<xsl:template match="S">
  <xsl:param name="title.id" select="''"/>
  <xsl:variable name="audio" select="AUDIO"/>
  <doc>
    <field name="id">
      <xsl:value-of select="$title.id"/>
      <xsl:text>_</xsl:text>
      <xsl:value-of select="@id"/>
    </field>
    <field name="thdl_type">
      <xsl:value-of select="$TRANSCRIPT_FRAGMENT_TYPE"/>
    </field>
    <!-- forms and translations are selected by their exact xml:lang value -->
    <field name="form_bo">
      <xsl:value-of select="FORM[@xml:lang='bo']"/>
    </field>
    <field name="form_bo-Latn">
      <xsl:value-of select="FORM[@xml:lang='bo-Latn']"/>
    </field>
    <field name="transl_en">
      <xsl:value-of select="TRANSL[@xml:lang='en']"/>
    </field>
    <field name="transl_zh">
      <xsl:value-of select="TRANSL[@xml:lang='zh']"/>
    </field>
    <!-- timing fields are written only when the AUDIO element carries them -->
    <xsl:if test="$audio/@start">
      <field name="start_f">
        <xsl:value-of select="$audio/@start"/>
      </field>
    </xsl:if>
    <xsl:if test="$audio/@end">
      <field name="end_f">
        <xsl:value-of select="$audio/@end"/>
      </field>
    </xsl:if>
  </doc>
</xsl:template>
|
||||||
|
|
||||||
|
<!-- Here's what a chunk of transcript looks like:
|
||||||
|
|
||||||
|
<S who="N400005" id="d148e29">
|
||||||
|
<FORM xml:lang="bo">དེ་རིང་གནམ་ཡག་པོ་ར་ཅིག་༿འདྲ་ཅིག༾མི་འདུག་གས།</FORM>
|
||||||
|
<FORM xml:lang="bo-Latn">de ring gnam yag po ra cig {'dra cig}mi 'dug gas/</FORM>
|
||||||
|
<TRANSL xml:lang="en">Isn't it a nice day today?</TRANSL>
|
||||||
|
<TRANSL xml:lang="zh">今天的天气多好啊, 是吧!</TRANSL>
|
||||||
|
<AUDIO end="8.925999997392298" start="7.63"/>
|
||||||
|
</S>
|
||||||
|
-->
|
||||||
|
|
||||||
|
</xsl:stylesheet>
|
|
@ -1,5 +1,15 @@
|
||||||
<?xml version="1.0" encoding="utf-8"?>
|
<?xml version="1.0" encoding="utf-8"?>
|
||||||
|
|
||||||
|
<!--
|
||||||
|
Some THDL titles are problematic:
|
||||||
|
* 691, A New Script, claims to have a transcript but there is no
|
||||||
|
transcript at the URL.
|
||||||
|
* 2069, Husked Barley, has a zero KB transcript.
|
||||||
|
* 2116, Nasal Congestion, also has a zero KB transcript.
|
||||||
|
Each of these is handled here as a title with metadata but no data,
|
||||||
|
just like any other title without a transcription.
|
||||||
|
-->
|
||||||
|
|
||||||
<project name="lucene-thdl" default="lucene-thdl-jar" basedir=".">
|
<project name="lucene-thdl" default="lucene-thdl-jar" basedir=".">
|
||||||
<import file="build.xml"/>
|
<import file="build.xml"/>
|
||||||
|
|
||||||
|
@ -9,17 +19,16 @@
|
||||||
</classpath>
|
</classpath>
|
||||||
</taskdef>
|
</taskdef>
|
||||||
|
|
||||||
<property name="lucene-thdl.bin" location="${bin}/lucene-thdl"/>
|
|
||||||
<property name="archive" location="archive"/>
|
|
||||||
<property name="wylie" location="${archive}/wylie"/>
|
|
||||||
<property name="unicode" location="${archive}/unicode"/>
|
|
||||||
<property name="solr" location="${archive}/solr"/>
|
|
||||||
<property name="styles" location="${archive}/styles"/>
|
|
||||||
<property name="get.title.metadata" value="http://thdl.org/avarch/mediaflowcat/title_metadata.php"/>
|
<property name="get.title.metadata" value="http://thdl.org/avarch/mediaflowcat/title_metadata.php"/>
|
||||||
<property name="url.to.transcripts" value="http://www.thdl.org/avarch/transcripts"/>
|
<property name="url.to.transcripts" value="http://www.thdl.org/avarch/transcripts"/>
|
||||||
<property name="url.to.media.high" value="http://www.thdl.org/media/high"/>
|
<property name="url.to.media.high" value="http://www.thdl.org/media/high"/>
|
||||||
<property name="url.to.media.low" value="http://www.thdl.org/media/low"/>
|
<property name="url.to.media.low" value="http://www.thdl.org/media/low"/>
|
||||||
<property name="url.to.media.audio" value="http://www.thdl.org/media/audio"/>
|
<property name="url.to.media.audio" value="http://www.thdl.org/media/audio"/>
|
||||||
|
<property name="lucene-thdl.bin" location="${bin}/lucene-thdl"/>
|
||||||
|
<property name="archive" location="archive"/>
|
||||||
|
<property name="styles" location="${archive}/styles"/>
|
||||||
|
<property name="transcripts" location="${archive}/transcripts-with-metadata"/>
|
||||||
|
<property name="solr" location="${archive}/solr"/>
|
||||||
|
|
||||||
<path id="lucene.classpath">
|
<path id="lucene.classpath">
|
||||||
<fileset id="lucene.extensions" dir="${ext}/apache">
|
<fileset id="lucene.extensions" dir="${ext}/apache">
|
||||||
|
@ -33,6 +42,11 @@
|
||||||
</fileset>
|
</fileset>
|
||||||
</path>
|
</path>
|
||||||
|
|
||||||
|
<!-- Delegates to the jskad-dist target, passing an empty my.jar.suffix. -->
<target name="compile-and-jar-libraries">
  <antcall target="jskad-dist">
    <param name="my.jar.suffix"
           value=""/>
  </antcall>
</target>
|
||||||
|
|
||||||
<!-- archive tasks -->
|
<!-- archive tasks -->
|
||||||
<target name="archive-get-metadata">
|
<target name="archive-get-metadata">
|
||||||
|
@ -40,6 +54,32 @@
|
||||||
<get src="${get.title.metadata}" dest="${archive}/title_metadata.xml" verbose="on"/>
|
<get src="${get.title.metadata}" dest="${archive}/title_metadata.xml" verbose="on"/>
|
||||||
</target>
|
</target>
|
||||||
|
|
||||||
|
<!-- FIXME: due to encoding issues, transcripts whose filenames need to be
     URL-encoded are not retrieved correctly by this task; for example,
     the umlaut in title 00007 breaks retrieval. -->
|
||||||
|
<!--
  Runs Saxon on title_metadata.xml with mergeMetadataAndData.xsl, writing
  into ${transcripts}. Depends on compile-and-jar-libraries; Jskad.jar is
  on the classpath of the forked JVM.
-->
<target name="archive-get-and-transform-data" depends="compile-and-jar-libraries">
  <mkdir dir="${transcripts}"/>
  <java classname="net.sf.saxon.Transform" fork="yes">
    <classpath>
      <path refid="saxon.classpath"/>
      <pathelement location="${vanillalib}/Jskad.jar"/>
    </classpath>
    <!-- source document -->
    <arg value="-s"/>
    <arg value="${archive}/title_metadata.xml"/>
    <!-- placeholder output name; presumably only there to give the
         stylesheet's result documents a base URI inside ${transcripts} -->
    <arg value="-o"/>
    <arg value="${transcripts}/DUMMY_FOR_BASE_URI"/>
    <!-- stylesheet, followed by its parameter -->
    <arg value="${styles}/mergeMetadataAndData.xsl"/>
    <arg value="transcript.location=${url.to.transcripts}/"/>
  </java>
</target>
|
||||||
|
|
||||||
|
<!-- solr tasks -->
|
||||||
|
<!--
  Transforms every file in ${transcripts} with solarizeTranscript.xsl and
  writes the results, with an .xml extension, to ${solr}.
  DUMMY_FOR_BASE_URI is excluded: it is the placeholder output name given
  to Saxon by archive-get-and-transform-data, not a transcript, and must
  not be fed to the stylesheet should Saxon ever create it on disk.
-->
<target name="solr-prepare-transcripts">
  <mkdir dir="${solr}"/>
  <xslt basedir="${transcripts}"
        destdir="${solr}"
        extension=".xml"
        excludes="DUMMY_FOR_BASE_URI"
        style="${styles}/solarizeTranscript.xsl"/>
</target>
|
||||||
|
|
||||||
|
|
||||||
<target name="archive-get-list-of-transcripts">
|
<target name="archive-get-list-of-transcripts">
|
||||||
<!-- <java classname="net.sf.saxon.Transform" fork="yes">
|
<!-- <java classname="net.sf.saxon.Transform" fork="yes">
|
||||||
<arg value="-s"/>
|
<arg value="-s"/>
|
||||||
|
@ -52,7 +92,6 @@
|
||||||
</classpath>
|
</classpath>
|
||||||
</java>-->
|
</java>-->
|
||||||
<xslt in="${archive}/title_metadata.xml" out="${archive}/titles_as_list.txt" style="${styles}/get-list-of-transcripts.xsl"/>
|
<xslt in="${archive}/title_metadata.xml" out="${archive}/titles_as_list.txt" style="${styles}/get-list-of-transcripts.xsl"/>
|
||||||
<xslt in="${archive}/title_metadata.xml" out="${archive}/titles_as_list2.txt" style="${styles}/get-list-of-transcripts2.xsl"/>
|
|
||||||
</target>
|
</target>
|
||||||
|
|
||||||
<target name="archive-get-transcripts">
|
<target name="archive-get-transcripts">
|
||||||
|
@ -67,59 +106,6 @@
|
||||||
<get src="${url.to.transcripts}/${filename}" dest="${wylie}/${filename}" ignoreerrors="true"/>
|
<get src="${url.to.transcripts}/${filename}" dest="${wylie}/${filename}" ignoreerrors="true"/>
|
||||||
</target>
|
</target>
|
||||||
|
|
||||||
<target name="saxon-test">
|
|
||||||
<mkdir dir="${unicode}"/>
|
|
||||||
<java classname="net.sf.saxon.Transform" fork="yes">
|
|
||||||
<arg value="-s"/>
|
|
||||||
<arg value="${archive}/title_metadata.xml"/>
|
|
||||||
<arg value="${styles}/mergeMetadataAndData.xsl"/>
|
|
||||||
<arg value="transcript.location=http://www.thdl.org/avarch/transcripts/"/>
|
|
||||||
<arg value="transform.to.dir=${unicode}"/>
|
|
||||||
<classpath>
|
|
||||||
<path refid="saxon.classpath"/>
|
|
||||||
<pathelement location="${vanillalib}/Jskad.jar"/>
|
|
||||||
</classpath>
|
|
||||||
</java>
|
|
||||||
</target>
|
|
||||||
|
|
||||||
<target name="archive-transcripts-to-unicode">
|
|
||||||
<mkdir dir="${unicode}"/>
|
|
||||||
<antcall target="jskad-dist">
|
|
||||||
<param name="my.jar.suffix" value=""/>
|
|
||||||
</antcall>
|
|
||||||
<mkdir dir="${lucene-thdl.bin}"/>
|
|
||||||
<javac srcdir="${source}" destdir="${lucene-thdl.bin}" includes="org/thdl/util/ant/SaxonLiaison.java" debug="on"/>
|
|
||||||
<loadfile property="transcript-list" srcFile="${archive}/titles_as_list.txt"/> <!-- encoding="UTF-8" -->
|
|
||||||
<foreach list="${transcript-list}" delimiter=" " param="id.plus.filename" target="archive-one-transcript-to-unicode"/>
|
|
||||||
</target>
|
|
||||||
|
|
||||||
<target name="archive-one-transcript-to-unicode">
|
|
||||||
<propertyregex property="title.id" input="${id.plus.filename}" regexp="(.+)/.+" select="\1"/>
|
|
||||||
<propertyregex property="filename" input="${id.plus.filename}" regexp=".+/(.+)" select="\1"/>
|
|
||||||
<!-- note: this processor is used to get around bug described here:
|
|
||||||
This class is a hack to work around Ant bug #41314: http://issues.apache.org/bugzilla/show_bug.cgi?id=41314
|
|
||||||
See Trevor's bike shed: http://www.vocaro.com/trevor/blog/2007/01/08/how-to-use-saxon-with-ant/
|
|
||||||
-->
|
|
||||||
<if>
|
|
||||||
<available file="${wylie}/${filename}" property="transcript.exists"/>
|
|
||||||
<then>
|
|
||||||
<xslt in="${wylie}/${filename}" out="${unicode}/${filename}" style="${styles}/qdToUnicode.xsl" processor="org.thdl.util.ant.SaxonLiaison">
|
|
||||||
<param name="title.id" expression="${title.id}"/>
|
|
||||||
<classpath>
|
|
||||||
<pathelement location="${lucene-thdl.bin}"/>
|
|
||||||
<pathelement location="${vanillalib}/Jskad.jar"/>
|
|
||||||
<path refid="saxon.classpath"/>
|
|
||||||
</classpath>
|
|
||||||
</xslt>
|
|
||||||
</then>
|
|
||||||
</if>
|
|
||||||
</target>
|
|
||||||
|
|
||||||
|
|
||||||
<!-- solr tasks -->
|
|
||||||
<target name="solr-prepare-transcripts">
|
|
||||||
|
|
||||||
</target>
|
|
||||||
|
|
||||||
|
|
||||||
<!-- solr stuff -->
|
<!-- solr stuff -->
|
||||||
|
|
Loading…
Reference in a new issue