Added more dynamic fields to the Solr schema, and added post.jar for batch posting of XML files from within Java

This commit is contained in:
eg3p 2007-05-24 15:53:34 +00:00
parent d9d12d9f64
commit 305cbc46fe
4 changed files with 65 additions and 9 deletions

View File

@ -182,8 +182,6 @@
<!-- transcript documents -->
<field name="id" type="string" indexed="true" stored="true"/>
<field name="thdl_type" type="string" indexed="true" stored="true"/>
<!-- Dynamic field definitions. If a field name is not found, dynamicFields
will be used if the name matches any of the patterns.
@ -216,6 +214,13 @@
<!-- Suffix for any field containing Chinese-language content. FIXME: the type should not be string! -->
<dynamicField name="*_zh" type="string" indexed="true" stored="true"/>
<!-- Suffix-based dynamic fields. Any document field whose name ends in one of
     these suffixes is accepted without an explicit <field> declaration. -->
<!-- ID reference to another document: indexed so it can be queried, but not stored. -->
<dynamicField name="*_idref" type="string" indexed="true" stored="false"/>
<!-- NOTE(review): presumably "option"-style values drawn from a fixed vocabulary
     (e.g. speech type, cultural region); confirm the intended meaning of "opt". -->
<dynamicField name="*_opt" type="string" indexed="true" stored="true"/>
<!-- NOTE(review): presumably the name or code of a language; confirm. -->
<dynamicField name="*_lang" type="string" indexed="true" stored="true"/>
<!-- Numeric size value. The "sint" type is defined elsewhere in this schema. -->
<dynamicField name="*_size" type="sint" indexed="true" stored="true"/>
<dynamicField name="*_name" type="string" indexed="true" stored="true"/>
<!-- Declared as a date, so values must be complete date-times.
     NOTE(review): the transcript stylesheet appears to build these by wrapping a
     time-of-day duration as 1970-01-01T...Z; confirm all producers do the same. -->
<dynamicField name="*_duration" type="date" indexed="true" stored="true"/>
<dynamicField name="*_filename" type="string" indexed="true" stored="true"/>
</fields>

View File

@ -6,7 +6,10 @@
<xsl:output method="xml" encoding="UTF-8" indent="yes"/>
<!-- Document-type labels written into the type field of each generated Solr doc.
     TITLE_TYPE is emitted for title documents and TRANSCRIPT_FRAGMENT_TYPE for
     transcript fragments. -->
<xsl:param name="TITLE_TYPE" select="'AVDB_TITLE'"/>
<!-- NOTE(review): no use of VIDEO_TYPE is visible in this part of the stylesheet; confirm it is still needed. -->
<xsl:param name="VIDEO_TYPE" select="'VIDEO'"/>
<xsl:param name="TRANSCRIPT_FRAGMENT_TYPE" select="'TRANSCRIPT_FRAGMENT'"/>
<!-- Text concatenated before and after a media duration so that the result has
     the shape of a full date-time: 1970-01-01T + duration + Z.
     NOTE(review): assumes the source duration is formatted as hh:mm:ss; confirm. -->
<xsl:param name="DURATION_PREFIX" select="'1970-01-01T'"/>
<xsl:param name="DURATION_SUFFIX" select="'Z'"/>
<xsl:template match="/">
<xsl:apply-templates select="TITLE"/>
@ -28,18 +31,42 @@
<xsl:param name="title.id" select="''"/>
<doc>
<field name="id"><xsl:value-of select="$title.id"/></field>
<field name="thdl_type"><xsl:value-of select="$TITLE_TYPE"/></field>
<field name="speechType"><xsl:value-of select="speechType"/></field>
<field name="language"><xsl:value-of select="language"/></field>
<field name="culturalRegion"><xsl:value-of select="culturalRegion"/></field>
<field name="thdlType_opt"><xsl:value-of select="$TITLE_TYPE"/></field>
<field name="speechType_opt"><xsl:value-of select="speechType"/></field>
<field name="language_lang"><xsl:value-of select="language"/></field>
<field name="administrativeLocation_opt"><xsl:value-of select="administrativeLocation"/></field>
<field name="culturalRegion_opt"><xsl:value-of select="culturalRegion"/></field>
<field name="title_en"><xsl:value-of select="name"/></field>
<field name="caption_en"><xsl:value-of select="caption"/></field>
<field name="speechType"><xsl:value-of select="speechType"/></field>
<field name="transcriptFilename"><xsl:value-of select="transcript"/></field>
<!-- make plan for videos; incl. caressing #s so that you can search for ranges -->
<!-- should we also include transcript and video ids? -->
<field name="transcript_filename"><xsl:value-of select="transcript"/></field>
<xsl:for-each select="video">
<xsl:choose>
<xsl:when test="mediaDescription='Audio'">
<field name="audio_size"><xsl:value-of select="size"/></field>
<field name="audio_duration"><xsl:value-of select="concat($DURATION_PREFIX,duration,$DURATION_SUFFIX)"/></field>
<field name="audio_filename"><xsl:value-of select="name"/></field>
</xsl:when>
<xsl:otherwise> <!-- must be video -->
<xsl:choose>
<xsl:when test="connectionSpeed='fast'">
<field name="high_size"><xsl:value-of select="size"/></field>
<field name="high_duration"><xsl:value-of select="concat($DURATION_PREFIX,duration,$DURATION_SUFFIX)"/></field>
<field name="high_filename"><xsl:value-of select="name"/></field>
</xsl:when>
<xsl:otherwise>
<field name="low_size"><xsl:value-of select="size"/></field>
<field name="low_duration"><xsl:value-of select="concat($DURATION_PREFIX,duration,$DURATION_SUFFIX)"/></field>
<field name="low_filename"><xsl:value-of select="name"/></field>
</xsl:otherwise>
</xsl:choose>
</xsl:otherwise>
</xsl:choose>
</xsl:for-each>
</doc>
</xsl:template>
<!-- Here's what a chunk of metadata looks like:
<speechType>Conversation</speechType>
@ -75,6 +102,7 @@
<xsl:param name="title.id" select="''"/>
<doc>
<field name="id"><xsl:value-of select="concat($title.id, '_', @id)"/></field>
<field name="transcript_idref"><xsl:value-of select="$title.id"/></field>
<field name="thdl_type"><xsl:value-of select="$TRANSCRIPT_FRAGMENT_TYPE"/></field>
<field name="form_bo"><xsl:value-of select="FORM[@xml:lang='bo']"/></field>
<field name="form_bo-Latn"><xsl:value-of select="FORM[@xml:lang='bo-Latn']"/></field>

BIN
extensions/apache/post.jar Normal file

Binary file not shown.

View File

@ -79,6 +79,29 @@
<xslt basedir="${transcripts}" destdir="${solr}" extension=".xml" style="${styles}/solarizeTranscript.xsl"/>
</target>
<!--
SimplePostTool: version 1.2
This is a simple command line tool for POSTing raw XML to a Solr
port. XML data can be read from files specified as commandline
args; as raw commandline arg strings; or via STDIN.
Examples:
java -Ddata=files -jar post.jar *.xml
java -Ddata=args -jar post.jar '<delete><id>42</id></delete>'
java -Ddata=stdin -jar post.jar < hd.xml
Other options controlled by System Properties include the Solr
URL to POST to, and whether a commit should be executed. These
are the defaults for all System Properties...
-Ddata=files
-Durl=http://localhost:8983/solr/update
-Dcommit=yes
-->
<!-- Intended to post the generated Solr XML documents and commit them.
     NOTE(review): as written, this target only forks net.sf.saxon.Transform on
     lucene.classpath with no arguments; it does not invoke post.jar, pass any
     files, or set a Solr URL. It looks like a placeholder. Confirm, and wire in
     the post tool before relying on this target. -->
<target name="solr-post-and-commit-transcripts">
<java classname="net.sf.saxon.Transform" fork="yes">
<classpath>
<path refid="lucene.classpath"/>
</classpath>
</java>
</target>
<target name="archive-get-list-of-transcripts">
<!-- <java classname="net.sf.saxon.Transform" fork="yes">