replaced solr version 1.1 with 1.2

This commit is contained in:
eg3p 2007-06-22 11:17:33 +00:00
parent 0addc3c957
commit 6327403a3f
12 changed files with 116 additions and 163 deletions

View file

@ -2,10 +2,11 @@ The files in this directory must be copied to your solr webapp
before you can use lucene-thdl-build.xml to post, commit, or
delete documents from your solr server.
schema.xml :
Copy this file to your solr/conf directory,
replacing the existing schema.xml file.
schema.xml & solrconfig.xml:
Copy these files to your solr/conf directory,
replacing the existing files of the same names.
lucene-thdl.jar :
Create an up to date copy of this file by
running the task lucene-thdl-jar, then copy

View file

@ -8,7 +8,7 @@
http://wiki.apache.org/solr/SchemaXml
-->
<schema name="THDL" version="1.1">
<schema name="THDL Archive" version="1.1">
<!-- attribute "name" is the name of this schema and is only used for display purposes.
Applications should change this to reflect the nature of the search collection.
version="1.1" is Solr's version number for the schema syntax and semantics. It should
@ -158,10 +158,15 @@
</fieldtype>
<fieldtype name="text_wylie" class="solr.TextField">
  <!-- Wylie transliteration is split on whitespace only. Solr reads
       tokenizer and filter elements only when they are nested inside an
       analyzer element, and an analyzer "class" must name an Analyzer,
       not a tokenizer factory, so the tokenizer is wrapped here. -->
  <analyzer>
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>
  </analyzer>
  <!-- <analyzer class="org.thdl.lucene.WylieTibetanAnalyzer"/> -->
</fieldtype>
<fieldtype name="text_idlist" class="solr.TextField">
  <!-- Space-separated list of ids; each id becomes one token. The
       tokenizer has to live inside an analyzer element for Solr to pick
       it up; a tokenizer placed directly under fieldtype is ignored. -->
  <analyzer>
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    <!-- <filter class="org.thdl.lucene.NumberPadderFactory"/> -->
  </analyzer>
</fieldtype>
</types>
@ -183,6 +188,7 @@
<!-- transcript documents -->
<field name="id" type="string" indexed="true" stored="true"/>
<field name="form_bo" type="text_tibetan" indexed="true" stored="true"/>
<!-- Dynamic field definitions. If a field name is not found, dynamicFields
will be used if the name matches any of the patterns.
@ -207,7 +213,7 @@
<dynamicField name="*_bo" type="text_tibetan" indexed="true" stored="true"/>
<!-- suffix for any field containing Tibetan transliterated into Wylie -->
<dynamicField name="*_bo-Latn" type="text_wylie" indexed="true" stored="true"/>
<dynamicField name="*_bo-Latn" type="text_ws" indexed="true" stored="true"/>
<!-- suffix for any field containing English language content -->
<dynamicField name="*_en" type="text_lu" indexed="true" stored="true"/>
@ -216,11 +222,8 @@
<dynamicField name="*_zh" type="string" indexed="true" stored="true"/>
<dynamicField name="*_idref" type="string" indexed="true" stored="false"/>
<dynamicField name="*_opt" type="string" indexed="true" stored="true"/>
<dynamicField name="*_idlist" type="text_ws" indexed="true" stored="true"/>
<dynamicField name="*_lang" type="string" indexed="true" stored="true"/>
<dynamicField name="*_size" type="sint" indexed="true" stored="true"/>
<dynamicField name="*_name" type="string" indexed="true" stored="true"/>
<dynamicField name="*_duration" type="date" indexed="true" stored="true"/>
<dynamicField name="*_filename" type="string" indexed="true" stored="true"/>
</fields>

View file

@ -17,8 +17,7 @@
<xsl:template match="/">
<xsl:for-each select="//transcript">
<xsl:variable name="filename" select="."/>
<!-- <xsl:variable name="filename" select="encoder:encode(.,'UTF-8')"/> -->
<xsl:result-document href="{$filename}" format="unicode.transcript.with.metadata">
<xsl:result-document href="Transcript_{../@id}.xml" format="unicode.transcript.with.metadata">
<xsl:element name="TITLE">
<xsl:attribute name="id">
<xsl:value-of select="../@id"/>

View file

@ -5,11 +5,7 @@
<!-- xsl:import children must come before every other top-level element of
     the stylesheet (xsl:output and xsl:param included), so the import is
     placed first.
     NOTE(review): confirm that nothing else precedes it in the part of the
     stylesheet above this point. -->
<xsl:import href="solarizeConstantsForImport.xsl"/>
<xsl:output method="xml" encoding="UTF-8" indent="yes"/>
<!-- Local declarations have higher import precedence than same-named ones
     in the imported stylesheet.
     NOTE(review): presumably solarizeConstantsForImport.xsl defines these
     same constants; if so, these copies can be deleted. Confirm first. -->
<xsl:param name="TITLE_TYPE" select="'AVDB_TITLE'"/>
<xsl:param name="VIDEO_TYPE" select="'VIDEO'"/>
<xsl:param name="TRANSCRIPT_FRAGMENT_TYPE" select="'TRANSCRIPT_FRAGMENT'"/>
<xsl:param name="DURATION_PREFIX" select="'1970-01-01T'"/>
<xsl:param name="DURATION_SUFFIX" select="'Z'"/>
<xsl:template match="/">
<xsl:apply-templates select="TITLE"/>
@ -23,49 +19,55 @@
</xsl:apply-templates>
<xsl:apply-templates select="TEXT/S">
<xsl:with-param name="title.id" select="$title.id"/>
<xsl:with-param name="belongs.to" select="METADATA/belongsTo"/>
</xsl:apply-templates>
</add>
</xsl:template>
<!-- should we also include transcript and video ids? -->
<xsl:template match="METADATA">
<xsl:param name="title.id" select="''"/>
<doc>
<field name="id"><xsl:value-of select="$title.id"/></field>
<field name="thdlType_opt"><xsl:value-of select="$TITLE_TYPE"/></field>
<field name="speechType_opt"><xsl:value-of select="speechType"/></field>
<field name="thdlType_s"><xsl:value-of select="$TITLE_TYPE"/></field>
<field name="belongsTo_idlist"><xsl:value-of select="belongsTo"/></field>
<field name="speechType_s"><xsl:value-of select="speechType"/></field>
<field name="language_lang"><xsl:value-of select="language"/></field>
<field name="administrativeLocation_opt"><xsl:value-of select="administrativeLocation"/></field>
<field name="culturalRegion_opt"><xsl:value-of select="culturalRegion"/></field>
<field name="administrativeLocation_s"><xsl:value-of select="administrativeLocation"/></field>
<field name="culturalRegion_s"><xsl:value-of select="culturalRegion"/></field>
<field name="title_en"><xsl:value-of select="name"/></field>
<field name="caption_en"><xsl:value-of select="caption"/></field>
<!-- should we also include transcript and video ids? -->
<field name="transcript_filename"><xsl:value-of select="transcript"/></field>
<xsl:for-each select="video">
<xsl:choose>
<xsl:when test="mediaDescription='Audio'">
<field name="audio_size"><xsl:value-of select="size"/></field>
<field name="audio_duration"><xsl:value-of select="concat($DURATION_PREFIX,duration,$DURATION_SUFFIX)"/></field>
<field name="audio_filename"><xsl:value-of select="name"/></field>
</xsl:when>
<xsl:otherwise> <!-- must be video -->
<xsl:choose>
<xsl:when test="connectionSpeed='fast'">
<field name="high_size"><xsl:value-of select="size"/></field>
<field name="high_duration"><xsl:value-of select="concat($DURATION_PREFIX,duration,$DURATION_SUFFIX)"/></field>
<field name="high_filename"><xsl:value-of select="name"/></field>
</xsl:when>
<xsl:otherwise>
<field name="low_size"><xsl:value-of select="size"/></field>
<field name="low_duration"><xsl:value-of select="concat($DURATION_PREFIX,duration,$DURATION_SUFFIX)"/></field>
<field name="low_filename"><xsl:value-of select="name"/></field>
</xsl:otherwise>
</xsl:choose>
</xsl:otherwise>
</xsl:choose>
</xsl:for-each>
<xsl:variable name="video.ids">
<xsl:call-template name="getVideoList">
<xsl:with-param name="metadata" select="."/>
</xsl:call-template>
</xsl:variable>
<xsl:if test="normalize-space($video.ids)">
<field name="videos_idlist"><xsl:value-of select="normalize-space($video.ids)"/></field>
</xsl:if>
</doc>
<!-- One extra Solr document per video child of the current METADATA
     element, linked back to its title through title_idref. -->
<xsl:for-each select="video">
  <doc>
    <!-- The id attribute is used as-is. The previous concat(@id) was an
         error: concat() requires at least two arguments. -->
    <field name="id"><xsl:value-of select="@id"/></field>
    <field name="title_idref"><xsl:value-of select="$title.id"/></field>
    <field name="thdlType_s"><xsl:value-of select="$VIDEO_TYPE"/></field>
    <field name="mediaType_s"><xsl:value-of select="mediaDescription"/></field>
    <field name="connSpeed_s"><xsl:value-of select="connectionSpeed"/></field>
    <field name="quality_s"><xsl:value-of select="quality"/></field>
    <field name="size_i"><xsl:value-of select="size"/></field>
    <!-- The duration is wrapped in the prefix and suffix constants to
         form the value sent for the _dt field. -->
    <field name="duration_dt"><xsl:value-of select="concat($DURATION_PREFIX,duration,$DURATION_SUFFIX)"/></field>
    <field name="media_filename"><xsl:value-of select="name"/></field>
  </doc>
</xsl:for-each>
</xsl:template>
<!-- Named template: writes the id attribute of every video child of
     $metadata, each followed by a single space. The trailing space is
     left in place; the caller in the METADATA template applies
     normalize-space() to the result. -->
<xsl:template name="getVideoList">
  <!-- Defaults to an empty node-set. The former default was the empty
       string, which cannot be used as the start of the path
       $metadata/video and raised an error when the parameter was omitted. -->
  <xsl:param name="metadata" select="/.."/>
  <xsl:for-each select="$metadata/video">
    <xsl:value-of select="@id"/><xsl:text> </xsl:text>
  </xsl:for-each>
</xsl:template>
<!-- Here's what a chunk of metadata looks like:
@ -100,19 +102,26 @@
<!-- Emits one Solr document per transcript sentence (S element).
     Parameters:
       title.id   : id of the owning title; also used to build this
                    document's id as title.id + '_' + the S element's id.
       belongs.to : belongsTo value handed down by the calling template. -->
<xsl:template match="S">
  <xsl:param name="title.id" select="''"/>
  <xsl:param name="belongs.to" select="''"/>
  <doc>
    <field name="id"><xsl:value-of select="concat($title.id, '_', @id)"/></field>
    <field name="title_idref"><xsl:value-of select="$title.id"/></field>
    <field name="thdlType_s"><xsl:value-of select="$TRANSCRIPT_FRAGMENT_TYPE"/></field>
    <field name="belongsTo_idlist"><xsl:value-of select="$belongs.to"/></field>
    <field name="form_bo"><xsl:value-of select="FORM[@xml:lang='bo']"/></field>
    <field name="form_bo-Latn"><xsl:value-of select="FORM[@xml:lang='bo-Latn']"/></field>
    <!-- Translation fields are written only when the sentence actually
         carries a translation in that language. -->
    <xsl:if test="TRANSL[@xml:lang='en']">
      <field name="transl_en"><xsl:value-of select="TRANSL[@xml:lang='en']"/></field>
    </xsl:if>
    <xsl:if test="TRANSL[@xml:lang='zh']">
      <field name="transl_zh"><xsl:value-of select="TRANSL[@xml:lang='zh']"/></field>
    </xsl:if>
    <!-- string() makes the test false for a present-but-empty attribute,
         so no empty numeric field is sent. End and duration are only
         written when a start time exists as well. -->
    <xsl:if test="string(AUDIO/@start)">
      <field name="start_f"><xsl:value-of select="AUDIO/@start"/></field>
      <xsl:if test="string(AUDIO/@end)">
        <field name="end_f"><xsl:value-of select="AUDIO/@end"/></field>
        <field name="duration_f"><xsl:value-of select="AUDIO/@end - AUDIO/@start"/></field>
      </xsl:if>
    </xsl:if>
  </doc>
</xsl:template>

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View file

@ -19,16 +19,26 @@
</classpath>
</taskdef>
<!-- Remote metadata service. get.title.metadata and get.avdb.metadata hold
     the same URL. -->
<property name="get.title.metadata" value="http://thdl.org/avarch/mediaflowcat/title_metadata.php"/>
<property name="get.avdb.metadata" value="http://thdl.org/avarch/mediaflowcat/title_metadata.php"/>
<!-- Query strings appended to the metadata URL by the archive-get-all-*
     targets to restrict the download to titles or to non-titles. -->
<property name="only.titles" value="excludeNonTitles=true"/>
<property name="only.nontitles" value="excludeTitles=true"/>
<!-- Base URLs for transcripts and media files. -->
<property name="url.to.transcripts" value="http://www.thdl.org/avarch/transcripts"/>
<property name="url.to.media.high" value="http://www.thdl.org/media/high"/>
<property name="url.to.media.low" value="http://www.thdl.org/media/low"/>
<property name="url.to.media.audio" value="http://www.thdl.org/media/audio"/>
<!-- Directory whose contents are packaged into lucene-thdl.jar. -->
<property name="lucene-thdl.bin" location="${bin}/lucene-thdl"/>
<!-- Local working tree: downloaded metadata, stylesheets, merged
     transcripts and the generated Solr add files all live under archive. -->
<property name="archive" location="archive"/>
<property name="title.metadata" value="${archive}/title_metadata.xml"/>
<property name="nontitle.metadata" value="${archive}/nontitle_metadata.xml"/>
<property name="styles" location="${archive}/styles"/>
<property name="transcripts" location="${archive}/transcripts-with-metadata"/>
<property name="solr" location="${archive}/solr"/>
<property name="solr.titles" location="${solr}/titles"/>
<property name="solr.nontitle.dir" location="${solr}/nontitles"/>
<property name="solr.nontitle.file" location="${solr.nontitle.dir}/add_nontitles.xml"/>
<!-- Staging directory for files that must be copied into the Solr webapp
     by hand; lucene-thdl-jar drops its jar here. -->
<property name="copy.to.solr.webapp" location="${archive}/copy-to-solr-webapp"/>
<!-- Solr server and its update handler, used by the post-and-commit targets. -->
<property name="url.to.solr" value="http://localhost:8983/solr"/>
<property name="url.to.solr.update" value="${url.to.solr}/update"/>
<path id="lucene.classpath">
<fileset id="lucene.extensions" dir="${ext}/apache">
@ -51,7 +61,21 @@
<!-- archive tasks -->
<!-- Downloads metadata from the AVDB service into a local file.
     Meant to be invoked through antcall with two parameters:
       parameters      : query string appended to the service URL
       output.filename : file the response is written to
     The former unconditional download into title_metadata.xml is gone: it
     overwrote the title file with unfiltered data on every call, including
     calls that were only fetching non-title metadata. -->
<target name="archive-get-metadata">
  <!-- Without these guards a direct invocation would request the literal
       text of the unset property references. -->
  <fail unless="parameters" message="archive-get-metadata requires the 'parameters' property"/>
  <fail unless="output.filename" message="archive-get-metadata requires the 'output.filename' property"/>
  <mkdir dir="${archive}"/>
  <get src="${get.avdb.metadata}?${parameters}" dest="${output.filename}" verbose="on"/>
</target>
<!-- Fetches the title-only metadata by delegating to archive-get-metadata
     with the title filter and the title output file. -->
<target name="archive-get-all-title-metadata">
  <antcall target="archive-get-metadata">
    <param name="output.filename"
           value="${title.metadata}"/>
    <param name="parameters"
           value="${only.titles}"/>
  </antcall>
</target>
<!-- Fetches the non-title metadata by delegating to archive-get-metadata
     with the non-title filter and the non-title output file. -->
<target name="archive-get-all-nontitle-metadata">
  <antcall target="archive-get-metadata">
    <param name="output.filename"
           value="${nontitle.metadata}"/>
    <param name="parameters"
           value="${only.nontitles}"/>
  </antcall>
</target>
<!-- due to encoding issues, none of the transcripts with filenames that need to be url-encoded
@ -61,7 +85,7 @@
<mkdir dir="${transcripts}"/>
<java classname="net.sf.saxon.Transform" fork="yes">
<arg value="-s"/>
<arg value="${archive}/title_metadata.xml"/>
<arg value="${title.metadata}"/>
<arg value="-o"/>
<arg value="${transcripts}/DUMMY_FOR_BASE_URI"/>
<arg value="${styles}/mergeMetadataAndData.xsl"/>
@ -74,11 +98,16 @@
</target>
<!-- solr tasks -->
<!-- Runs solarizeTranscript.xsl over every *.xml file in the merged
     transcript directory and writes the resulting Solr add files into
     the titles directory. -->
<target name="solr-prepare-titles">
  <mkdir dir="${solr.titles}"/>
  <xslt basedir="${transcripts}" includes="*.xml" destdir="${solr.titles}" extension=".xml" style="${styles}/solarizeTranscript.xsl"/>
</target>
<!-- Former name of the target above, kept as an alias so existing
     invocations keep working. Its own unclosed body has been removed. -->
<target name="solr-prepare-transcripts" depends="solr-prepare-titles"/>
<!-- Transforms the downloaded non-title metadata into a single Solr add
     file using solarizeNonTitles.xsl. -->
<target name="solr-prepare-nontitles">
  <mkdir dir="${solr.nontitle.dir}"/>
  <!-- "out" names a file, so the stray trailing slash has been dropped. -->
  <xslt in="${nontitle.metadata}" out="${solr.nontitle.file}" style="${styles}/solarizeNonTitles.xsl"/>
</target>
<!--
SimplePostTool: version 1.2
This is a simple command line tool for POSTing raw XML to a Solr
@ -95,91 +124,23 @@
-Durl=http://localhost:8983/solr/update
-Dcommit=yes
-->
<!-- Posts every *.xml file in the titles directory to the Solr update
     handler with post.jar and asks it to commit. -->
<target name="solr-post-and-commit-titles">
  <fileset dir="${solr.titles}" includes="*.xml" id="solr.add.fileset"/>
  <!-- Flatten the fileset into one space-separated string for the
       command line.
       NOTE(review): arg line splits on whitespace, so a path containing
       a space would be passed as several arguments. -->
  <pathconvert pathsep=" " property="list.of.files.to.post" refid="solr.add.fileset"/>
  <java jar="${ext}/apache/post.jar" fork="true">
    <arg line="${list.of.files.to.post}"/>
    <jvmarg value="-Durl=${url.to.solr.update}"/>
    <jvmarg value="-Dcommit=yes"/>
  </java>
</target>
<!-- Former name of the target above, kept as an alias so existing
     invocations keep working. Its own unclosed body has been removed. -->
<target name="solr-post-and-commit-transcripts" depends="solr-post-and-commit-titles"/>
<!-- Produces titles_as_list.txt from the downloaded title metadata by
     applying get-list-of-transcripts.xsl. -->
<target name="archive-get-list-of-transcripts">
<!-- Disabled alternative that ran the same stylesheet through Saxon
     directly instead of through the xslt task. -->
<!-- <java classname="net.sf.saxon.Transform" fork="yes">
<arg value="-s"/>
<arg value="${archive}/title_metadata.xml"/>
<arg value="-o"/>
<arg value="${archive}/titles_as_list.txt"/>
<arg value="${styles}/get-list-of-transcripts.xsl"/>
<classpath>
<path refid="saxon.classpath"/>
</classpath>
</java>-->
<xslt in="${archive}/title_metadata.xml" out="${archive}/titles_as_list.txt" style="${styles}/get-list-of-transcripts.xsl"/>
</target>
<!-- Reads a space-delimited list of transcript filenames and calls
     archive-get-one-transcript once per entry.
     NOTE(review): this reads titles_as_list2.txt, while
     archive-get-list-of-transcripts writes titles_as_list.txt. Confirm
     whether the "2" file is a hand-edited copy or a typo.
     NOTE(review): the "wylie" property is not defined in the visible part
     of this build file. Confirm it is set elsewhere.
     NOTE(review): foreach is not a core Ant task; presumably it comes
     from a taskdef earlier in the file. Confirm. -->
<target name="archive-get-transcripts">
<mkdir dir="${wylie}"/>
<loadfile property="transcript-list" srcFile="${archive}/titles_as_list2.txt"/> <!-- encoding="UTF-8" -->
<foreach list="${transcript-list}" delimiter=" " param="filename" target="archive-get-one-transcript"/>
<!--<foreach list="${transcript-list}" delimiter=" " param="id.plus.filename" target="archive-get-one-transcript"/>-->
</target>
<!-- Downloads one transcript, named by the "filename" property, from the
     transcript base URL into the wylie directory. ignoreerrors is set, so
     a failed download does not stop the build. -->
<target name="archive-get-one-transcript">
<!--<propertyregex property="filename" input="${id.plus.filename}" regexp=".+/(.+)" select="\1"/>-->
<get src="${url.to.transcripts}/${filename}" dest="${wylie}/${filename}" ignoreerrors="true"/>
</target>
<!-- Posts the single non-title add file to the Solr update handler with
     post.jar and asks it to commit. This target used to be buried inside
     commented-out legacy code followed by a stray closing target tag;
     that dead code has been removed. -->
<target name="solr-post-and-commit-nontitles">
  <java jar="${ext}/apache/post.jar" fork="true">
    <!-- arg file passes the path as one argument even when it contains
         spaces; arg line would split it on whitespace. -->
    <arg file="${solr.nontitle.file}"/>
    <jvmarg value="-Durl=${url.to.solr.update}"/>
    <jvmarg value="-Dcommit=yes"/>
  </java>
</target>
<!-- insert whether or not media exists for segment -->
<!-- <java classname="net.sf.saxon.Transform" fork="yes">
@ -210,27 +171,6 @@
then get all sentence ids for that document, then combine docId_sentenceId and
remove/replace from lucene.
-->
<!--
<target name="solr-3:commit-documents">
<exec executable="sh" dir="${solarized.transcript.dir.final}">
<arg value="post.sh"/>
<arg value="*.xml"/>
</exec>
</target>
<target name="solr-4:delete-documents" depends="solr-1:clean-local">
<exec executable="curl">
<arg value="${solr.update}"/>
<arg value="-data-binary"/> should have double dash at beginning
<arg value="&lt;delete&gt;&lt;query&gt;id:[* TO *]&lt;/query&gt;&lt;/delete&gt;"/>
</exec>
<exec executable="curl">
<arg value="${solr.update}"/>
<arg value="-data-binary"/> double dash again!
<arg value="&lt;commit/&gt;"/>
</exec>
</target>
-->
<!-- concordance program -->
<target name="lucene-thdl-compile" depends="init">
@ -242,6 +182,7 @@
<!-- Packages the compiled lucene-thdl classes into lucene-thdl.jar and
     places a copy in the staging directory for the Solr webapp. -->
<target name="lucene-thdl-jar"
        depends="lucene-thdl-compile">
  <jar basedir="${lucene-thdl.bin}/"
       destfile="${vanillalib}/lucene-thdl.jar"/>
  <copy todir="${copy.to.solr.webapp}"
        file="${vanillalib}/lucene-thdl.jar"/>
</target>
</project>