AnsweredAssumed Answered

XML Data Extraction for multiple document types

Question asked by eleaese on Oct 19, 2007
Latest reply on Oct 24, 2007 by eleaese
I created a simple XML Metadata Extracter, based on the post http://forums.alfresco.com/viewtopic.php?t=7801&highlight=xml+metadata+extraction.

It worked. The context file is recipe-xml-metadata-extracter-context.xml, enclosed.

Now I want to extend it to other XML document types, namely Book. I tried with the example included (many-xml-metadata-extracter-context.xml). It worked for Book, but didn't work for Recipe.

Could someone tell me how to declare the options in the context file?

Thank you !



recipe-xml-metadata-extracter-context.xml

<?xml version='1.0' encoding='UTF-8'?>
<!DOCTYPE beans PUBLIC '-//SPRING//DTD BEAN//EN' 'http://www.springframework.org/dtd/spring-beans.dtd'>

<beans>
   <bean id="avmMetadataExtracterRegistry" class="org.alfresco.repo.content.metadata.MetadataExtracterRegistry" />
  
   <!–
      Configure the AVM services to broadcast the content update notifications.
   –>
   <bean id="avmNodeService" class="org.alfresco.repo.avm.AVMNodeService" init-method="init">
      <property name="dictionaryService">
         <ref bean="dictionaryService"/>
      </property>
      <property name="avmService">
         <ref bean="avmLockingAwareService"/>
      </property>
      <property name="policyComponent">
         <ref bean="policyComponent"/>
      </property>
      <property name="invokePolicies">
         <value>true</value>
      </property>
   </bean>
   <bean id="avmMetadataExtracter" class="org.alfresco.repo.avm.AvmMetadataExtracter" init-method="init">
      <property name="policyComponent">
         <ref bean="policyComponent"/>
      </property>
      <property name="extracterAction">
         <bean class="org.alfresco.repo.action.executer.ContentMetadataExtracter" >
            <property name="dictionaryService">
               <ref bean="dictionaryService"/>
            </property>
            <property name="nodeService">
               <ref bean="avmNodeService" />
            </property>
            <property name="contentService">
               <ref bean="contentService" />
            </property>
            <!– <property name="metadataExtracterRegistry">
               <ref bean="avmMetadataExtracterRegistry" />
            </property>
            –>
         </bean>
      </property>
   </bean>
  
   <!–
      Configure an extractor that targets Alfresco Model XML files.
      Although this inherits from the base extracter bean, the use of the 'init' method
      means that it isn't automatically registered.
   –>
   <bean id="extracter.xml.recipe.AlfrescoModelMetadataExtracter"
         class="org.alfresco.repo.content.metadata.xml.XPathMetadataExtracter"
         parent="baseMetadataExtracter"
>
      <property name="mappingProperties">
         <bean class="org.springframework.beans.factory.config.PropertiesFactoryBean">
            <property name="properties">
               <props>
                  <prop key="namespace.prefix.cm">http://www.alfresco.org/model/content/1.0</prop>
                  <prop key="author">cm:author</prop>
                  <prop key="title">cm:title</prop>
                  <prop key="description">cm:description</prop>
               </props>
            </property>
         </bean>
      </property>
      <property name="xpathMappingProperties">
         <bean class="org.springframework.beans.factory.config.PropertiesFactoryBean">
            <property name="properties">
               <props>
                  <prop key="namespace.prefix.cm">http://www.alfresco.org/model/content/1.0</prop>
                  <prop key="author">/Recipe/Author/text()</prop>
                  <prop key="title">/Recipe/Title/text()</prop>
                  <prop key="description">/Recipe/Instruction/text()</prop>
               </props>
            </property>
         </bean>
      </property>
   </bean>
  
   <!–
      This selector examines the XML documents, executing the given XPath statements until a
      match is made.
   –>
   <bean
         id="extracter.xml.recipe.selector.XPathSelector"
         class="org.alfresco.repo.content.selector.XPathContentWorkerSelector"
>
      <property name="workers">
         <map>
            <entry key="/my:test">
               <null />
            </entry>
            <entry key="/Recipe">
               <ref bean="extracter.xml.recipe.AlfrescoModelMetadataExtracter" />
            </entry>
         </map>
      </property>
   </bean>
  
   <!–
      This is the face of the XML metadata extraction.  If passes the XML document to each of
      the selectors, until one of them gives back a MetadataExtracter (via the selectors),
      which is then used as normal to extract the values.
      Note the use of the AVM-specific registry.
      The overwrite policy of the embedded extracters has no effect.  It is only this extractor's
      policy that is used.
   –>
   <bean
         id="extracter.xml.recipe.XMLMetadataExtracter"
         class="org.alfresco.repo.content.metadata.xml.XmlMetadataExtracter"
         parent="baseMetadataExtracter">
      <property name="registry">
         <ref bean="avmMetadataExtracterRegistry" />
      </property>
      <property name="overwritePolicy">
         <value>EAGER</value>
      </property>
      <property name="selectors">
         <list>
            <ref bean="extracter.xml.recipe.selector.XPathSelector" />
         </list>
      </property>
   </bean>
  
</beans>



many-xml-metadata-extracter-context.xml

<?xml version='1.0' encoding='UTF-8'?>
<!DOCTYPE beans PUBLIC '-//SPRING//DTD BEAN//EN' 'http://www.springframework.org/dtd/spring-beans.dtd'>

<beans>
   <bean id="avmMetadataExtracterRegistry" class="org.alfresco.repo.content.metadata.MetadataExtracterRegistry" />
  
   <!–
      Configure the AVM services to broadcast the content update notifications.
   –>
   <bean id="avmNodeService" class="org.alfresco.repo.avm.AVMNodeService" init-method="init">
      <property name="dictionaryService">
         <ref bean="dictionaryService"/>
      </property>
      <property name="avmService">
         <ref bean="avmLockingAwareService"/>
      </property>
      <property name="policyComponent">
         <ref bean="policyComponent"/>
      </property>
      <property name="invokePolicies">
         <value>true</value>
      </property>
   </bean>
   <bean id="avmMetadataExtracter" class="org.alfresco.repo.avm.AvmMetadataExtracter" init-method="init">
      <property name="policyComponent">
         <ref bean="policyComponent"/>
      </property>
      <property name="extracterAction">
         <bean class="org.alfresco.repo.action.executer.ContentMetadataExtracter" >
            <property name="dictionaryService">
               <ref bean="dictionaryService"/>
            </property>
            <property name="nodeService">
               <ref bean="avmNodeService" />
            </property>
            <property name="contentService">
               <ref bean="contentService" />
            </property>
            <!– <property name="metadataExtracterRegistry">
               <ref bean="avmMetadataExtracterRegistry" />
            </property>
            –>
         </bean>
      </property>
   </bean>
  
   <!–
      Configure an extractor that targets Alfresco Model XML files.
      Although this inherits from the base extracter bean, the use of the 'init' method
      means that it isn't automatically registered.
   –>
   <bean id="extracter.xml.recipe.AlfrescoModelMetadataExtracter"
         class="org.alfresco.repo.content.metadata.xml.XPathMetadataExtracter"
         parent="baseMetadataExtracter"
>
      <property name="mappingProperties">
         <bean class="org.springframework.beans.factory.config.PropertiesFactoryBean">
            <property name="properties">
               <props>
                  <prop key="namespace.prefix.cm">http://www.alfresco.org/model/content/1.0</prop>
                  <prop key="author">cm:author</prop>
                  <prop key="title">cm:title</prop>
                  <prop key="description">cm:description</prop>
               </props>
            </property>
         </bean>
      </property>
      <property name="xpathMappingProperties">
         <bean class="org.springframework.beans.factory.config.PropertiesFactoryBean">
            <property name="properties">
               <props>
                  <prop key="namespace.prefix.cm">http://www.alfresco.org/model/content/1.0</prop>
                  <prop key="author">/Recipe/Author/text()</prop>
                  <prop key="title">/Recipe/Title/text()</prop>
                  <prop key="description">/Recipe/Instruction/text()</prop>
               </props>
            </property>
         </bean>
      </property>
   </bean>

   <bean id="extracter.xml.book.AlfrescoModelMetadataExtracter"
         class="org.alfresco.repo.content.metadata.xml.XPathMetadataExtracter"
         parent="baseMetadataExtracter"
>
      <property name="mappingProperties">
         <bean class="org.springframework.beans.factory.config.PropertiesFactoryBean">
            <property name="properties">
               <props>
                  <prop key="namespace.prefix.cm">http://www.alfresco.org/model/content/1.0</prop>
                  <prop key="author">cm:author</prop>
                  <prop key="title">cm:title</prop>
                  <prop key="description">cm:description</prop>
               </props>
            </property>
         </bean>
      </property>
      <property name="xpathMappingProperties">
         <bean class="org.springframework.beans.factory.config.PropertiesFactoryBean">
            <property name="properties">
               <props>
                  <prop key="namespace.prefix.cm">http://www.alfresco.org/model/content/1.0</prop>
                  <prop key="author">/Book/Author/text()</prop>
                  <prop key="title">/Book/Title/text()</prop>
                  <prop key="description">/Book/Summary/text()</prop>
               </props>
            </property>
         </bean>
      </property>
   </bean>
  
  
   <!–
      This selector examines the XML documents, executing the given XPath statements until a
      match is made.
   –>
   <bean
         id="extracter.xml.sample.selector.XPathSelector"
         class="org.alfresco.repo.content.selector.XPathContentWorkerSelector"
>
      <property name="workers">
         <map>
            <entry key="/Recipe">
               <ref bean="extracter.xml.recipe.AlfrescoModelMetadataExtracter" />
            </entry>
            <entry key="/Book">
               <ref bean="extracter.xml.book.AlfrescoModelMetadataExtracter" />
            </entry>
         </map>
      </property>
   </bean>
  
   <!–
      This is the face of the XML metadata extraction.  If passes the XML document to each of
      the selectors, until one of them gives back a MetadataExtracter (via the selectors),
      which is then used as normal to extract the values.
      Note the use of the AVM-specific registry.
      The overwrite policy of the embedded extracters has no effect.  It is only this extractor's
      policy that is used.
   –>
   <bean
         id="extracter.xml.sample.XMLMetadataExtracter"
         class="org.alfresco.repo.content.metadata.xml.XmlMetadataExtracter"
         parent="baseMetadataExtracter">
      <property name="registry">
         <ref bean="avmMetadataExtracterRegistry" />
      </property>
      <property name="overwritePolicy">
         <value>EAGER</value>
      </property>
      <property name="selectors">
         <list>
            <ref bean="extracter.xml.sample.selector.XPathSelector" />
         </list>
      </property>
   </bean>
</beans>

Outcomes