added parsing of metadata to surrogate reading:

A Dublin Core record inside a surrogate input file may now contain tokens within the namespace 'md' (short for: metadata). The token names must be valid within the namespace of the Solr field names. All md-tokens inside surrogate files then overwrite values within Solr documents before they are written to the Solr index. This makes it possible to assign collection names to each surrogate entry, and ranking information can also be added. Please see the example file.
11 years ago · 937273d4e3
parent 4de3fefdb5
commit 937273d4e3
3 changed files with 40 additions and 8 deletions
--- a/examples/surrogate_dublin_core.xml
+++ b/examples/surrogate_dublin_core.xml
@ -15,7 +15,9 @@
 -->

 <surrogates
-  xmlns:dc="http://purl.org/dc/elements/1.1/">
+  xmlns:dc="http://purl.org/dc/elements/1.1/"
+  xmlns:md="http://localhost:8090/api/schema.xml?core=collection1"
+>

  <record>
    <dc:Title><![CDATA[Alan Smithee]]></dc:Title>
@ -23,6 +25,15 @@
    <dc:Description><![CDATA[Der als Filmregisseur oft genannte '''Alan Smithee''' ist ein Anagramm von „The Alias Men“.]]></dc:Description>
    <dc:Language>de</dc:Language>
    <dc:Date>2009-03-02T11:12:36Z</dc:Date> <!-- date is in ISO 8601 -->
+    <md:h1_txt>Alan Smithee</md:h1_txt>
+    <md:h2_txt>Geschichte</md:h2_txt>
+    <md:h3_txt>Entstehung</md:h3_txt>
+    <md:h3_txt>Aufdeckung und Abkehr</md:h3_txt>
+    <md:h2_txt>Verwendung</md:h2_txt>
+    <md:h2_txt>Literatur</md:h2_txt>
+    <md:h2_txt>Weblinks</md:h2_txt>
+    <md:h2_txt>Referenzen</md:h2_txt>
+    <md:collection_sxt>surrogate</md:collection_sxt>
  </record>

 </surrogates>
--- a/source/net/yacy/document/content/SurrogateReader.java
+++ b/source/net/yacy/document/content/SurrogateReader.java
@ -32,6 +32,7 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.Reader;
+import java.util.Map;
 import java.util.concurrent.ArrayBlockingQueue;
 import java.util.concurrent.BlockingQueue;
 import java.util.zip.GZIPInputStream;
@ -140,7 +141,7 @@ public class SurrogateReader extends DefaultHandler implements Runnable {
        } else if ("value".equals(tag)) {
            this.buffer.setLength(0);
            this.parsingValue = true;
-        } else if (tag.startsWith("dc:") || tag.startsWith("geo:")) {
+        } else if (tag.startsWith("dc:") || tag.startsWith("geo:") || tag.startsWith("md:")) {
            // parse dublin core attribute
            this.elementName = tag;
            this.parsingValue = true;
@ -173,15 +174,18 @@ public class SurrogateReader extends DefaultHandler implements Runnable {
            }
            this.buffer.setLength(0);
            this.parsingValue = false;
-        } else if (tag.startsWith("dc:") || tag.startsWith("geo:")) {
+        } else if (tag.startsWith("dc:") || tag.startsWith("geo:") || tag.startsWith("md:")) {
            final String value = buffer.toString().trim();
            if (this.elementName != null && tag.equals(this.elementName)) {
-                value.replaceAll(";", ",");
-                String oldcontent = this.surrogate.get(this.elementName);
-                if (oldcontent == null) {
-                    this.surrogate.getMap().put(this.elementName, new String[]{value});
+                Map<String,String[]> map = this.surrogate.getMap();
+                String[] oldcontent = map.get(this.elementName);
+                if (oldcontent == null || oldcontent.length == 0) {
+                    map.put(this.elementName, new String[]{value});
                } else {
-                    this.surrogate.getMap().put(this.elementName, new String[]{oldcontent + ";" + value});
+                    String[] newcontent = new String[oldcontent.length + 1];
+                    System.arraycopy(oldcontent, 0, newcontent, 0, oldcontent.length);
+                    newcontent[oldcontent.length] = value;
+                    map.put(this.elementName, newcontent);
                }
            }
            this.buffer.setLength(0);
--- a/source/net/yacy/search/schema/CollectionConfiguration.java
+++ b/source/net/yacy/search/schema/CollectionConfiguration.java
@ -70,6 +70,7 @@ import net.yacy.crawler.retrieval.Response;
 import net.yacy.document.Condenser;
 import net.yacy.document.Document;
 import net.yacy.document.SentenceReader;
+import net.yacy.document.content.DCEntry;
 import net.yacy.document.parser.html.ContentScraper;
 import net.yacy.document.parser.html.ImageEntry;
 import net.yacy.kelondro.data.citation.CitationReference;
@ -788,6 +789,22 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
                add(doc, CollectionSchema.publisher_url_s, html.getPublisherLink().toNormalform(true));
            }
        }
+
+        if (parser instanceof DCEntry) {
+            // the document was created with a surrogate parsing; overwrite all md: -entries to Solr
+            DCEntry dcentry = (DCEntry) parser;
+            for (Map.Entry<String, String[]> entry: dcentry.getMap().entrySet()) {
+                String tag = entry.getKey();
+                if (!tag.startsWith("md:") || tag.length() < 4) continue;
+                CollectionSchema solr_field = CollectionSchema.valueOf(tag.substring(3));
+                if (solr_field == null) continue;
+                String[] values = entry.getValue();
+                if (values == null || values.length == 0) continue;
+                if (allAttr || contains(solr_field)) {
+                    add(doc, solr_field, values);
+                }
+            }
+        }
        
        String content = document.getTextString();
        String tokens = digestURL.toTokens();