alpha version of surrogate reading and indexing.

see the example file for an explanation. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5815 6c8d7289-2bf4-0310-a012-ef5d649a1542
16 years ago · 9050a3c4c5
parent 870066ab35
commit 9050a3c4c5
3 changed files with 94 additions and 5 deletions
--- a/examples/surrogate_dublin_core.xml
+++ b/examples/surrogate_dublin_core.xml
@ -0,0 +1,28 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!-- YaCy surrogate file using dublin core notion -->
+<!-- 
+  This is a surrogate file which is an intermediate document description
+  file for index generation. Once you have YaCy started, you can copy a file
+  like this (or this very file) into DATA/SURROGATE/in and then the indexing
+  process will read the file, store the content into the search index and move
+  the file into DATA/SURROGATE/out.
+  Using surrogate files and the surrogate file format you can easily create your
+  own data harvesting sources for the YaCy indexer. Just write a file generator
+  that generates files like this. The xml schema is very similar to that
+  described in
+    http://dublincore.org/documents/dc-xml-guidelines/
+  using the Dublin Core metadata element set.
+-->
+
+<surrogates
+  xmlns:dc="http://purl.org/dc/elements/1.1/">
+
+  <record>
+    <dc:title><![CDATA[Alan Smithee]]></dc:title>
+    <dc:identifier>http://de.wikipedia.org/wiki/Alan_Smithee</dc:identifier>
+    <dc:description><![CDATA[Der als Filmregisseur oft genannte '''Alan Smithee''' ist ein Anagramm von „The Alias Men“.]]></dc:description>
+    <dc:language>de</dc:language>
+    <dc:date>2009-03-02T11:12:36Z</dc:date> <!-- date is in ISO 8601 -->
+  </record>
+
+</surrogates>
--- a/source/de/anomic/crawler/Surrogate.java
+++ b/source/de/anomic/crawler/Surrogate.java
@ -41,9 +41,31 @@ public class Surrogate extends HashMap<String, String> {
    public Surrogate() {
        super();
    }
+    
+    /*
+    DC according to rfc 5013
+
+    * dc_title
+    * dc_creator
+    * dc_subject
+    * dc_description
+    * dc_publisher
+    dc_contributor
+    dc_date
+    dc_type
+    * dc_format
+    * dc_identifier
+    * dc_source
+    dc_language
+    dc_relation
+    dc_coverage
+    dc_rights
+         */
+    
    public Date date() {
-        String d = this.get("date");
+        String d = this.get("dateISO8601");
        if (d == null) d = this.get("docdatetime");
+        if (d == null) d = this.get("dc:date");
        if (d == null) return null;
        try {
            return DateFormatter.parseISO8601(d);
@ -54,6 +76,7 @@ public class Surrogate extends HashMap<String, String> {
    }
    public yacyURL url() {
        String u = this.get("url");
+        if (u == null) u = this.get("dc:identifier");
        if (u == null) return null;
        try {
            return new yacyURL(u, null);
@ -64,19 +87,28 @@ public class Surrogate extends HashMap<String, String> {
    }
    public String language() {
        String l = this.get("language");
+        if (l == null) l = this.get("dc:language");
        if (l == null) return "en"; else return l;
    }
    public String title() {
        String t = this.get("title");
-        return stripCDATA(t);
+        if (t == null) t = this.get("dc:title");
+        t = stripCDATA(t);
+        if (t == null) return "";
+        return t;
    }
    public String body() {
        String t = this.get("body");
-        return stripCDATA(t);
+        if (t == null) t = this.get("dc:description"); // fall back to the Dublin Core field; the result must be assigned
+        t = stripCDATA(t);
+        if (t == null) return ""; // never hand a null body to the caller
+        return t;
    }
    public String[] categories() {
        String t = this.get("categories");
+        if (t == null) t = this.get("dc:subject"); // fall back to the Dublin Core field; the result must be assigned
        t = stripCDATA(t);
+        if (t == null) return new String[]{}; // no categories given: empty array instead of a NullPointerException on split
        return t.split(";");
    }
    private String stripCDATA(String s) {
--- a/source/de/anomic/xml/SurrogateReader.java
+++ b/source/de/anomic/xml/SurrogateReader.java
@ -89,19 +89,23 @@ public class SurrogateReader extends DefaultHandler implements Runnable, Iterato
    }
    
    public void startElement(final String uri, final String name, final String tag, final Attributes atts) throws SAXException {
-        if ("document".equals(tag)) {
+        if ("record".equals(tag) || "document".equals(tag)) {
            this.surrogate = new Surrogate();
        } else if ("element".equals(tag)) {
            this.elementName = atts.getValue("name");
        } else if ("value".equals(tag)) {
            this.buffer.setLength(0);
            this.parsingValue = true;
+        } else if (tag.startsWith("dc:")) {
+            // parse dublin core attribute
+            this.elementName = tag;
+            this.parsingValue = true;
        }
    }

    public void endElement(final String uri, final String name, final String tag) {
        if (tag == null) return;
-        if ("document".equals(tag)) {
+        if ("record".equals(tag) || "document".equals(tag)) {
            //System.out.println("A Title: " + this.surrogate.title());
            try {
                this.surrogates.put(this.surrogate);
@ -124,6 +128,13 @@ public class SurrogateReader extends DefaultHandler implements Runnable, Iterato
            }
            this.buffer.setLength(0);
            this.parsingValue = false;
+        } else if (tag.startsWith("dc:")) {
+            final String value = buffer.toString().trim();
+            if (this.elementName != null) {
+                this.surrogate.put(this.elementName, value);
+            }
+            this.buffer.setLength(0);
+            this.parsingValue = false;
        }
    }

@ -177,6 +188,24 @@ public class SurrogateReader extends DefaultHandler implements Runnable, Iterato
 }
    /*
 Example surrogate
+<?xml version="1.0" encoding="utf-8"?>
+<!-- YaCy surrogate file using dublin core notion -->
+<!-- see http://dublincore.org/documents/dc-xml-guidelines/ -->
+<surrogates
+  xmlns:dc="http://purl.org/dc/elements/1.1/">
+
+  <record>
+    <dc:title><![CDATA[Alan Smithee]]></dc:title>
+    <dc:identifier>http://de.wikipedia.org/wiki/Alan_Smithee</dc:identifier>
+    <dc:description><![CDATA[Der als Filmregisseur oft genannte '''Alan Smithee''' ist ein Anagramm von „The Alias Men“.]]></dc:description>
+    <dc:language>de</dc:language>
+    <dc:date>2009-03-02T11:12:36Z</dc:date> <!-- date is in ISO 8601 -->
+  </record>
+
+</surrogates>
+
+
+or

 <?xml version="1.0" encoding="utf-8"?>
 <documents>