added parsing of metadata to surrogate reading:

A Dublin Core record inside a surrogate input file may now contain
tokens within the namespace 'md' (short for: metadata). The token names
must be valid within the namespace of the Solr field names. All
md-tokens inside of surrogate files then overwrite values within Solr
documents before they are written to the Solr index. This makes it
possible to assign collection names to each surrogate entry and also
to add ranking information. Please see the example file.
pull/1/head
orbiter 11 years ago
parent 4de3fefdb5
commit 937273d4e3

@ -15,7 +15,9 @@
-->
<surrogates
xmlns:dc="http://purl.org/dc/elements/1.1/">
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:md="http://localhost:8090/api/schema.xml?core=collection1"
>
<record>
<dc:Title><![CDATA[Alan Smithee]]></dc:Title>
@ -23,6 +25,15 @@
<dc:Description><![CDATA[Der als Filmregisseur oft genannte '''Alan Smithee''' ist ein Anagramm von „The Alias Men“.]]></dc:Description>
<dc:Language>de</dc:Language>
<dc:Date>2009-03-02T11:12:36Z</dc:Date> <!-- date is in ISO 8601 -->
<md:h1_txt>Alan Smithee</md:h1_txt>
<md:h2_txt>Geschichte</md:h2_txt>
<md:h3_txt>Entstehung</md:h3_txt>
<md:h3_txt>Aufdeckung und Abkehr</md:h3_txt>
<md:h2_txt>Verwendung</md:h2_txt>
<md:h2_txt>Literatur</md:h2_txt>
<md:h2_txt>Weblinks</md:h2_txt>
<md:h2_txt>Referenzen</md:h2_txt>
<md:collection_sxt>surrogate</md:collection_sxt>
</record>
</surrogates>

@ -32,6 +32,7 @@ import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.Map;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.zip.GZIPInputStream;
@ -140,7 +141,7 @@ public class SurrogateReader extends DefaultHandler implements Runnable {
} else if ("value".equals(tag)) {
this.buffer.setLength(0);
this.parsingValue = true;
} else if (tag.startsWith("dc:") || tag.startsWith("geo:")) {
} else if (tag.startsWith("dc:") || tag.startsWith("geo:") || tag.startsWith("md:")) {
// parse dublin core attribute
this.elementName = tag;
this.parsingValue = true;
@ -173,15 +174,18 @@ public class SurrogateReader extends DefaultHandler implements Runnable {
}
this.buffer.setLength(0);
this.parsingValue = false;
} else if (tag.startsWith("dc:") || tag.startsWith("geo:")) {
} else if (tag.startsWith("dc:") || tag.startsWith("geo:") || tag.startsWith("md:")) {
final String value = buffer.toString().trim();
if (this.elementName != null && tag.equals(this.elementName)) {
value.replaceAll(";", ",");
String oldcontent = this.surrogate.get(this.elementName);
if (oldcontent == null) {
this.surrogate.getMap().put(this.elementName, new String[]{value});
Map<String,String[]> map = this.surrogate.getMap();
String[] oldcontent = map.get(this.elementName);
if (oldcontent == null || oldcontent.length == 0) {
map.put(this.elementName, new String[]{value});
} else {
this.surrogate.getMap().put(this.elementName, new String[]{oldcontent + ";" + value});
String[] newcontent = new String[oldcontent.length + 1];
System.arraycopy(oldcontent, 0, newcontent, 0, oldcontent.length);
newcontent[oldcontent.length] = value;
map.put(this.elementName, newcontent);
}
}
this.buffer.setLength(0);

@ -70,6 +70,7 @@ import net.yacy.crawler.retrieval.Response;
import net.yacy.document.Condenser;
import net.yacy.document.Document;
import net.yacy.document.SentenceReader;
import net.yacy.document.content.DCEntry;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.kelondro.data.citation.CitationReference;
@ -788,6 +789,22 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
add(doc, CollectionSchema.publisher_url_s, html.getPublisherLink().toNormalform(true));
}
}
if (parser instanceof DCEntry) {
// the document was created with a surrogate parsing; overwrite all md: -entries to Solr
DCEntry dcentry = (DCEntry) parser;
for (Map.Entry<String, String[]> entry: dcentry.getMap().entrySet()) {
String tag = entry.getKey();
if (!tag.startsWith("md:") || tag.length() < 4) continue;
CollectionSchema solr_field = CollectionSchema.valueOf(tag.substring(3));
if (solr_field == null) continue;
String[] values = entry.getValue();
if (values == null || values.length == 0) continue;
if (allAttr || contains(solr_field)) {
add(doc, solr_field, values);
}
}
}
String content = document.getTextString();
String tokens = digestURL.toTokens();

Loading…
Cancel
Save