From 22d5e33c5e522386b7c6609c6690eec99cc88b5e Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Sun, 17 Jun 2012 14:53:16 +0200 Subject: [PATCH] added more methods to vocabulary generation: scrape document title and document author to vocabulary --- htroot/Vocabulary_p.html | 60 +++++++++++++++- htroot/Vocabulary_p.java | 70 ++++++++++++++----- htroot/interaction/Triple.java | 60 ++++++---------- source/net/yacy/cora/lod/JenaTripleStore.java | 16 ++--- 4 files changed, 139 insertions(+), 67 deletions(-) diff --git a/htroot/Vocabulary_p.html b/htroot/Vocabulary_p.html index 7a2fe2bf4..c073b9abf 100644 --- a/htroot/Vocabulary_p.html +++ b/htroot/Vocabulary_p.html @@ -3,6 +3,59 @@ YaCy '#[clientname]#': Federated Index #%env/templates/metas.template%# + #%env/templates/header.template%# @@ -31,14 +84,17 @@ #(create)#:: -
+ +
Vocabulary Production It is possible to produce a vocabulary out of the existing search index. This is done using a given 'objectspace', which you can enter as a URL stub. This stub is used to find all matching URLs. If the remaining path of a matching URL then denotes a single file, the file name is used as the vocabulary term. This works best with wikis. Try to use a wiki URL as the objectspace path.
Vocabulary Name
-
Objectspace
+
Objectspace
+
+
Discover Terms from
object link file name  object page title  object page title (split)  object page author
diff --git a/htroot/Vocabulary_p.java b/htroot/Vocabulary_p.java index 5a7eeca93..b1899c254 100644 --- a/htroot/Vocabulary_p.java +++ b/htroot/Vocabulary_p.java @@ -35,6 +35,7 @@ import net.yacy.cora.lod.vocabulary.YaCyMetadata; import net.yacy.cora.protocol.RequestHeader; import net.yacy.document.LibraryProvider; import net.yacy.kelondro.data.meta.DigestURI; +import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.logging.Log; import net.yacy.search.Switchboard; import net.yacy.search.SwitchboardConstants; @@ -50,44 +51,69 @@ public class Vocabulary_p { Collection vocs = LibraryProvider.autotagging.getVocabularies(); String vocabularyName = (post == null) ? null : post.get("vocabulary", null); + String discovername = (post == null) ? null : post.get("discovername", null); Tagging vocabulary = vocabularyName == null ? null : LibraryProvider.autotagging.getVocabulary(vocabularyName); if (vocabulary == null) vocabularyName = null; - int count = 0; - for (Tagging v: vocs) { - prop.put("vocabularyset_" + count + "_name", v.getName()); - prop.put("vocabularyset_" + count + "_selected", (vocabularyName != null && vocabularyName.equals(v.getName())) ? 
1 : 0); - count++; - } - prop.put("vocabularyset", count); - if (post != null) { try { if (vocabulary == null) { // create a vocabulary - String discovername = post.get("discovername", ""); - if (discovername.length() > 0) { + if (discovername != null && discovername.length() > 0) { String discoverobjectspace = post.get("discoverobjectspace", ""); MultiProtocolURI discoveruri = null; if (discoverobjectspace.length() > 0) try {discoveruri = new MultiProtocolURI(discoverobjectspace);} catch (MalformedURLException e) {} if (discoveruri == null) discoverobjectspace = ""; Map table = new TreeMap(); File propFile = LibraryProvider.autotagging.getVocabularyFile(discovername); + boolean discoverFromPath = post.get("discovermethod", "").equals("path"); + boolean discoverFromTitle = post.get("discovermethod", "").equals("title"); + boolean discoverFromTitleSplitted = post.get("discovermethod", "").equals("titlesplitted"); + boolean discoverFromAuthor = post.get("discovermethod", "").equals("author"); if (discoveruri != null) { String segmentName = sb.getConfig(SwitchboardConstants.SEGMENT_PUBLIC, "default"); Segment segment = sb.indexSegments.segment(segmentName); Iterator ui = segment.urlSelector(discoveruri); + String t; while (ui.hasNext()) { DigestURI u = ui.next(); String u0 = u.toNormalform(true, false); - String t = u0.substring(discoverobjectspace.length()); - if (t.indexOf('/') >= 0) continue; - int p = t.indexOf('.'); - if (p >= 0) t = t.substring(0, p); - while ((p = t.indexOf(':')) >= 0) t = t.substring(p + 1); - while ((p = t.indexOf('=')) >= 0) t = t.substring(p + 1); - if (p >= 0) t = t.substring(p + 1); + t = ""; + if (discoverFromPath) { + t = u0.substring(discoverobjectspace.length()); + if (t.indexOf('/') >= 0) continue; + int p = t.indexOf('.'); + if (p >= 0) t = t.substring(0, p); + while ((p = t.indexOf(':')) >= 0) t = t.substring(p + 1); + while ((p = t.indexOf('=')) >= 0) t = t.substring(p + 1); + if (p >= 0) t = t.substring(p + 1); + } + if 
(discoverFromTitle || discoverFromTitleSplitted) { + URIMetadataRow m = segment.urlMetadata().load(u.hash()); + if (m != null) t = m.dc_title(); + } + if (discoverFromAuthor) { + URIMetadataRow m = segment.urlMetadata().load(u.hash()); + if (m != null) t = m.dc_creator(); + } + t = t.replaceAll("\"", " ").replaceAll("'", " ").replaceAll(",", " ").replaceAll(" ", " ").trim(); if (t.length() == 0) continue; - table.put(t, new Tagging.SOTuple("", u0)); + if (discoverFromTitleSplitted) { + String[] ts = t.split(" "); + for (String s: ts) { + if (s.length() == 0) continue; + table.put(s, new Tagging.SOTuple("", u0)); + } + } else if (discoverFromAuthor) { + String[] ts = t.split(";"); // author names are often separated by ';' + for (String s: ts) { + if (s.length() == 0) continue; + int p = s.indexOf(','); // check if there is a reversed method to mention the name + if (p >= 0) s = s.substring(p + 1).trim() + " " + s.substring(0, p).trim(); + table.put(s, new Tagging.SOTuple("", u0)); + } + } else { + table.put(t, new Tagging.SOTuple("", u0)); + } } } Tagging newvoc = new Tagging(discovername, propFile, discoverobjectspace, table); @@ -143,6 +169,14 @@ public class Vocabulary_p { } } + int count = 0; + for (Tagging v: vocs) { + prop.put("vocabularyset_" + count + "_name", v.getName()); + prop.put("vocabularyset_" + count + "_selected", ((vocabularyName != null && vocabularyName.equals(v.getName())) || (discovername != null && discovername.equals(v.getName()))) ? 1 : 0); + count++; + } + prop.put("vocabularyset", count); + prop.put("create", vocabularyName == null ? 
1 : 0); if (vocabulary == null) { diff --git a/htroot/interaction/Triple.java b/htroot/interaction/Triple.java index 95c1e23f6..58913d028 100644 --- a/htroot/interaction/Triple.java +++ b/htroot/interaction/Triple.java @@ -98,55 +98,37 @@ public class Triple { String s = ""; String p = ""; String o = ""; + String result = ""; Boolean global = false; - if(post != null){ + if (post != null) { - if(post.containsKey("s")){ - s = post.get("s"); - } + s = post.get("s", ""); + p = post.get("p", ""); + o = post.get("o", ""); - if(post.containsKey("sp")){ - s = post.get("sp") + "#" + s; - } - - if(post.containsKey("p")){ - p = post.get("p"); - } - - if(post.containsKey("pp")){ - p = post.get("pp") + "#" + p; - } - - if(post.containsKey("o")){ - o = post.get("o"); - } + if (post.containsKey("sp")) s = post.get("sp") + "#" + s; + if (post.containsKey("pp")) p = post.get("pp") + "#" + p; global = post.containsKey("global"); + if (post.containsKey("load")) { + if (global) { + result = JenaTripleStore.getObject(s, p); + } else { + result = JenaTripleStore.getPrivateObject(s, p, username); + } + } else { + if (global) { + JenaTripleStore.addTriple(s, p, o); + } else { + JenaTripleStore.addTriple(s, p, o, username); + } + } } - if (post.containsKey("load")) { - - if (global) { - o = JenaTripleStore.getObject(s, p); - } else { - o = JenaTripleStore.getPrivateObject(s, p, username); - } - - - } else { - - if (global) { - JenaTripleStore.addTriple(s, p, o); - } else { - JenaTripleStore.addTriple(s, p, o, username); - } - - } - - prop.put("result", o); + prop.put("result", result); return prop; } diff --git a/source/net/yacy/cora/lod/JenaTripleStore.java b/source/net/yacy/cora/lod/JenaTripleStore.java index 7220a8637..40f604f74 100644 --- a/source/net/yacy/cora/lod/JenaTripleStore.java +++ b/source/net/yacy/cora/lod/JenaTripleStore.java @@ -179,11 +179,11 @@ public class JenaTripleStore { } public static String getObject(final String subject, final String predicate) { - 
Log.logInfo("TRIPLESTORE", "GET " + subject + " - " + predicate + " ... "); - Iterator ni = JenaTripleStore.getObjects(subject, predicate); - if (!ni.hasNext()) return ""; - return ni.next().toString(); + String object = ""; + if (ni.hasNext()) object = ni.next().toString(); + Log.logInfo("TRIPLESTORE", "GET " + subject + " - " + predicate + " - " + object); + return object; } public static Iterator getObjects(final String subject, final String predicate) { @@ -192,11 +192,11 @@ public class JenaTripleStore { } public static String getPrivateObject(final String subject, final String predicate, final String username) { - Log.logInfo("TRIPLESTORE", "GET " + subject + " - " + predicate + " ... ("+username+")"); - Iterator ni = JenaTripleStore.getPrivateObjects(subject, predicate, username); - if (!ni.hasNext()) return ""; - return ni.next().toString(); + String object = ""; + if (ni.hasNext()) object = ni.next().toString(); + Log.logInfo("TRIPLESTORE", "GET (" + username + ") " + subject + " - " + predicate + " - " + object); + return object; } private static Iterator getPrivateObjects(final String subject, final String predicate, final String username) {