added more methods to vocabulary generation: scrape document title and

document author to vocabulary
pull/1/head
Michael Peter Christen 13 years ago
parent b2d1c25ebb
commit 22d5e33c5e

@ -3,6 +3,59 @@
<head> <head>
<title>YaCy '#[clientname]#': Federated Index</title> <title>YaCy '#[clientname]#': Federated Index</title>
#%env/templates/metas.template%# #%env/templates/metas.template%#
<script type="text/javascript">
//<![CDATA[
/**
 * Reads the current content of the 'discoverobjectspace' input inside the
 * form with id 'searchform' and hands it to search() as the query.
 */
function xmlhttpPost() {
    var objectspace = document.getElementById('searchform').discoverobjectspace.value;
    search(objectspace);
}
/**
 * Sends an asynchronous GET request to yacysearch.json for the given query
 * (combined with an "inurl:" restriction on the same term) and passes the
 * response text to updatepage() once the request has completed.
 *
 * @param query the raw, un-encoded search term (typically a URL stub)
 */
function search(query) {
    // Keep the request in a local variable: storing it on `this` (the global
    // object for a plain function call) let overlapping requests overwrite
    // each other, so a callback could read the state of the wrong request.
    var request = null;
    if (window.XMLHttpRequest) { // Mozilla/Safari
        request = new XMLHttpRequest();
    } else if (window.ActiveXObject) { // IE
        request = new ActiveXObject("Microsoft.XMLHTTP");
    }
    if (request === null) return; // no XHR implementation available

    // The term usually is a URL; it must be percent-encoded so that
    // characters like '&', '#', '?' or '+' do not break the query string.
    // The literal '+' between both parts is the encoded space.
    var term = encodeURIComponent(query);
    request.open('GET', "yacysearch.json?verify=false&resource=local&maximumRecords=100&nav=none&query=" + term + "+inurl:" + term, true);
    request.setRequestHeader('Content-Type', 'application/x-www-form-urlencoded');
    request.onreadystatechange = function() {
        // readyState 4 == request finished; only render successful responses
        if (request.readyState == 4 && request.status == 200) {
            updatepage(request.responseText);
        }
    };
    request.send(null);
}
/**
 * Renders the search response into the element with id 'searchresults'.
 * If an element with id 'raw' exists, the unparsed response is shown there.
 * Each result link becomes a table row; clicking a row copies the link into
 * the 'discoverobjectspace' input of the form with id 'searchform'.
 *
 * @param str the response text of the yacysearch.json request
 */
function updatepage(str) {
    // Escapes text for safe embedding into HTML element content.
    function escapeHtml(text) {
        return String(text)
            .replace(/&/g, "&amp;")
            .replace(/</g, "&lt;")
            .replace(/>/g, "&gt;")
            .replace(/"/g, "&quot;")
            .replace(/'/g, "&#39;");
    }

    var raw = document.getElementById("raw");
    if (raw != null) raw.innerHTML = str;

    // NOTE(review): eval() executes whatever the server returns. If the
    // endpoint is guaranteed to emit strict JSON, switch to JSON.parse(str).
    var rsp = eval("(" + str + ")");
    var firstChannel = rsp.channels[0];

    // Strip ALL thousands separators; without the global flag only the first
    // one was removed and e.g. "1,234,567" never compared as greater than 0.
    var totalResults = String(firstChannel.totalResults).replace(/[,.]/g, "");

    var html = "";
    if (totalResults > 0 && firstChannel.items.length > 0) {
        html += "<table class=\"networkTable\" border=\"0\" cellpadding=\"2\" cellspacing=\"1\" width=\"99%\">";
        html += "<tr class=\"TableHeader\" valign=\"bottom\">";
        html += "<td>URL from index (total results = " + escapeHtml(totalResults) + ")<\/td><\/tr>";
        for (var i = 0; i < firstChannel.items.length; i++) {
            // The handler reads the link back from the cell text instead of
            // embedding it into a script string, and it targets the input
            // field (a form element itself has no usable 'value').
            html += "<tr class=\"TableCellLight\"><td align=\"left\" onclick=\"document.getElementById('searchform').discoverobjectspace.value=this.textContent || this.innerText;\">" + escapeHtml(firstChannel.items[i].link) + "<\/td><\/tr>";
        }
        html += "<\/table>";
    }
    document.getElementById("searchresults").innerHTML = html;
}
//]]>
</script>
</head> </head>
<body id="Vocabulary_p" onload="document.getElementById('newterm').focus()"> <body id="Vocabulary_p" onload="document.getElementById('newterm').focus()">
#%env/templates/header.template%# #%env/templates/header.template%#
@ -31,14 +84,17 @@
</form> </form>
#(create)#:: #(create)#::
<form action="Vocabulary_p.html" method="get" accept-charset="UTF-8"> <!--<form action="Vocabulary_p.html" id="searchform" method="get" accept-charset="UTF-8" onkeyup="xmlhttpPost(); return false;">-->
<form action="Vocabulary_p.html" id="searchform" method="get" accept-charset="UTF-8" >
<fieldset><legend>Vocabulary Production</legend> <fieldset><legend>Vocabulary Production</legend>
It is possible to produce a vocabulary out of the existing search index. This is done using a given 'objectspace' which you can enter as a URL Stub. It is possible to produce a vocabulary out of the existing search index. This is done using a given 'objectspace' which you can enter as a URL Stub.
This stub is used to find all matching URLs. If the remaining path from the matching URLs then denotes a single file, the file name is used as vocabulary term. This stub is used to find all matching URLs. If the remaining path from the matching URLs then denotes a single file, the file name is used as vocabulary term.
This works best with wikis. Try to use a wiki url as objectspace path. This works best with wikis. Try to use a wiki url as objectspace path.
<dl> <dl>
<dt>Vocabulary Name</dt><dd><input type="text" name="discovername" value="" size="16" maxlength="128" /></dd> <dt>Vocabulary Name</dt><dd><input type="text" name="discovername" value="" size="16" maxlength="128" /></dd>
<dt>Objectspace</dt><dd><input type="text" name="discoverobjectspace" value="http://" size="78" maxlength="128" /></dd> <dt>Objectspace</dt><dd><input type="text" name="discoverobjectspace" value="http://" size="78" maxlength="128" />
<div id="searchresults"></div></dd>
<dt>Discover Terms from</dt><dd><input type="radio" name="discovermethod" value="path" checked="checked" />object link file name&nbsp;&nbsp;<input type="radio" name="discovermethod" value="title" />object page title&nbsp;&nbsp;<input type="radio" name="discovermethod" value="titlesplitted" />object page title (splitted)&nbsp;&nbsp;<input type="radio" name="discovermethod" value="author" />object page author</dd>
<dt></dt><dd><input type="submit" name="create" value="Create" /></dd> <dt></dt><dd><input type="submit" name="create" value="Create" /></dd>
</dl> </dl>
</fieldset> </fieldset>

@ -35,6 +35,7 @@ import net.yacy.cora.lod.vocabulary.YaCyMetadata;
import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.RequestHeader;
import net.yacy.document.LibraryProvider; import net.yacy.document.LibraryProvider;
import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.logging.Log;
import net.yacy.search.Switchboard; import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants; import net.yacy.search.SwitchboardConstants;
@ -50,44 +51,69 @@ public class Vocabulary_p {
Collection<Tagging> vocs = LibraryProvider.autotagging.getVocabularies(); Collection<Tagging> vocs = LibraryProvider.autotagging.getVocabularies();
String vocabularyName = (post == null) ? null : post.get("vocabulary", null); String vocabularyName = (post == null) ? null : post.get("vocabulary", null);
String discovername = (post == null) ? null : post.get("discovername", null);
Tagging vocabulary = vocabularyName == null ? null : LibraryProvider.autotagging.getVocabulary(vocabularyName); Tagging vocabulary = vocabularyName == null ? null : LibraryProvider.autotagging.getVocabulary(vocabularyName);
if (vocabulary == null) vocabularyName = null; if (vocabulary == null) vocabularyName = null;
int count = 0;
for (Tagging v: vocs) {
prop.put("vocabularyset_" + count + "_name", v.getName());
prop.put("vocabularyset_" + count + "_selected", (vocabularyName != null && vocabularyName.equals(v.getName())) ? 1 : 0);
count++;
}
prop.put("vocabularyset", count);
if (post != null) { if (post != null) {
try { try {
if (vocabulary == null) { if (vocabulary == null) {
// create a vocabulary // create a vocabulary
String discovername = post.get("discovername", ""); if (discovername != null && discovername.length() > 0) {
if (discovername.length() > 0) {
String discoverobjectspace = post.get("discoverobjectspace", ""); String discoverobjectspace = post.get("discoverobjectspace", "");
MultiProtocolURI discoveruri = null; MultiProtocolURI discoveruri = null;
if (discoverobjectspace.length() > 0) try {discoveruri = new MultiProtocolURI(discoverobjectspace);} catch (MalformedURLException e) {} if (discoverobjectspace.length() > 0) try {discoveruri = new MultiProtocolURI(discoverobjectspace);} catch (MalformedURLException e) {}
if (discoveruri == null) discoverobjectspace = ""; if (discoveruri == null) discoverobjectspace = "";
Map<String, Tagging.SOTuple> table = new TreeMap<String, Tagging.SOTuple>(); Map<String, Tagging.SOTuple> table = new TreeMap<String, Tagging.SOTuple>();
File propFile = LibraryProvider.autotagging.getVocabularyFile(discovername); File propFile = LibraryProvider.autotagging.getVocabularyFile(discovername);
boolean discoverFromPath = post.get("discovermethod", "").equals("path");
boolean discoverFromTitle = post.get("discovermethod", "").equals("title");
boolean discoverFromTitleSplitted = post.get("discovermethod", "").equals("titlesplitted");
boolean discoverFromAuthor = post.get("discovermethod", "").equals("author");
if (discoveruri != null) { if (discoveruri != null) {
String segmentName = sb.getConfig(SwitchboardConstants.SEGMENT_PUBLIC, "default"); String segmentName = sb.getConfig(SwitchboardConstants.SEGMENT_PUBLIC, "default");
Segment segment = sb.indexSegments.segment(segmentName); Segment segment = sb.indexSegments.segment(segmentName);
Iterator<DigestURI> ui = segment.urlSelector(discoveruri); Iterator<DigestURI> ui = segment.urlSelector(discoveruri);
String t;
while (ui.hasNext()) { while (ui.hasNext()) {
DigestURI u = ui.next(); DigestURI u = ui.next();
String u0 = u.toNormalform(true, false); String u0 = u.toNormalform(true, false);
String t = u0.substring(discoverobjectspace.length()); t = "";
if (t.indexOf('/') >= 0) continue; if (discoverFromPath) {
int p = t.indexOf('.'); t = u0.substring(discoverobjectspace.length());
if (p >= 0) t = t.substring(0, p); if (t.indexOf('/') >= 0) continue;
while ((p = t.indexOf(':')) >= 0) t = t.substring(p + 1); int p = t.indexOf('.');
while ((p = t.indexOf('=')) >= 0) t = t.substring(p + 1); if (p >= 0) t = t.substring(0, p);
if (p >= 0) t = t.substring(p + 1); while ((p = t.indexOf(':')) >= 0) t = t.substring(p + 1);
while ((p = t.indexOf('=')) >= 0) t = t.substring(p + 1);
if (p >= 0) t = t.substring(p + 1);
}
if (discoverFromTitle || discoverFromTitleSplitted) {
URIMetadataRow m = segment.urlMetadata().load(u.hash());
if (m != null) t = m.dc_title();
}
if (discoverFromAuthor) {
URIMetadataRow m = segment.urlMetadata().load(u.hash());
if (m != null) t = m.dc_creator();
}
t = t.replaceAll("\"", " ").replaceAll("'", " ").replaceAll(",", " ").replaceAll(" ", " ").trim();
if (t.length() == 0) continue; if (t.length() == 0) continue;
table.put(t, new Tagging.SOTuple("", u0)); if (discoverFromTitleSplitted) {
String[] ts = t.split(" ");
for (String s: ts) {
if (s.length() == 0) continue;
table.put(s, new Tagging.SOTuple("", u0));
}
} else if (discoverFromAuthor) {
String[] ts = t.split(";"); // author names are often separated by ';'
for (String s: ts) {
if (s.length() == 0) continue;
int p = s.indexOf(','); // check if there is a reversed method to mention the name
if (p >= 0) s = s.substring(p + 1).trim() + " " + s.substring(0, p).trim();
table.put(s, new Tagging.SOTuple("", u0));
}
} else {
table.put(t, new Tagging.SOTuple("", u0));
}
} }
} }
Tagging newvoc = new Tagging(discovername, propFile, discoverobjectspace, table); Tagging newvoc = new Tagging(discovername, propFile, discoverobjectspace, table);
@ -143,6 +169,14 @@ public class Vocabulary_p {
} }
} }
int count = 0;
for (Tagging v: vocs) {
prop.put("vocabularyset_" + count + "_name", v.getName());
prop.put("vocabularyset_" + count + "_selected", ((vocabularyName != null && vocabularyName.equals(v.getName())) || (discovername != null && discovername.equals(v.getName()))) ? 1 : 0);
count++;
}
prop.put("vocabularyset", count);
prop.put("create", vocabularyName == null ? 1 : 0); prop.put("create", vocabularyName == null ? 1 : 0);
if (vocabulary == null) { if (vocabulary == null) {

@ -98,55 +98,37 @@ public class Triple {
String s = ""; String s = "";
String p = ""; String p = "";
String o = ""; String o = "";
String result = "";
Boolean global = false; Boolean global = false;
if(post != null){ if (post != null) {
if(post.containsKey("s")){ s = post.get("s", "");
s = post.get("s"); p = post.get("p", "");
} o = post.get("o", "");
if(post.containsKey("sp")){ if (post.containsKey("sp")) s = post.get("sp") + "#" + s;
s = post.get("sp") + "#" + s; if (post.containsKey("pp")) p = post.get("pp") + "#" + p;
}
if(post.containsKey("p")){
p = post.get("p");
}
if(post.containsKey("pp")){
p = post.get("pp") + "#" + p;
}
if(post.containsKey("o")){
o = post.get("o");
}
global = post.containsKey("global"); global = post.containsKey("global");
if (post.containsKey("load")) {
if (global) {
result = JenaTripleStore.getObject(s, p);
} else {
result = JenaTripleStore.getPrivateObject(s, p, username);
}
} else {
if (global) {
JenaTripleStore.addTriple(s, p, o);
} else {
JenaTripleStore.addTriple(s, p, o, username);
}
}
} }
if (post.containsKey("load")) { prop.put("result", result);
if (global) {
o = JenaTripleStore.getObject(s, p);
} else {
o = JenaTripleStore.getPrivateObject(s, p, username);
}
} else {
if (global) {
JenaTripleStore.addTriple(s, p, o);
} else {
JenaTripleStore.addTriple(s, p, o, username);
}
}
prop.put("result", o);
return prop; return prop;
} }

@ -179,11 +179,11 @@ public class JenaTripleStore {
} }
public static String getObject(final String subject, final String predicate) { public static String getObject(final String subject, final String predicate) {
Log.logInfo("TRIPLESTORE", "GET " + subject + " - " + predicate + " ... ");
Iterator<RDFNode> ni = JenaTripleStore.getObjects(subject, predicate); Iterator<RDFNode> ni = JenaTripleStore.getObjects(subject, predicate);
if (!ni.hasNext()) return ""; String object = "";
return ni.next().toString(); if (ni.hasNext()) object = ni.next().toString();
Log.logInfo("TRIPLESTORE", "GET " + subject + " - " + predicate + " - " + object);
return object;
} }
public static Iterator<RDFNode> getObjects(final String subject, final String predicate) { public static Iterator<RDFNode> getObjects(final String subject, final String predicate) {
@ -192,11 +192,11 @@ public class JenaTripleStore {
} }
public static String getPrivateObject(final String subject, final String predicate, final String username) { public static String getPrivateObject(final String subject, final String predicate, final String username) {
Log.logInfo("TRIPLESTORE", "GET " + subject + " - " + predicate + " ... ("+username+")");
Iterator<RDFNode> ni = JenaTripleStore.getPrivateObjects(subject, predicate, username); Iterator<RDFNode> ni = JenaTripleStore.getPrivateObjects(subject, predicate, username);
if (!ni.hasNext()) return ""; String object = "";
return ni.next().toString(); if (ni.hasNext()) object = ni.next().toString();
Log.logInfo("TRIPLESTORE", "GET (" + username + ") " + subject + " - " + predicate + " - " + object);
return object;
} }
private static Iterator<RDFNode> getPrivateObjects(final String subject, final String predicate, final String username) { private static Iterator<RDFNode> getPrivateObjects(final String subject, final String predicate, final String username) {

Loading…
Cancel
Save