Added more methods to vocabulary generation: scrape the document title and

document author into the vocabulary
13 years ago · 22d5e33c5e
parent b2d1c25ebb
commit 22d5e33c5e
4 changed files with 139 additions and 67 deletions
--- a/htroot/Vocabulary_p.html
+++ b/htroot/Vocabulary_p.html
@ -3,6 +3,59 @@
  <head>
    <title>YaCy '#[clientname]#': Federated Index</title>
    #%env/templates/metas.template%#
  <script type="text/javascript">
 //<![CDATA[
 /**
  * Reads the current value of the 'discoverobjectspace' field of the
  * 'searchform' form and starts an index lookup for it.
  */
 function xmlhttpPost() {
    var objectspaceField = document.getElementById('searchform').discoverobjectspace;
    search(objectspaceField.value);
 }
 /**
  * Sends an asynchronous GET request to yacysearch.json for the given query
  * (matched both as a search term and as an inurl: constraint) and hands the
  * response text to updatepage() once the request has completed successfully.
  *
  * @param query the objectspace / URL stub to look up
  */
 function search(query) {
    // keep the request in a local variable: storing it on the global object
    // lets overlapping requests overwrite each other before they complete
    var request = null;
    if (window.XMLHttpRequest) { // Mozilla/Safari
        request = new XMLHttpRequest();
    }
    else if (window.ActiveXObject) { // IE
        request = new ActiveXObject("Microsoft.XMLHTTP");
    }
    if (request == null) return; // no XMLHttpRequest implementation available
    // URL stubs contain characters such as ':', '/', '&' and '?', which would
    // otherwise break the query string
    var encodedQuery = encodeURIComponent(query);
    request.open('GET', "yacysearch.json?verify=false&resource=local&maximumRecords=100&nav=none&query=" + encodedQuery + "+inurl:" + encodedQuery, true);
    request.setRequestHeader('Content-Type', 'application/x-www-form-urlencoded');
    request.onreadystatechange = function() {
        // readyState 4 = request finished; only a successful answer carries a result document
        if (request.readyState == 4 && request.status == 200) {
            updatepage(request.responseText);
        }
    };
    request.send(null);
 }
 /**
  * Renders the response of yacysearch.json into the 'searchresults' element
  * as a table with one row per result link. Clicking a row copies its link
  * into the 'discoverobjectspace' field of the 'searchform' form. If an
  * element with id 'raw' exists, the unparsed response text is shown there.
  *
  * @param str the response text, expected to be a JSON document with a
  *            'channels' array whose first entry has 'totalResults' and 'items'
  */
 function updatepage(str) {
   // escapes text for use in HTML content and in double-quoted attribute values
   function escapeHtml(text) {
     return String(text).replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;").replace(/"/g, "&quot;");
   }
   var raw = document.getElementById("raw");
   if (raw != null) raw.innerHTML = escapeHtml(str);
   // NOTE(review): eval() executes whatever the server returned; consider
   // JSON.parse once it is confirmed that yacysearch.json always emits strict JSON
   var rsp = eval("("+str+")");
   var firstChannel = rsp.channels[0];
   // strip ALL thousands separators (e.g. "1,234,567"), not only the first one,
   // otherwise the numeric comparison below fails for large result counts
   var totalResults = String(firstChannel.totalResults).replace(/[,.]/g, "");
   var html = "";
   if (totalResults > 0 && firstChannel.items.length > 0) {
     var item;
     var link;
     html += "<table class=\"networkTable\" border=\"0\" cellpadding=\"2\" cellspacing=\"1\" width=\"99%\">";
     html += "<tr class=\"TableHeader\" valign=\"bottom\">";
     html += "<td>URL from index (total results = " + escapeHtml(totalResults) + ")<\/td><\/tr>";
     for (var i = 0; i < firstChannel.items.length; i++) {
         item = firstChannel.items[i];
         // links come from indexed web content, so they are escaped and handed to
         // the click handler through an attribute instead of an inline script string
         link = escapeHtml(item.link);
         html += "<tr class=\"TableCellLight\"><td align=\"left\" data-link=\"" + link + "\" onclick=\"document.getElementById('searchform').discoverobjectspace.value=this.getAttribute('data-link');\">" + link + "<\/td><\/tr>";
     }
     html += "<\/table>";
   }
   document.getElementById("searchresults").innerHTML = html;
 }
 //]]>
 </script>
  </head>
  <body id="Vocabulary_p" onload="document.getElementById('newterm').focus()">
    #%env/templates/header.template%#
@ -31,14 +84,17 @@
    </form>
    #(create)#::
-    <form action="Vocabulary_p.html" method="get" accept-charset="UTF-8">
+    <!--<form action="Vocabulary_p.html" id="searchform" method="get" accept-charset="UTF-8" onkeyup="xmlhttpPost(); return false;">-->
    <form action="Vocabulary_p.html" id="searchform" method="get" accept-charset="UTF-8" >
      <fieldset><legend>Vocabulary Production</legend>
      It is possible to produce a vocabulary out of the existing search index. This is done using a given 'objectspace' which you can enter as a URL Stub.
      This stub is used to find all matching URLs. If the remaining path from the matching URLs then denotes a single file, the file name is used as vocabulary term.
      This works best with wikis. Try to use a wiki url as objectspace path.
      <dl>
        <dt>Vocabulary Name</dt><dd><input type="text" name="discovername" value="" size="16" maxlength="128" /></dd>
-        <dt>Objectspace</dt><dd><input type="text" name="discoverobjectspace" value="http://" size="78" maxlength="128" /></dd>
+        <dt>Objectspace</dt><dd><input type="text" name="discoverobjectspace" value="http://" size="78" maxlength="128" />
            <div id="searchresults"></div></dd>
        <dt>Discover Terms from</dt><dd><input type="radio" name="discovermethod" value="path" checked="checked" />object link file name&nbsp;&nbsp;<input type="radio" name="discovermethod" value="title" />object page title&nbsp;&nbsp;<input type="radio" name="discovermethod" value="titlesplitted" />object page title (splitted)&nbsp;&nbsp;<input type="radio" name="discovermethod" value="author" />object page author</dd>
        <dt></dt><dd><input type="submit" name="create" value="Create" /></dd>
      </dl>
    </fieldset>
--- a/htroot/Vocabulary_p.java
+++ b/htroot/Vocabulary_p.java
@ -35,6 +35,7 @@ import net.yacy.cora.lod.vocabulary.YaCyMetadata;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.document.LibraryProvider;
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.data.meta.URIMetadataRow;
 import net.yacy.kelondro.logging.Log;
 import net.yacy.search.Switchboard;
 import net.yacy.search.SwitchboardConstants;
@ -50,44 +51,69 @@ public class Vocabulary_p {
        Collection<Tagging> vocs = LibraryProvider.autotagging.getVocabularies();
        String vocabularyName = (post == null) ? null : post.get("vocabulary", null);
        String discovername = (post == null) ? null : post.get("discovername", null);
        Tagging vocabulary = vocabularyName == null ? null : LibraryProvider.autotagging.getVocabulary(vocabularyName);
        if (vocabulary == null) vocabularyName = null;
        int count = 0;
        for (Tagging v: vocs) {
            prop.put("vocabularyset_" + count + "_name", v.getName());
            prop.put("vocabularyset_" + count + "_selected", (vocabularyName != null && vocabularyName.equals(v.getName())) ? 1 : 0);
            count++;
        }
        prop.put("vocabularyset", count);
        if (post != null) {
            try {
                if (vocabulary == null) {
                    // create a vocabulary
-                    String discovername = post.get("discovername", "");
+                    if (discovername != null && discovername.length() > 0) {
                    if (discovername.length() > 0) {
                        String discoverobjectspace = post.get("discoverobjectspace", "");
                        MultiProtocolURI discoveruri = null;
                        if (discoverobjectspace.length() > 0) try {discoveruri = new MultiProtocolURI(discoverobjectspace);} catch (MalformedURLException e) {}
                        if (discoveruri == null) discoverobjectspace = "";
                        Map<String, Tagging.SOTuple> table = new TreeMap<String, Tagging.SOTuple>();
                        File propFile = LibraryProvider.autotagging.getVocabularyFile(discovername);
                        boolean discoverFromPath = post.get("discovermethod", "").equals("path");
                        boolean discoverFromTitle = post.get("discovermethod", "").equals("title");
                        boolean discoverFromTitleSplitted = post.get("discovermethod", "").equals("titlesplitted");
                        boolean discoverFromAuthor = post.get("discovermethod", "").equals("author");
                        if (discoveruri != null) {
                            String segmentName = sb.getConfig(SwitchboardConstants.SEGMENT_PUBLIC, "default");
                            Segment segment = sb.indexSegments.segment(segmentName);
                            Iterator<DigestURI> ui = segment.urlSelector(discoveruri);
                            String t;
                            while (ui.hasNext()) {
                                DigestURI u = ui.next();
                                String u0 = u.toNormalform(true, false);
-                                String t = u0.substring(discoverobjectspace.length());
+                                t = "";
-                                if (t.indexOf('/') >= 0) continue;
+                                if (discoverFromPath) {
-                                int p = t.indexOf('.');
+                                    t = u0.substring(discoverobjectspace.length());
-                                if (p >= 0) t = t.substring(0, p);
+                                    if (t.indexOf('/') >= 0) continue;
-                                while ((p = t.indexOf(':')) >= 0) t = t.substring(p + 1);
+                                    int p = t.indexOf('.');
-                                while ((p = t.indexOf('=')) >= 0) t = t.substring(p + 1);
+                                    if (p >= 0) t = t.substring(0, p);
-                                if (p >= 0) t = t.substring(p + 1);
+                                    while ((p = t.indexOf(':')) >= 0) t = t.substring(p + 1);
                                    while ((p = t.indexOf('=')) >= 0) t = t.substring(p + 1);
                                    if (p >= 0) t = t.substring(p + 1);
                                }
                                if (discoverFromTitle || discoverFromTitleSplitted) {
                                    URIMetadataRow m = segment.urlMetadata().load(u.hash());
                                    if (m != null) t = m.dc_title();
                                }
                                if (discoverFromAuthor) {
                                    URIMetadataRow m = segment.urlMetadata().load(u.hash());
                                    if (m != null) t = m.dc_creator();
                                }
                                t = t.replaceAll("\"", " ").replaceAll("'", " ").replaceAll(",", " ").replaceAll("  ", " ").trim();
                                if (t.length() == 0) continue;
-                                table.put(t, new Tagging.SOTuple("", u0));
+                                if (discoverFromTitleSplitted) {
                                    String[] ts = t.split(" ");
                                    for (String s: ts) {
                                        if (s.length() == 0) continue;
                                        table.put(s, new Tagging.SOTuple("", u0));
                                    }
                                } else if (discoverFromAuthor) {
                                    String[] ts = t.split(";"); // author names are often separated by ';'
                                    for (String s: ts) {
                                        if (s.length() == 0) continue;
                                        int p = s.indexOf(','); // check if there is a reversed method to mention the name
                                        if (p >= 0) s = s.substring(p + 1).trim() + " " + s.substring(0, p).trim();
                                        table.put(s, new Tagging.SOTuple("", u0));
                                    }
                                } else {
                                    table.put(t, new Tagging.SOTuple("", u0));
                                }
                            }
                        }
                        Tagging newvoc = new Tagging(discovername, propFile, discoverobjectspace, table);
@ -143,6 +169,14 @@ public class Vocabulary_p {
            }
        }
        int count = 0;
        for (Tagging v: vocs) {
            prop.put("vocabularyset_" + count + "_name", v.getName());
            prop.put("vocabularyset_" + count + "_selected", ((vocabularyName != null && vocabularyName.equals(v.getName())) || (discovername != null && discovername.equals(v.getName()))) ? 1 : 0);
            count++;
        }
        prop.put("vocabularyset", count);
        prop.put("create", vocabularyName == null ? 1 : 0);
        if (vocabulary == null) {
--- a/htroot/interaction/Triple.java
+++ b/htroot/interaction/Triple.java
@ -98,55 +98,37 @@ public class Triple {
        String s = "";
        String p = "";
        String o = "";
        String result = "";
        Boolean global = false;
-        if(post != null){
+        if (post != null) {
-            if(post.containsKey("s")){
+            s = post.get("s", "");
-            	s = post.get("s");
+            p = post.get("p", "");
-            }
+            o = post.get("o", "");
-            if(post.containsKey("sp")){
+            if (post.containsKey("sp")) s = post.get("sp") + "#" + s;
-            	s = post.get("sp") + "#" + s;
+            if (post.containsKey("pp")) p = post.get("pp") + "#" + p;
            }
            if(post.containsKey("p")){
            	p = post.get("p");
            }
            if(post.containsKey("pp")){
            	p = post.get("pp") + "#" + p;
            }
            if(post.containsKey("o")){
            	o = post.get("o");
            }
            global = post.containsKey("global");
            if (post.containsKey("load")) {
                if (global) {
                    result = JenaTripleStore.getObject(s, p);
                } else {
                    result = JenaTripleStore.getPrivateObject(s, p, username);
                }
            } else {
                if (global) {
                    JenaTripleStore.addTriple(s, p, o);
                } else {
                    JenaTripleStore.addTriple(s, p, o, username);
                }
            }
        }
-        if (post.containsKey("load")) {
+        prop.put("result", result);
        	if (global) {
        		o = JenaTripleStore.getObject(s, p);
        	} else {
        		o = JenaTripleStore.getPrivateObject(s, p, username);
        	}
        } else {
        	if (global) {
        		JenaTripleStore.addTriple(s, p, o);
        	} else {
        		JenaTripleStore.addTriple(s, p, o, username);
        	}
        }
        prop.put("result", o);
        return prop;
    }
--- a/source/net/yacy/cora/lod/JenaTripleStore.java
+++ b/source/net/yacy/cora/lod/JenaTripleStore.java
@ -179,11 +179,11 @@ public class JenaTripleStore {
    }
    /**
     * Looks up the objects stored for the given subject and predicate via
     * {@link #getObjects(String, String)} and returns the string form of the
     * first one. The request and its outcome are reported through
     * Log.logInfo under the "TRIPLESTORE" tag.
     *
     * @param subject the subject of the triple to look up
     * @param predicate the predicate of the triple to look up
     * @return the first matching object as a string, or an empty string if
     *         the lookup yields no object
     */
    public static String getObject(final String subject, final String predicate) {
    	Log.logInfo("TRIPLESTORE", "GET " + subject + " - " + predicate + " ... ");
    	Iterator<RDFNode> ni = JenaTripleStore.getObjects(subject, predicate);
-        if (!ni.hasNext()) return "";
+    	String object = "";
-        return ni.next().toString();
+        if (ni.hasNext()) object = ni.next().toString();
        Log.logInfo("TRIPLESTORE", "GET " + subject + " - " + predicate + " - " + object);
        return object;
    }
    public static Iterator<RDFNode> getObjects(final String subject, final String predicate) {
@ -192,11 +192,11 @@ public class JenaTripleStore {
    }
    /**
     * Looks up the objects stored for the given subject and predicate on
     * behalf of the given user via getPrivateObjects and returns the string
     * form of the first one. The request and its outcome are reported through
     * Log.logInfo under the "TRIPLESTORE" tag, including the user name.
     *
     * @param subject the subject of the triple to look up
     * @param predicate the predicate of the triple to look up
     * @param username the user name passed on to the private lookup
     * @return the first matching object as a string, or an empty string if
     *         the lookup yields no object
     */
    public static String getPrivateObject(final String subject, final String predicate, final String username) {
    	Log.logInfo("TRIPLESTORE", "GET " + subject + " - " + predicate + " ... ("+username+")");
    	Iterator<RDFNode> ni = JenaTripleStore.getPrivateObjects(subject, predicate, username);
-        if (!ni.hasNext()) return "";
+        String object = "";
-        return ni.next().toString();
+        if (ni.hasNext()) object = ni.next().toString();
        Log.logInfo("TRIPLESTORE", "GET (" + username + ") " + subject + " - " + predicate + " - " + object);
        return object;
    }
    private static Iterator<RDFNode> getPrivateObjects(final String subject, final String predicate, final String username) {