From 22d5e33c5e522386b7c6609c6690eec99cc88b5e Mon Sep 17 00:00:00 2001
From: Michael Peter Christen <mc@yacy.net>
Date: Sun, 17 Jun 2012 14:53:16 +0200
Subject: [PATCH] added more methods to vocabulary generation: scrape document
 title and document author to vocabulary

---
 htroot/Vocabulary_p.html                      | 60 +++++++++++++++-
 htroot/Vocabulary_p.java                      | 70 ++++++++++++++-----
 htroot/interaction/Triple.java                | 60 ++++++----------
 source/net/yacy/cora/lod/JenaTripleStore.java | 16 ++---
 4 files changed, 139 insertions(+), 67 deletions(-)
diff --git a/htroot/Vocabulary_p.html b/htroot/Vocabulary_p.html
index 7a2fe2bf4..c073b9abf 100644
--- a/htroot/Vocabulary_p.html
+++ b/htroot/Vocabulary_p.html
@@ -3,6 +3,59 @@
   <head>
     <title>YaCy '#[clientname]#': Federated Index</title>
     #%env/templates/metas.template%#
+  <script type="text/javascript">
+//<![CDATA[
+function xmlhttpPost() { // takes the current objectspace input of the search form and starts a search for it
+    var form = document.getElementById('searchform');
+    search(form.discoverobjectspace.value);
+}
+
+function search(query) { // asynchronously asks the local index for the term and for URLs containing it; the response goes to updatepage
+    var request = null; // kept local: storing it on 'this' would leak the request object into the global scope
+    var q = encodeURIComponent(query); // the input is typically a URL stub; '&', '#', '+' or '%' would otherwise corrupt the query string
+    if (window.XMLHttpRequest) { // Mozilla/Safari
+        request = new XMLHttpRequest();
+    }
+    else if (window.ActiveXObject) { // IE
+        request = new ActiveXObject("Microsoft.XMLHTTP");
+    }
+    if (request == null) return; // neither XHR implementation is available, nothing can be requested
+    request.open('GET', "yacysearch.json?verify=false&resource=local&maximumRecords=100&nav=none&query=" + q + "+inurl:" + q, true);
+    request.onreadystatechange = function() {
+        if (request.readyState == 4 && request.status == 200) { // only a completed, successful response is expected to carry JSON
+            updatepage(request.responseText);
+        }
+    };
+    request.send(null); // bodyless GET, therefore no Content-Type request header is set
+}
+
+function updatepage(str) { // renders the JSON search response in str as a table of matching URLs inside the 'searchresults' element
+  // escape text before it is inserted as HTML: the listed links come from the search index and must not be interpreted as markup
+  var esc = function(s) { return String(s).replace(/&/g,"&amp;").replace(/</g,"&lt;").replace(/>/g,"&gt;").replace(/"/g,"&quot;"); };
+  var raw = document.getElementById("raw"); // optional element that shows the unparsed response
+  if (raw != null) raw.innerHTML = esc(str);
+  // NOTE(review): eval executes the response as script; replace with JSON.parse once the response is confirmed to be strict JSON
+  var rsp = eval("("+str+")");
+  var firstChannel = rsp.channels[0];
+  // remove ALL thousands separators (global flag): removing only the first one left e.g. "1234,567", which compares as NaN below
+  var totalResults = String(firstChannel.totalResults).replace(/[,.]/g,"");
+  var html = "";
+  
+  if (totalResults > 0 && firstChannel.items.length > 0) {
+    var link;
+    html += "<table class=\"networkTable\" border=\"0\" cellpadding=\"2\" cellspacing=\"1\" width=\"99%\">";
+    html += "<tr class=\"TableHeader\" valign=\"bottom\">";
+    html += "<td>URL from index (total results = " + esc(totalResults) + ")<\/td><\/tr>";
+    for (var i = 0; i < firstChannel.items.length; i++) {
+        link = esc(firstChannel.items[i].link); // NOTE(review): assumes the link arrives unescaped; confirm against the yacysearch.json template
+        html += "<tr class=\"TableCellLight\"><td align=\"left\" onclick=\"document.getElementById('searchform').discoverobjectspace.value=(this.textContent||this.innerText);\">" + link + "<\/td><\/tr>"; // a click copies the cell text into the objectspace input
+    }
+    html += "<\/table>";
+  }
+  document.getElementById("searchresults").innerHTML = html;
+}
+//]]>
+</script>
   </head>
   <body id="Vocabulary_p" onload="document.getElementById('newterm').focus()">
     #%env/templates/header.template%#
@@ -31,14 +84,17 @@
     </form>
     
     #(create)#::
-    <form action="Vocabulary_p.html" method="get" accept-charset="UTF-8">
+    <!--<form action="Vocabulary_p.html" id="searchform" method="get" accept-charset="UTF-8" onkeyup="xmlhttpPost(); return false;">-->
+    <form action="Vocabulary_p.html" id="searchform" method="get" accept-charset="UTF-8" >
       <fieldset><legend>Vocabulary Production</legend>
       It is possible to produce a vocabulary out of the existing search index. This is done using a given 'objectspace' which you can enter as a URL Stub.
       This stub is used to find all matching URLs. If the remaining path from the matching URLs then denotes a single file, the file name is used as vocabulary term.
       This works best with wikis. Try to use a wiki url as objectspace path.
       <dl>
         <dt>Vocabulary Name</dt><dd><input type="text" name="discovername" value="" size="16" maxlength="128" /></dd>
-        <dt>Objectspace</dt><dd><input type="text" name="discoverobjectspace" value="http://" size="78" maxlength="128" /></dd>
+        <dt>Objectspace</dt><dd><input type="text" name="discoverobjectspace" value="http://" size="78" maxlength="128" />
+            <div id="searchresults"></div></dd>
+        <dt>Discover Terms from</dt><dd><input type="radio" name="discovermethod" value="path" checked="checked" />object link file name&nbsp;&nbsp;<input type="radio" name="discovermethod" value="title" />object page title&nbsp;&nbsp;<input type="radio" name="discovermethod" value="titlesplitted" />object page title (splitted)&nbsp;&nbsp;<input type="radio" name="discovermethod" value="author" />object page author</dd>
         <dt></dt><dd><input type="submit" name="create" value="Create" /></dd>
       </dl>
     </fieldset>
diff --git a/htroot/Vocabulary_p.java b/htroot/Vocabulary_p.java
index 5a7eeca93..b1899c254 100644
--- a/htroot/Vocabulary_p.java
+++ b/htroot/Vocabulary_p.java
@@ -35,6 +35,7 @@ import net.yacy.cora.lod.vocabulary.YaCyMetadata;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.document.LibraryProvider;
 import net.yacy.kelondro.data.meta.DigestURI;
+import net.yacy.kelondro.data.meta.URIMetadataRow;
 import net.yacy.kelondro.logging.Log;
 import net.yacy.search.Switchboard;
 import net.yacy.search.SwitchboardConstants;
@@ -50,44 +51,69 @@ public class Vocabulary_p {
         Collection<Tagging> vocs = LibraryProvider.autotagging.getVocabularies();
 
         String vocabularyName = (post == null) ? null : post.get("vocabulary", null);
+        String discovername = (post == null) ? null : post.get("discovername", null);
         Tagging vocabulary = vocabularyName == null ? null : LibraryProvider.autotagging.getVocabulary(vocabularyName);
         if (vocabulary == null) vocabularyName = null;
-        int count = 0;
-        for (Tagging v: vocs) {
-            prop.put("vocabularyset_" + count + "_name", v.getName());
-            prop.put("vocabularyset_" + count + "_selected", (vocabularyName != null && vocabularyName.equals(v.getName())) ? 1 : 0);
-            count++;
-        }
-        prop.put("vocabularyset", count);
-
         if (post != null) {
             try {
                 if (vocabulary == null) {
                     // create a vocabulary
-                    String discovername = post.get("discovername", "");
-                    if (discovername.length() > 0) {
+                    if (discovername != null && discovername.length() > 0) {
                         String discoverobjectspace = post.get("discoverobjectspace", "");
                         MultiProtocolURI discoveruri = null;
                         if (discoverobjectspace.length() > 0) try {discoveruri = new MultiProtocolURI(discoverobjectspace);} catch (MalformedURLException e) {}
                         if (discoveruri == null) discoverobjectspace = "";
                         Map<String, Tagging.SOTuple> table = new TreeMap<String, Tagging.SOTuple>();
                         File propFile = LibraryProvider.autotagging.getVocabularyFile(discovername);
+                        boolean discoverFromPath = post.get("discovermethod", "").equals("path");
+                        boolean discoverFromTitle = post.get("discovermethod", "").equals("title");
+                        boolean discoverFromTitleSplitted = post.get("discovermethod", "").equals("titlesplitted");
+                        boolean discoverFromAuthor = post.get("discovermethod", "").equals("author");
                         if (discoveruri != null) {
                             String segmentName = sb.getConfig(SwitchboardConstants.SEGMENT_PUBLIC, "default");
                             Segment segment = sb.indexSegments.segment(segmentName);
                             Iterator<DigestURI> ui = segment.urlSelector(discoveruri);
+                            String t;
                             while (ui.hasNext()) {
                                 DigestURI u = ui.next();
                                 String u0 = u.toNormalform(true, false);
-                                String t = u0.substring(discoverobjectspace.length());
-                                if (t.indexOf('/') >= 0) continue;
-                                int p = t.indexOf('.');
-                                if (p >= 0) t = t.substring(0, p);
-                                while ((p = t.indexOf(':')) >= 0) t = t.substring(p + 1);
-                                while ((p = t.indexOf('=')) >= 0) t = t.substring(p + 1);
-                                if (p >= 0) t = t.substring(p + 1);
+                                t = "";
+                                if (discoverFromPath) {
+                                    t = u0.substring(discoverobjectspace.length());
+                                    if (t.indexOf('/') >= 0) continue;
+                                    int p = t.indexOf('.');
+                                    if (p >= 0) t = t.substring(0, p);
+                                    while ((p = t.indexOf(':')) >= 0) t = t.substring(p + 1);
+                                    while ((p = t.indexOf('=')) >= 0) t = t.substring(p + 1);
+                                    if (p >= 0) t = t.substring(p + 1);
+                                }
+                                if (discoverFromTitle || discoverFromTitleSplitted) {
+                                    URIMetadataRow m = segment.urlMetadata().load(u.hash());
+                                    if (m != null) t = m.dc_title();
+                                }
+                                if (discoverFromAuthor) {
+                                    URIMetadataRow m = segment.urlMetadata().load(u.hash());
+                                    if (m != null) t = m.dc_creator();
+                                }
+                                t = t.replaceAll("\"", " ").replaceAll("'", " ").replaceAll(",", " ").replaceAll("  ", " ").trim();
                                 if (t.length() == 0) continue;
-                                table.put(t, new Tagging.SOTuple("", u0));
+                                if (discoverFromTitleSplitted) {
+                                    String[] ts = t.split(" ");
+                                    for (String s: ts) {
+                                        if (s.length() == 0) continue;
+                                        table.put(s, new Tagging.SOTuple("", u0));
+                                    }
+                                } else if (discoverFromAuthor) {
+                                    String[] ts = t.split(";"); // author names are often separated by ';'
+                                    for (String s: ts) {
+                                        if (s.length() == 0) continue;
+                                        int p = s.indexOf(','); // check if there is a reversed method to mention the name
+                                        if (p >= 0) s = s.substring(p + 1).trim() + " " + s.substring(0, p).trim();
+                                        table.put(s, new Tagging.SOTuple("", u0));
+                                    }
+                                } else {
+                                    table.put(t, new Tagging.SOTuple("", u0));
+                                }
                             }
                         }
                         Tagging newvoc = new Tagging(discovername, propFile, discoverobjectspace, table);
@@ -143,6 +169,14 @@ public class Vocabulary_p {
             }
         }
 
+        int count = 0;
+        for (Tagging v: vocs) {
+            prop.put("vocabularyset_" + count + "_name", v.getName());
+            prop.put("vocabularyset_" + count + "_selected", ((vocabularyName != null && vocabularyName.equals(v.getName())) || (discovername != null && discovername.equals(v.getName()))) ? 1 : 0);
+            count++;
+        }
+        prop.put("vocabularyset", count);
+
         prop.put("create", vocabularyName == null ? 1 : 0);
 
         if (vocabulary == null) {
diff --git a/htroot/interaction/Triple.java b/htroot/interaction/Triple.java
index 95c1e23f6..58913d028 100644
--- a/htroot/interaction/Triple.java
+++ b/htroot/interaction/Triple.java
@@ -98,55 +98,37 @@ public class Triple {
         String s = "";
         String p = "";
         String o = "";
+        String result = "";
 
         Boolean global = false;
 
-        if(post != null){
+        if (post != null) {
 
-            if(post.containsKey("s")){
-            	s = post.get("s");
-            }
+            s = post.get("s", "");
+            p = post.get("p", "");
+            o = post.get("o", "");
 
-            if(post.containsKey("sp")){
-            	s = post.get("sp") + "#" + s;
-            }
-
-            if(post.containsKey("p")){
-            	p = post.get("p");
-            }
-
-            if(post.containsKey("pp")){
-            	p = post.get("pp") + "#" + p;
-            }
-
-            if(post.containsKey("o")){
-            	o = post.get("o");
-            }
+            if (post.containsKey("sp")) s = post.get("sp") + "#" + s;
+            if (post.containsKey("pp")) p = post.get("pp") + "#" + p;
 
             global = post.containsKey("global");
 
+            if (post.containsKey("load")) {
+                if (global) {
+                    result = JenaTripleStore.getObject(s, p);
+                } else {
+                    result = JenaTripleStore.getPrivateObject(s, p, username);
+                }
+            } else {
+                if (global) {
+                    JenaTripleStore.addTriple(s, p, o);
+                } else {
+                    JenaTripleStore.addTriple(s, p, o, username);
+                }
+            }
         }
 
-        if (post.containsKey("load")) {
-
-        	if (global) {
-        		o = JenaTripleStore.getObject(s, p);
-        	} else {
-        		o = JenaTripleStore.getPrivateObject(s, p, username);
-        	}
-
-
-        } else {
-
-        	if (global) {
-        		JenaTripleStore.addTriple(s, p, o);
-        	} else {
-        		JenaTripleStore.addTriple(s, p, o, username);
-        	}
-
-        }
-
-        prop.put("result", o);
+        prop.put("result", result);
 
         return prop;
     }
diff --git a/source/net/yacy/cora/lod/JenaTripleStore.java b/source/net/yacy/cora/lod/JenaTripleStore.java
index 7220a8637..40f604f74 100644
--- a/source/net/yacy/cora/lod/JenaTripleStore.java
+++ b/source/net/yacy/cora/lod/JenaTripleStore.java
@@ -179,11 +179,11 @@ public class JenaTripleStore {
     }
 
     public static String getObject(final String subject, final String predicate) {
-    	Log.logInfo("TRIPLESTORE", "GET " + subject + " - " + predicate + " ... ");
-
     	Iterator<RDFNode> ni = JenaTripleStore.getObjects(subject, predicate);
-        if (!ni.hasNext()) return "";
-        return ni.next().toString();
+    	String object = ""; // the empty string is returned when the iterator yields no object
+        if (ni.hasNext()) object = ni.next().toString(); // only the first object is used, any further ones are ignored
+        Log.logInfo("TRIPLESTORE", "GET " + subject + " - " + predicate + " - " + object); // logged after the lookup so that the result is part of the message
+        return object;
     }
 
     public static Iterator<RDFNode> getObjects(final String subject, final String predicate) {
@@ -192,11 +192,11 @@ public class JenaTripleStore {
     }
 
     public static String getPrivateObject(final String subject, final String predicate, final String username) {
-    	Log.logInfo("TRIPLESTORE", "GET " + subject + " - " + predicate + " ... ("+username+")");
-
     	Iterator<RDFNode> ni = JenaTripleStore.getPrivateObjects(subject, predicate, username);
-        if (!ni.hasNext()) return "";
-        return ni.next().toString();
+        String object = ""; // the empty string is returned when the iterator yields no object
+        if (ni.hasNext()) object = ni.next().toString(); // only the first object is used, any further ones are ignored
+        Log.logInfo("TRIPLESTORE", "GET (" + username + ") " + subject + " - " + predicate + " - " + object); // logged after the lookup so that the result is part of the message
+        return object;
     }
 
     private static Iterator<RDFNode> getPrivateObjects(final String subject, final String predicate, final String username) {