From 21df1ad9e02dc58497e1e7ee5c4996f3edeb220f Mon Sep 17 00:00:00 2001 From: cominch Date: Fri, 9 Nov 2012 13:48:40 +0100 Subject: [PATCH] update and generalization of the SMW import and content control routines --- htroot/ContentControl_p.html | 31 +--- htroot/ContentControl_p.java | 13 +- .../ContentControlFilterUpdateThread.java | 56 +++--- .../contentcontrol/SMWListImporter.java | 163 +++++++++++++++++ .../contentcontrol/SMWListRow.java | 24 +++ ...portThread.java => SMWListSyncThread.java} | 166 ++++++++++-------- source/net/yacy/search/Switchboard.java | 5 +- source/net/yacy/search/query/SearchEvent.java | 30 ++-- 8 files changed, 329 insertions(+), 159 deletions(-) create mode 100644 source/net/yacy/interaction/contentcontrol/SMWListImporter.java create mode 100644 source/net/yacy/interaction/contentcontrol/SMWListRow.java rename source/net/yacy/interaction/contentcontrol/{ContentControlImportThread.java => SMWListSyncThread.java} (68%) diff --git a/htroot/ContentControl_p.html b/htroot/ContentControl_p.html index 78f78157e..270770eb4 100644 --- a/htroot/ContentControl_p.html +++ b/htroot/ContentControl_p.html @@ -27,20 +27,13 @@ Enables or disables content control.

+ -
-
-

-

- Define a category string. If defined, all URLs will be filtered out during crawling and DHT which do not belong to this category. -

-
- -
+


- Define a bookmark list. Default: contentcontrol + Define a table. Default: contentcontrol

@@ -52,9 +45,9 @@
-
Content Control Settings +
Content Control SMW Import Settings

- With this settings you can define the content control settings. + With this settings you can define the content control import settings. You can define a SMW with the appropriate extensions. Details: yacy-smwextension on Gitorious

@@ -63,7 +56,7 @@
Enabled

- Enable or disable constant background synchronisation of content control list from SMW (Semantic Mediawiki). Requires restart! + Enable or disable constant background synchronization of content control list from SMW (Semantic Mediawiki). Requires restart!

@@ -75,19 +68,11 @@

-
+


- Define import target bookmark list. Default: contentcontrol -

-
- -
-
-

-

- Define default category which is added to each entry. This category can be defined as mandatory default filter list. + Define import target table. Default: contentcontrol

diff --git a/htroot/ContentControl_p.java b/htroot/ContentControl_p.java index b89ff44e4..e39632f15 100644 --- a/htroot/ContentControl_p.java +++ b/htroot/ContentControl_p.java @@ -24,9 +24,7 @@ public final class ContentControl_p { env.setConfig("contentcontrol.smwimport.targetlist", post.get("ccsmwimportlist")); - - env.setConfig("contentcontrol.smwimport.defaultcategory", - post.get("ccsmwimportcat")); + } @@ -35,9 +33,7 @@ public final class ContentControl_p { env.setConfig("contentcontrol.enabled", "on".equals(post.get("contentcontrolenabled")) ? true : false); - env.setConfig("contentcontrol.mandatoryfilterlist", - post.get("contentcontrolmfl")); - + env.setConfig("contentcontrol.bookmarklist", post.get("contentcontrolbml")); @@ -45,8 +41,6 @@ public final class ContentControl_p { } - prop.putHTML("ccsmwimportcat", - env.getConfig("contentcontrol.smwimport.defaultcategory", "yacy")); prop.putHTML("ccsmwimportlist", env.getConfig("contentcontrol.smwimport.targetlist", "contentcontrol")); @@ -64,9 +58,6 @@ public final class ContentControl_p { prop.put("contentcontrolenabled_checked", env.getConfigBool("contentcontrol.enabled", false) ? "1" : "0"); - prop.putHTML("contentcontrolmfl", - env.getConfig("contentcontrol.mandatoryfilterlist", "yacy")); - prop.putHTML("contentcontrolbml", env.getConfig("contentcontrol.bookmarklist", "")); diff --git a/source/net/yacy/interaction/contentcontrol/ContentControlFilterUpdateThread.java b/source/net/yacy/interaction/contentcontrol/ContentControlFilterUpdateThread.java index ee1fcd136..6abd2ec61 100644 --- a/source/net/yacy/interaction/contentcontrol/ContentControlFilterUpdateThread.java +++ b/source/net/yacy/interaction/contentcontrol/ContentControlFilterUpdateThread.java @@ -1,5 +1,6 @@ package net.yacy.interaction.contentcontrol; +import java.io.IOException; import java.util.Iterator; import net.yacy.kelondro.blob.Tables; @@ -16,15 +17,9 @@ public class ContentControlFilterUpdateThread { private static FilterEngine networkfilter; public ContentControlFilterUpdateThread(final Switchboard sb) { - //final long time = System.currentTimeMillis(); - this.sb = sb; - if (this.sb.getConfigBool("contentcontrol.smwimport.purgelistoninit", - false)) { - this.sb.tables.clear(this.sb.getConfig( - "contentcontrol.smwimport.targetlist", "contentcontrol")); + this.sb = sb; - } } public final void run() { @@ -35,17 +30,11 @@ public class ContentControlFilterUpdateThread { if (this.sb.getConfigBool("contentcontrol.enabled", false) == true) { - if (!this.sb - .getConfig("contentcontrol.mandatoryfilterlist", "") - .equals("")) { - - if (this.sb.tables.bookmarks.dirty) { + if (this.sb.tables.bookmarks.dirty) { - networkfilter = updateFilter(); + networkfilter = updateFilter(); - this.sb.tables.bookmarks.dirty = false; - - } + SMWListSyncThread.dirty = false; } @@ -55,7 +44,6 @@ public class ContentControlFilterUpdateThread { } - return; } @@ -66,23 +54,23 @@ public class ContentControlFilterUpdateThread { Switchboard sb = Switchboard.getSwitchboard(); Iterator it; - it = sb.tables.bookmarks.getBookmarksByTag( - sb.getConfig( - "contentcontrol.bookmarklist", - "contentcontrol"), - "^((?!sc:" - + sb - .getConfig( - "contentcontrol.mandatoryfilterlist", - "") + ").*)$"); - while (it.hasNext()) { - Row b = it.next(); - - if (!b.get("filter", "").equals("")) { - - newfilter.add(b.get("filter", ""), null); - } - } + try { + it = sb.tables.iterator(sb.getConfig("contentcontrol.bookmarklist", + "contentcontrol")); + + while (it.hasNext()) { + Row b = it.next(); + + if (!b.get("filter", "").equals("")) { + + newfilter.add(b.get("filter", ""), null); + } + } + + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } return newfilter; } diff --git a/source/net/yacy/interaction/contentcontrol/SMWListImporter.java b/source/net/yacy/interaction/contentcontrol/SMWListImporter.java new file mode 100644 index 000000000..c55f242fc --- /dev/null +++ b/source/net/yacy/interaction/contentcontrol/SMWListImporter.java @@ -0,0 +1,163 @@ +package net.yacy.interaction.contentcontrol; + +import java.io.IOException; +import java.io.Reader; +import java.util.HashMap; +import java.util.Map.Entry; +import java.util.concurrent.ArrayBlockingQueue; + +import net.yacy.kelondro.logging.Log; + +import org.json.simple.parser.ContentHandler; +import org.json.simple.parser.JSONParser; +import org.json.simple.parser.ParseException; + +public class SMWListImporter implements Runnable, ContentHandler{ + + // Importer Variables + private final ArrayBlockingQueue listEntries; + private final Reader importFile; + + private SMWListRow row; + private final JSONParser parser; + + // Parser Variables + private final StringBuilder value; + private final StringBuilder key; + private final HashMap obj; + + private Boolean isElement; + + public SMWListImporter(final Reader importFile, final int queueSize) { + this.listEntries = new ArrayBlockingQueue(queueSize); + this.importFile = importFile; + + this.row = new SMWListRow(); + + this.parser = new JSONParser(); + + this.value = new StringBuilder(128); + this.key = new StringBuilder(16); + this.obj = new HashMap(); + + this.isElement = false; + + } + + @Override + public void startJSON() throws ParseException, IOException { + } + + @Override + public void endJSON() throws ParseException, IOException { + } + + @Override + public boolean startArray() throws ParseException, IOException { + final String key = this.key.toString(); + + if (key.equals("items")) { + + this.isElement = true; + + } + return true; + } + + @Override + public boolean endArray() throws ParseException, IOException { + + return true; + } + + @Override + public boolean startObject() throws ParseException, IOException { + + return true; + } + + @Override + public boolean endObject() throws ParseException, IOException { + + if(this.isElement) { + + for (Entry e: this.obj.entrySet()) { + this.row.add (e.getKey(), e.getValue()); + } + try { + this.listEntries.put(this.row); + //this.count++; + } catch (InterruptedException e) { + Log.logException(e); + } + this.obj.clear(); + this.row = new SMWListRow(); + } + + return true; + } + + @Override + public boolean startObjectEntry(String key) throws ParseException, IOException { + this.key.setLength(0); + this.key.append(key); + + return true; + } + + @Override + public boolean primitive(Object value) throws ParseException, IOException { + + this.value.setLength(0); + if(value instanceof java.lang.String) { + this.value.append((String)value); + } else if(value instanceof java.lang.Boolean) { + this.value.append(value); + } else if(value instanceof java.lang.Number) { + this.value.append(value); + } + + return true; + } + + @Override + public boolean endObjectEntry() throws ParseException, IOException { + + final String key = this.key.toString(); + final String value = this.value.toString(); + + this.obj.put(key, value); + + return true; + } + + @Override + public void run() { + try { + Log.logInfo("SMWLISTSYNC", "Importer run()"); + this.parser.parse(this.importFile, this, true); + + } catch (IOException e) { + Log.logException(e); + } catch (ParseException e) { + Log.logException(e); + } finally { + + try { + Log.logInfo("SMWLISTSYNC", "Importer inserted poison pill in queue"); + this.listEntries.put(SMWListRow.POISON); + } catch (InterruptedException e) { + Log.logException(e); + } + } + } + + public SMWListRow take() { + try { + return this.listEntries.take(); + } catch (InterruptedException e) { + Log.logException(e); + return null; + } + } +} diff --git a/source/net/yacy/interaction/contentcontrol/SMWListRow.java b/source/net/yacy/interaction/contentcontrol/SMWListRow.java new file mode 100644 index 000000000..21be54c5a --- /dev/null +++ b/source/net/yacy/interaction/contentcontrol/SMWListRow.java @@ -0,0 +1,24 @@ +package net.yacy.interaction.contentcontrol; + +import net.yacy.kelondro.blob.Tables; + +public class SMWListRow { + + private Tables.Data data; + + public static final SMWListRow POISON = new SMWListRow(); + public static final SMWListRow EMPTY = new SMWListRow(); + + public SMWListRow() { + this.data = new Tables.Data(); + } + + public void add (String key, String value) { + this.data.put(key, value); + } + + public Tables.Data getData() { + return this.data; + } + +} diff --git a/source/net/yacy/interaction/contentcontrol/ContentControlImportThread.java b/source/net/yacy/interaction/contentcontrol/SMWListSyncThread.java similarity index 68% rename from source/net/yacy/interaction/contentcontrol/ContentControlImportThread.java rename to source/net/yacy/interaction/contentcontrol/SMWListSyncThread.java index 12173ae29..cf0d00117 100644 --- a/source/net/yacy/interaction/contentcontrol/ContentControlImportThread.java +++ b/source/net/yacy/interaction/contentcontrol/SMWListSyncThread.java @@ -13,7 +13,7 @@ import net.yacy.data.ymark.YMarkSMWJSONImporter; import net.yacy.kelondro.logging.Log; import net.yacy.search.Switchboard; -public class ContentControlImportThread { +public class SMWListSyncThread { private final Switchboard sb; private Boolean locked = false; @@ -23,11 +23,20 @@ public class ContentControlImportThread { private final long limit = 500; private long currentmax = 0; private boolean runningjob = false; - - public ContentControlImportThread(final Switchboard sb) { + + private String targetList; + private String parameters; + private String query; + + public static Boolean dirty = false; + + public SMWListSyncThread(final Switchboard sb, final String targetList, final String query, final String parameters, final Boolean purgeOnInit) { this.sb = sb; - if (this.sb.getConfigBool("contentcontrol.smwimport.purgelistoninit",false)) { - this.sb.tables.clear(this.sb.getConfig("contentcontrol.smwimport.targetlist", "contentcontrol")); + this.targetList = targetList; + this.parameters = parameters; + this.query = query; + if (purgeOnInit) { + this.sb.tables.clear(targetList); } } @@ -51,26 +60,84 @@ public class ContentControlImportThread { if (!this.locked) { this.locked = true; if (this.sb.getConfigBool("contentcontrol.smwimport.enabled", false) == true) { - if (this.runningjob) { - Log.logInfo("CONTENTCONTROL", - "CONTENTCONTROL importing max. " + this.limit + + if (!this.runningjob) { + + // we have to count all new elements first + try { + if (!this.sb.getConfig("contentcontrol.smwimport.baseurl","").equals("")) { + URL urlCount; + + urlCount = new URL( + this.sb.getConfig( + "contentcontrol.smwimport.baseurl", + "") + + wikiurlify ("/[["+this.query+"]] [[Modification date::>" +this.lastsync+ "]]") + + + wikiurlify (this.parameters) + + + "/mainlabel%3D" + + "/offset%3D0" + + "/limit%3D200000" + + "/format%3Dystat"); + + String reply = UTF8.String(new HTTPClient(ClientIdentification.getUserAgent(), ClientIdentification.DEFAULT_TIMEOUT).GETbytes(urlCount.toString())); + String overallcount = reply.split(",")[0]; + String lastsyncstring = reply.split(",")[1]; + this.currentmax = Integer.parseInt(overallcount); + + if (this.currentmax > 0) { + Log.logInfo("SMWLISTSYNC", + "import job counts " + + this.currentmax + + " new elements between " + + this.lastsync + " and " + + this.currenttimestamp); + + this.currenttimestamp = this.lastsync; + + this.runningjob = true; + this.lastsync = lastsyncstring; + this.offset = 0; + } + } else { + Log.logWarning("SMWLISTSYNC", + "No SMWimport URL defined"); + } + } catch (MalformedURLException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + + + } else { + + // there are new elements to be imported + Log.logInfo("SMWLISTSYNC", + "importing max. " + this.limit + " elements at " + this.offset + " of " + this.currentmax + ", since " + this.currenttimestamp); - URL bmks_json; + URL urlImport; try { if (!this.sb.getConfig("contentcontrol.smwimport.baseurl","").equals("")) { - bmks_json = new URL( + urlImport = new URL( this.sb.getConfig( "contentcontrol.smwimport.baseurl", "") - + wikiurlify ("/[[Category:Web Page]] [[Modification date::>" +this.currenttimestamp+ "]]") + + wikiurlify ("/[["+this.query+"]] [[Modification date::>" +this.currenttimestamp+ "]]") - + wikiurlify ("/?Url/?Filter/?Article has average rating/?Category") + + wikiurlify (this.parameters) + + "/mainlabel%3D" + + "/syntax%3Dobsolete" + "/offset%3D" + this.offset + "/limit%3D" + this.limit + "/format%3Djson"); + this.offset += this.limit; if (this.offset > this.currentmax) { this.runningjob = false; @@ -79,34 +146,34 @@ public class ContentControlImportThread { InputStreamReader reader = null; try { reader = new InputStreamReader( - bmks_json.openStream(), "UTF-8"); + urlImport.openStream(), "UTF-8"); } catch (Exception e) { Log.logException(e); this.runningjob = false; } if (reader != null) { - YMarkSMWJSONImporter bookmarkImporter = null; + SMWListImporter smwListImporter = null; try { - bookmarkImporter = new YMarkSMWJSONImporter( - reader, 200, ""); + smwListImporter = new SMWListImporter( + reader, 200); } catch (final Exception e) { // TODO: display an error message Log.logException(e); this.runningjob = false; } Thread t; - YMarkEntry bmk; - t = new Thread(bookmarkImporter,"YMarks - Network bookmark importer"); + SMWListRow row; + t = new Thread(smwListImporter,"SMW List Importer"); t.start(); - while ((bmk = bookmarkImporter.take()) != YMarkEntry.POISON) { - if (bmk == YMarkEntry.EMPTY) { + while ((row = smwListImporter.take()) != SMWListRow.POISON) { + if (row == SMWListRow.EMPTY) { this.runningjob = false; } else { try { - this.sb.tables.bookmarks.addBookmark( - this.sb.getConfig("contentcontrol.smwimport.targetlist", "contentcontrol"), bmk, - true, true); + this.sb.tables.insert(targetList, row.getData()); + + dirty = true; } catch (Exception e) { // TODO Auto-generated catch block @@ -114,62 +181,15 @@ public class ContentControlImportThread { } } } - } else { - - } - } - else { - + } } + } catch (MalformedURLException e2) { // TODO Auto-generated catch block e2.printStackTrace(); } - } else { - try { - if (!this.sb.getConfig("contentcontrol.smwimport.baseurl","").equals("")) { - URL bmks_count; - - bmks_count = new URL( - this.sb.getConfig( - "contentcontrol.smwimport.baseurl", - "") - + wikiurlify ("/[[Category:Web Page]] [[Modification date::>" +this.lastsync+ "]]") - + wikiurlify ("/?Url/?Filter/?Article has average rating/?Category") - + "/mainlabel%3D" - + "/format%3Dystat"); - - String reply = UTF8.String(new HTTPClient(ClientIdentification.getUserAgent(), ClientIdentification.DEFAULT_TIMEOUT).GETbytes(bmks_count.toString())); - String overallcount = reply.split(",")[0]; - String lastsyncstring = reply.split(",")[1]; - this.currentmax = Integer.parseInt(overallcount); - - if (this.currentmax > 0) { - Log.logInfo("CONTENTCONTROL", - "CONTENTCONTROL import job counts " - + this.currentmax - + " new elements between " - + this.lastsync + " and " - + this.currenttimestamp); - - this.currenttimestamp = this.lastsync; - - this.runningjob = true; - this.lastsync = lastsyncstring; - this.offset = 0; - } - } else { - Log.logWarning("CONTENTCONTROL", - "No SMWimport URL defined"); - } - } catch (MalformedURLException e) { - // TODO Auto-generated catch block - e.printStackTrace(); - } catch (IOException e) { - // TODO Auto-generated catch block - e.printStackTrace(); - } + } this.locked = false; } diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 4d78abe4c..ad2159f60 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -149,7 +149,7 @@ import net.yacy.document.parser.audioTagParser; import net.yacy.document.parser.html.Evaluation; import net.yacy.gui.Tray; import net.yacy.interaction.contentcontrol.ContentControlFilterUpdateThread; -import net.yacy.interaction.contentcontrol.ContentControlImportThread; +import net.yacy.interaction.contentcontrol.SMWListSyncThread; import net.yacy.kelondro.blob.Tables; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadataNode; @@ -1017,7 +1017,8 @@ public final class Switchboard extends serverSwitch { "this is the content control import thread", null, new InstantBusyThread( - new ContentControlImportThread(this), + new SMWListSyncThread(this, sb.getConfig("contentcontrol.bookmarklist", "contentcontrol"), "Category:Web Page", "/?Url/?Filter/?Article has average rating/?Category/?Modification date", sb.getConfigBool( + "contentcontrol.smwimport.purgelistoninit", false)), "run", SwitchboardConstants.PEER_PING_METHOD_JOBCOUNT, SwitchboardConstants.PEER_PING_METHOD_FREEMEM, diff --git a/source/net/yacy/search/query/SearchEvent.java b/source/net/yacy/search/query/SearchEvent.java index c2a5d39f3..2475859dd 100644 --- a/source/net/yacy/search/query/SearchEvent.java +++ b/source/net/yacy/search/query/SearchEvent.java @@ -741,22 +741,20 @@ public final class SearchEvent { continue; } - // content control - if (Switchboard.getSwitchboard().getConfigBool("contentcontrol.enabled", false) == true) { - // check global network filter from bookmark list - if (!Switchboard.getSwitchboard() - .getConfig("contentcontrol.mandatoryfilterlist", "") - .equals("")) { - - FilterEngine f = ContentControlFilterUpdateThread.getNetworkFilter(); - if (f != null) { - if (!f.isListed(page.url(), null)) { - this.query.misses.add(page.hash()); - continue; - } - } - } - } + // contentcontrol + if (Switchboard.getSwitchboard().getConfigBool( + "contentcontrol.enabled", false) == true) { + + FilterEngine f = ContentControlFilterUpdateThread + .getNetworkFilter(); + if (f != null) { + if (!f.isListed(page.url(), null)) { + this.query.misses.add(page.hash()); + continue; + } + } + + } final String pageurl = page.url().toNormalform(true); final String pageauthor = page.dc_creator();