update and generalization of the SMW import and content control routines

pull/1/head
cominch 12 years ago
parent f07e5fb553
commit 21df1ad9e0

@ -27,20 +27,13 @@
Enables or disables content control.
</p>
</dd>
<dt><label for="content">Mandatory default filter list (category):</label></dt>
<dd>
<input type="text" name="contentcontrolmfl" value="#[contentcontrolmfl]#" size="60" /><br/><br/>
<p class="help">
Define a category string. If defined, all URLs will be filtered out during crawling and DHT which do not belong to this category.
</p>
</dd>
<dt><label for="content">Use this bookmark list:</label></dt>
<dt><label for="content">Use this table to create filter:</label></dt>
<dd>
<input type="text" name="contentcontrolbml" value="#[contentcontrolbml]#" size="60" /><br/><br/>
<p class="help">
Define a bookmark list. Default: contentcontrol
Define a table. Default: contentcontrol
</p>
</dd>
<dt></dt>
@ -52,9 +45,9 @@
<form id="contentcontrolExtraSettings" action="ContentControl_p.html" method="post" enctype="multipart/form-data">
<fieldset><legend id="urlproxy">Content Control Settings</legend>
<fieldset><legend id="urlproxy">Content Control SMW Import Settings</legend>
<p>
With this settings you can define the content control settings.
With this settings you can define the content control import settings. You can define a SMW with the appropriate extensions. Details: <a href="https://gitorious.org/sciety/yacy-smwextension" target="_blank">yacy-smwextension on Gitorious</a>
</p>
<dl>
@ -63,7 +56,7 @@
<dd>
<input type="checkbox" name="ccsmwimport" id="ccsmwimport" #(ccsmwimport_checked)#:: checked="checked"#(/ccsmwimport_checked)# />Enabled<br/>
<p class="help">
Enable or disable constant background synchronisation of content control list from SMW (Semantic Mediawiki). Requires restart!
Enable or disable constant background synchronization of content control list from SMW (Semantic Mediawiki). Requires restart!
</p>
</dd>
@ -75,19 +68,11 @@
</p>
</dd>
<dt><label for="content">SMW import target bookmark list:</label></dt>
<dt><label for="content">SMW import target table:</label></dt>
<dd>
<input type="text" name="ccsmwimportlist" value="#[ccsmwimportlist]#" size="60" /><br/><br/>
<p class="help">
Define import target bookmark list. Default: contentcontrol
</p>
</dd>
<dt><label for="content">SMW import default category:</label></dt>
<dd>
<input type="text" name="ccsmwimportcat" value="#[ccsmwimportcat]#" size="60" /><br/><br/>
<p class="help">
Define default category which is added to each entry. This category can be defined as mandatory default filter list.
Define import target table. Default: contentcontrol
</p>
</dd>

@ -24,9 +24,7 @@ public final class ContentControl_p {
env.setConfig("contentcontrol.smwimport.targetlist",
post.get("ccsmwimportlist"));
env.setConfig("contentcontrol.smwimport.defaultcategory",
post.get("ccsmwimportcat"));
}
@ -35,9 +33,7 @@ public final class ContentControl_p {
env.setConfig("contentcontrol.enabled",
"on".equals(post.get("contentcontrolenabled")) ? true : false);
env.setConfig("contentcontrol.mandatoryfilterlist",
post.get("contentcontrolmfl"));
env.setConfig("contentcontrol.bookmarklist",
post.get("contentcontrolbml"));
@ -45,8 +41,6 @@ public final class ContentControl_p {
}
prop.putHTML("ccsmwimportcat",
env.getConfig("contentcontrol.smwimport.defaultcategory", "yacy"));
prop.putHTML("ccsmwimportlist",
env.getConfig("contentcontrol.smwimport.targetlist", "contentcontrol"));
@ -64,9 +58,6 @@ public final class ContentControl_p {
prop.put("contentcontrolenabled_checked",
env.getConfigBool("contentcontrol.enabled", false) ? "1" : "0");
prop.putHTML("contentcontrolmfl",
env.getConfig("contentcontrol.mandatoryfilterlist", "yacy"));
prop.putHTML("contentcontrolbml",
env.getConfig("contentcontrol.bookmarklist", ""));

@ -1,5 +1,6 @@
package net.yacy.interaction.contentcontrol;
import java.io.IOException;
import java.util.Iterator;
import net.yacy.kelondro.blob.Tables;
@ -16,15 +17,9 @@ public class ContentControlFilterUpdateThread {
private static FilterEngine networkfilter;
public ContentControlFilterUpdateThread(final Switchboard sb) {
//final long time = System.currentTimeMillis();
this.sb = sb;
if (this.sb.getConfigBool("contentcontrol.smwimport.purgelistoninit",
false)) {
this.sb.tables.clear(this.sb.getConfig(
"contentcontrol.smwimport.targetlist", "contentcontrol"));
this.sb = sb;
}
}
public final void run() {
@ -35,17 +30,11 @@ public class ContentControlFilterUpdateThread {
if (this.sb.getConfigBool("contentcontrol.enabled", false) == true) {
if (!this.sb
.getConfig("contentcontrol.mandatoryfilterlist", "")
.equals("")) {
if (this.sb.tables.bookmarks.dirty) {
if (this.sb.tables.bookmarks.dirty) {
networkfilter = updateFilter();
networkfilter = updateFilter();
this.sb.tables.bookmarks.dirty = false;
}
SMWListSyncThread.dirty = false;
}
@ -55,7 +44,6 @@ public class ContentControlFilterUpdateThread {
}
return;
}
@ -66,23 +54,23 @@ public class ContentControlFilterUpdateThread {
Switchboard sb = Switchboard.getSwitchboard();
Iterator<Tables.Row> it;
it = sb.tables.bookmarks.getBookmarksByTag(
sb.getConfig(
"contentcontrol.bookmarklist",
"contentcontrol"),
"^((?!sc:"
+ sb
.getConfig(
"contentcontrol.mandatoryfilterlist",
"") + ").*)$");
while (it.hasNext()) {
Row b = it.next();
if (!b.get("filter", "").equals("")) {
newfilter.add(b.get("filter", ""), null);
}
}
try {
it = sb.tables.iterator(sb.getConfig("contentcontrol.bookmarklist",
"contentcontrol"));
while (it.hasNext()) {
Row b = it.next();
if (!b.get("filter", "").equals("")) {
newfilter.add(b.get("filter", ""), null);
}
}
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
return newfilter;
}

@ -0,0 +1,163 @@
package net.yacy.interaction.contentcontrol;
import java.io.IOException;
import java.io.Reader;
import java.util.HashMap;
import java.util.Map.Entry;
import java.util.concurrent.ArrayBlockingQueue;
import net.yacy.kelondro.logging.Log;
import org.json.simple.parser.ContentHandler;
import org.json.simple.parser.JSONParser;
import org.json.simple.parser.ParseException;
/**
 * Streaming importer that parses a JSON document with json-simple's
 * {@link ContentHandler} callbacks and hands the parsed rows to a consumer
 * through a bounded blocking queue.
 *
 * <p>Usage: run this object on its own thread ({@link #run()}) and call
 * {@link #take()} from the consuming thread until it returns
 * {@link SMWListRow#POISON}.</p>
 *
 * <p>Parsing model, as implemented below: once an array is opened while the
 * most recently seen object key is {@code "items"}, every subsequently closed
 * JSON object is turned into one {@link SMWListRow} containing the key/value
 * pairs collected since the previous row. Values are stored as strings.
 * NOTE(review): the handler keeps only one current key and one current value,
 * so it presumably expects flat item objects - confirm against the SMW export
 * format.</p>
 */
public class SMWListImporter implements Runnable, ContentHandler {

    // Importer variables
    private final ArrayBlockingQueue<SMWListRow> listEntries;
    private final Reader importFile;
    private final JSONParser parser;

    // Parser variables
    private final StringBuilder value;
    private final StringBuilder key;
    private final HashMap<String, String> obj;
    private boolean isElement;

    /**
     * @param importFile reader supplying the JSON document; it is closed by
     *                   {@link #run()} once parsing has finished or failed
     * @param queueSize  capacity of the queue between parser and consumer
     */
    public SMWListImporter(final Reader importFile, final int queueSize) {
        this.listEntries = new ArrayBlockingQueue<SMWListRow>(queueSize);
        this.importFile = importFile;

        this.parser = new JSONParser();

        this.value = new StringBuilder(128);
        this.key = new StringBuilder(16);
        this.obj = new HashMap<String, String>();

        this.isElement = false;
    }

    @Override
    public void startJSON() throws ParseException, IOException {
        // nothing to prepare
    }

    @Override
    public void endJSON() throws ParseException, IOException {
        // nothing to finish; the poison pill is inserted by run()
    }

    /**
     * Switches the handler into row mode when the array being opened belongs
     * to the key {@code "items"}.
     */
    @Override
    public boolean startArray() throws ParseException, IOException {
        if ("items".equals(this.key.toString())) {
            this.isElement = true;
        }
        return true;
    }

    @Override
    public boolean endArray() throws ParseException, IOException {
        return true;
    }

    @Override
    public boolean startObject() throws ParseException, IOException {
        return true;
    }

    /**
     * In row mode, converts the collected key/value pairs into a row and
     * queues it, blocking while the queue is full.
     *
     * @return {@code false} (asking the parser to stop) if the thread was
     *         interrupted while waiting for queue space, {@code true} otherwise
     */
    @Override
    public boolean endObject() throws ParseException, IOException {
        if (!this.isElement) {
            return true;
        }

        final SMWListRow row = new SMWListRow();
        for (final Entry<String, String> e : this.obj.entrySet()) {
            row.add(e.getKey(), e.getValue());
        }
        this.obj.clear();

        try {
            this.listEntries.put(row);
        } catch (final InterruptedException e) {
            Log.logException(e);
            // Restore the flag and stop parsing: an interrupted importer
            // should wind down instead of silently dropping single rows.
            Thread.currentThread().interrupt();
            return false;
        }
        return true;
    }

    /** Remembers the key of the object entry that is being parsed. */
    @Override
    public boolean startObjectEntry(String key) throws ParseException, IOException {
        this.key.setLength(0);
        this.key.append(key);
        return true;
    }

    /**
     * Stores the string form of a String, Boolean or Number primitive as the
     * current value; any other input (e.g. {@code null}) yields an empty value.
     */
    @Override
    public boolean primitive(Object value) throws ParseException, IOException {
        this.value.setLength(0);
        if (value instanceof String || value instanceof Boolean || value instanceof Number) {
            this.value.append(value);
        }
        return true;
    }

    /** Records the current key together with the current value. */
    @Override
    public boolean endObjectEntry() throws ParseException, IOException {
        this.obj.put(this.key.toString(), this.value.toString());
        return true;
    }

    /**
     * Parses the whole input. Regardless of the outcome, the reader is closed
     * and {@link SMWListRow#POISON} is queued so that the consumer terminates.
     */
    @Override
    public void run() {
        try {
            Log.logInfo("SMWLISTSYNC", "Importer run()");
            this.parser.parse(this.importFile, this, true);
        } catch (IOException e) {
            Log.logException(e);
        } catch (ParseException e) {
            Log.logException(e);
        } finally {
            try {
                closeImportFile();
            } finally {
                insertPoison();
            }
        }
    }

    /** Closes the input reader, logging (not propagating) a failure to do so. */
    private void closeImportFile() {
        if (this.importFile == null) {
            return;
        }
        try {
            this.importFile.close();
        } catch (final IOException e) {
            Log.logException(e);
        }
    }

    /**
     * Queues the poison pill. A pending interrupt is cleared for the duration
     * of the blocking put (which would otherwise fail immediately and leave
     * the consumer waiting forever) and restored afterwards.
     */
    private void insertPoison() {
        boolean interrupted = Thread.interrupted();
        try {
            Log.logInfo("SMWLISTSYNC", "Importer inserted poison pill in queue");
            this.listEntries.put(SMWListRow.POISON);
        } catch (final InterruptedException e) {
            Log.logException(e);
            interrupted = true;
        } finally {
            if (interrupted) {
                Thread.currentThread().interrupt();
            }
        }
    }

    /**
     * Retrieves the next row, waiting until one is available.
     *
     * @return the next row, or {@link SMWListRow#POISON} when the importer is
     *         done or the calling thread was interrupted while waiting (the
     *         interrupt flag is restored in that case)
     */
    public SMWListRow take() {
        try {
            return this.listEntries.take();
        } catch (final InterruptedException e) {
            Log.logException(e);
            Thread.currentThread().interrupt();
            // Returning the sentinel instead of null lets consumer loops of
            // the form "while (take() != POISON)" terminate cleanly.
            return SMWListRow.POISON;
        }
    }
}

@ -0,0 +1,24 @@
package net.yacy.interaction.contentcontrol;
import net.yacy.kelondro.blob.Tables;
/**
 * A single imported list row: a thin wrapper around a {@link Tables.Data}
 * record that is filled key by key.
 */
public class SMWListRow {

    /**
     * Sentinel instance used as an end-of-stream marker on row queues.
     * It is meant to be compared by identity ({@code ==}), not by content.
     */
    public static final SMWListRow POISON = new SMWListRow();

    /**
     * Second sentinel instance, likewise meant to be compared by identity.
     * NOTE(review): nothing in this class produces it; confirm its meaning
     * with the code that consumes rows.
     */
    public static final SMWListRow EMPTY = new SMWListRow();

    /** Backing record; created once per row and never replaced. */
    private final Tables.Data data;

    /** Creates a row with an empty backing record. */
    public SMWListRow() {
        this.data = new Tables.Data();
    }

    /**
     * Stores a value under the given key in the backing record.
     *
     * @param key   column name
     * @param value column value
     */
    public void add(String key, String value) {
        this.data.put(key, value);
    }

    /**
     * @return the backing record itself (not a copy)
     */
    public Tables.Data getData() {
        return this.data;
    }
}

@ -13,7 +13,7 @@ import net.yacy.data.ymark.YMarkSMWJSONImporter;
import net.yacy.kelondro.logging.Log;
import net.yacy.search.Switchboard;
public class ContentControlImportThread {
public class SMWListSyncThread {
private final Switchboard sb;
private Boolean locked = false;
@ -23,11 +23,20 @@ public class ContentControlImportThread {
private final long limit = 500;
private long currentmax = 0;
private boolean runningjob = false;
public ContentControlImportThread(final Switchboard sb) {
private String targetList;
private String parameters;
private String query;
public static Boolean dirty = false;
public SMWListSyncThread(final Switchboard sb, final String targetList, final String query, final String parameters, final Boolean purgeOnInit) {
this.sb = sb;
if (this.sb.getConfigBool("contentcontrol.smwimport.purgelistoninit",false)) {
this.sb.tables.clear(this.sb.getConfig("contentcontrol.smwimport.targetlist", "contentcontrol"));
this.targetList = targetList;
this.parameters = parameters;
this.query = query;
if (purgeOnInit) {
this.sb.tables.clear(targetList);
}
}
@ -51,26 +60,84 @@ public class ContentControlImportThread {
if (!this.locked) {
this.locked = true;
if (this.sb.getConfigBool("contentcontrol.smwimport.enabled", false) == true) {
if (this.runningjob) {
Log.logInfo("CONTENTCONTROL",
"CONTENTCONTROL importing max. " + this.limit
if (!this.runningjob) {
// we have to count all new elements first
try {
if (!this.sb.getConfig("contentcontrol.smwimport.baseurl","").equals("")) {
URL urlCount;
urlCount = new URL(
this.sb.getConfig(
"contentcontrol.smwimport.baseurl",
"")
+ wikiurlify ("/[["+this.query+"]] [[Modification date::>" +this.lastsync+ "]]")
+ wikiurlify (this.parameters)
+ "/mainlabel%3D"
+ "/offset%3D0"
+ "/limit%3D200000"
+ "/format%3Dystat");
String reply = UTF8.String(new HTTPClient(ClientIdentification.getUserAgent(), ClientIdentification.DEFAULT_TIMEOUT).GETbytes(urlCount.toString()));
String overallcount = reply.split(",")[0];
String lastsyncstring = reply.split(",")[1];
this.currentmax = Integer.parseInt(overallcount);
if (this.currentmax > 0) {
Log.logInfo("SMWLISTSYNC",
"import job counts "
+ this.currentmax
+ " new elements between "
+ this.lastsync + " and "
+ this.currenttimestamp);
this.currenttimestamp = this.lastsync;
this.runningjob = true;
this.lastsync = lastsyncstring;
this.offset = 0;
}
} else {
Log.logWarning("SMWLISTSYNC",
"No SMWimport URL defined");
}
} catch (MalformedURLException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
} else {
// there are new elements to be imported
Log.logInfo("SMWLISTSYNC",
"importing max. " + this.limit
+ " elements at " + this.offset + " of "
+ this.currentmax + ", since "
+ this.currenttimestamp);
URL bmks_json;
URL urlImport;
try {
if (!this.sb.getConfig("contentcontrol.smwimport.baseurl","").equals("")) {
bmks_json = new URL(
urlImport = new URL(
this.sb.getConfig(
"contentcontrol.smwimport.baseurl",
"")
+ wikiurlify ("/[[Category:Web Page]] [[Modification date::>" +this.currenttimestamp+ "]]")
+ wikiurlify ("/[["+this.query+"]] [[Modification date::>" +this.currenttimestamp+ "]]")
+ wikiurlify ("/?Url/?Filter/?Article has average rating/?Category")
+ wikiurlify (this.parameters)
+ "/mainlabel%3D"
+ "/syntax%3Dobsolete"
+ "/offset%3D" + this.offset
+ "/limit%3D" + this.limit
+ "/format%3Djson");
this.offset += this.limit;
if (this.offset > this.currentmax) {
this.runningjob = false;
@ -79,34 +146,34 @@ public class ContentControlImportThread {
InputStreamReader reader = null;
try {
reader = new InputStreamReader(
bmks_json.openStream(), "UTF-8");
urlImport.openStream(), "UTF-8");
} catch (Exception e) {
Log.logException(e);
this.runningjob = false;
}
if (reader != null) {
YMarkSMWJSONImporter bookmarkImporter = null;
SMWListImporter smwListImporter = null;
try {
bookmarkImporter = new YMarkSMWJSONImporter(
reader, 200, "");
smwListImporter = new SMWListImporter(
reader, 200);
} catch (final Exception e) {
// TODO: display an error message
Log.logException(e);
this.runningjob = false;
}
Thread t;
YMarkEntry bmk;
t = new Thread(bookmarkImporter,"YMarks - Network bookmark importer");
SMWListRow row;
t = new Thread(smwListImporter,"SMW List Importer");
t.start();
while ((bmk = bookmarkImporter.take()) != YMarkEntry.POISON) {
if (bmk == YMarkEntry.EMPTY) {
while ((row = smwListImporter.take()) != SMWListRow.POISON) {
if (row == SMWListRow.EMPTY) {
this.runningjob = false;
} else {
try {
this.sb.tables.bookmarks.addBookmark(
this.sb.getConfig("contentcontrol.smwimport.targetlist", "contentcontrol"), bmk,
true, true);
this.sb.tables.insert(targetList, row.getData());
dirty = true;
} catch (Exception e) {
// TODO Auto-generated catch block
@ -114,62 +181,15 @@ public class ContentControlImportThread {
}
}
}
} else {
}
}
else {
}
}
} catch (MalformedURLException e2) {
// TODO Auto-generated catch block
e2.printStackTrace();
}
} else {
try {
if (!this.sb.getConfig("contentcontrol.smwimport.baseurl","").equals("")) {
URL bmks_count;
bmks_count = new URL(
this.sb.getConfig(
"contentcontrol.smwimport.baseurl",
"")
+ wikiurlify ("/[[Category:Web Page]] [[Modification date::>" +this.lastsync+ "]]")
+ wikiurlify ("/?Url/?Filter/?Article has average rating/?Category")
+ "/mainlabel%3D"
+ "/format%3Dystat");
String reply = UTF8.String(new HTTPClient(ClientIdentification.getUserAgent(), ClientIdentification.DEFAULT_TIMEOUT).GETbytes(bmks_count.toString()));
String overallcount = reply.split(",")[0];
String lastsyncstring = reply.split(",")[1];
this.currentmax = Integer.parseInt(overallcount);
if (this.currentmax > 0) {
Log.logInfo("CONTENTCONTROL",
"CONTENTCONTROL import job counts "
+ this.currentmax
+ " new elements between "
+ this.lastsync + " and "
+ this.currenttimestamp);
this.currenttimestamp = this.lastsync;
this.runningjob = true;
this.lastsync = lastsyncstring;
this.offset = 0;
}
} else {
Log.logWarning("CONTENTCONTROL",
"No SMWimport URL defined");
}
} catch (MalformedURLException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
this.locked = false;
}

@ -149,7 +149,7 @@ import net.yacy.document.parser.audioTagParser;
import net.yacy.document.parser.html.Evaluation;
import net.yacy.gui.Tray;
import net.yacy.interaction.contentcontrol.ContentControlFilterUpdateThread;
import net.yacy.interaction.contentcontrol.ContentControlImportThread;
import net.yacy.interaction.contentcontrol.SMWListSyncThread;
import net.yacy.kelondro.blob.Tables;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataNode;
@ -1017,7 +1017,8 @@ public final class Switchboard extends serverSwitch {
"this is the content control import thread",
null,
new InstantBusyThread(
new ContentControlImportThread(this),
new SMWListSyncThread(this, sb.getConfig("contentcontrol.bookmarklist", "contentcontrol"), "Category:Web Page", "/?Url/?Filter/?Article has average rating/?Category/?Modification date", sb.getConfigBool(
"contentcontrol.smwimport.purgelistoninit", false)),
"run",
SwitchboardConstants.PEER_PING_METHOD_JOBCOUNT,
SwitchboardConstants.PEER_PING_METHOD_FREEMEM,

@ -741,22 +741,20 @@ public final class SearchEvent {
continue;
}
// content control
if (Switchboard.getSwitchboard().getConfigBool("contentcontrol.enabled", false) == true) {
// check global network filter from bookmark list
if (!Switchboard.getSwitchboard()
.getConfig("contentcontrol.mandatoryfilterlist", "")
.equals("")) {
FilterEngine f = ContentControlFilterUpdateThread.getNetworkFilter();
if (f != null) {
if (!f.isListed(page.url(), null)) {
this.query.misses.add(page.hash());
continue;
}
}
}
}
// contentcontrol
if (Switchboard.getSwitchboard().getConfigBool(
"contentcontrol.enabled", false) == true) {
FilterEngine f = ContentControlFilterUpdateThread
.getNetworkFilter();
if (f != null) {
if (!f.isListed(page.url(), null)) {
this.query.misses.add(page.hash());
continue;
}
}
}
final String pageurl = page.url().toNormalform(true);
final String pageauthor = page.dc_creator();

Loading…
Cancel
Save