From 58ecf5e4ddc462cae321bc64353000d0e799b393 Mon Sep 17 00:00:00 2001 From: reger Date: Wed, 1 Jan 2014 11:01:22 +0100 Subject: [PATCH] add to blacklist button in CrawlResults http://bugs.yacy.net/view.php?id=220 introduced Blacklist.add with sourcefile only parameter --- htroot/CrawlResults.html | 23 ++++++++++- htroot/CrawlResults.java | 37 +++++++++++++++++- source/net/yacy/repository/Blacklist.java | 47 +++++++++++++++++++++++ 3 files changed, 104 insertions(+), 3 deletions(-) diff --git a/htroot/CrawlResults.html b/htroot/CrawlResults.html index 1a3302beb..1298c1e76 100644 --- a/htroot/CrawlResults.html +++ b/htroot/CrawlResults.html @@ -73,9 +73,19 @@

Statistics about #[domains]# domains in this stack:

- + + #{domains}# @@ -84,12 +94,23 @@
+
+ #{/domains}#
Domain URLsBlacklist to use +
+ + +
+
#[domain]# #[count]# +
+
+ + + + +
+
+

diff --git a/htroot/CrawlResults.java b/htroot/CrawlResults.java index 9014570b4..50a57e53a 100644 --- a/htroot/CrawlResults.java +++ b/htroot/CrawlResults.java @@ -29,6 +29,7 @@ import java.util.Arrays; import java.util.Date; import java.util.HashSet; import java.util.Iterator; +import java.util.List; import java.util.Locale; import java.util.Map; import java.util.Set; @@ -40,8 +41,11 @@ import net.yacy.cora.util.ConcurrentLog; import net.yacy.crawler.data.ResultURLs; import net.yacy.crawler.data.ResultURLs.EventOrigin; import net.yacy.crawler.data.ResultURLs.InitExecEntry; +import net.yacy.data.ListManager; import net.yacy.kelondro.data.meta.URIMetadataNode; +import net.yacy.kelondro.util.FileUtils; import net.yacy.peers.Seed; +import net.yacy.repository.Blacklist; import net.yacy.search.Switchboard; import net.yacy.search.schema.CollectionSchema; import net.yacy.server.serverObjects; @@ -105,7 +109,8 @@ public class CrawlResults { return prop; } } - + + String selectedblacklist = post.get("selectedblacklist",Blacklist.defaultBlacklist(ListManager.listsPath)); if (post != null) { // custom number of lines if (post.containsKey("count")) { @@ -123,13 +128,19 @@ public class CrawlResults { } } - if (post.containsKey("deletedomain")) { + if (post.containsKey("deletedomain") || post.containsKey("delandaddtoblacklist")) { final String domain = post.get("domain", null); if (domain != null) { + selectedblacklist = post.get("blacklistname"); Set hostnames = new HashSet(); hostnames.add(domain); sb.index.fulltext().deleteStaleDomainNames(hostnames, null); ResultURLs.deleteDomain(tabletype, domain); + + // handle addtoblacklist + if (post.containsKey("delandaddtoblacklist")) { + Switchboard.urlBlacklist.add(selectedblacklist, domain, ".*"); + } } } @@ -297,11 +308,33 @@ public class CrawlResults { prop.put("table_domains_" + cnt + "_tabletype", tabletype.getCode()); prop.put("table_domains_" + cnt + "_domain", domain); prop.put("table_domains_" + cnt + "_count", 
ResultURLs.domainCount(tabletype, domain)); + prop.put("table_domains_" + cnt + "_blacklistname", selectedblacklist); dark = !dark; cnt++; } prop.put("table_domains", cnt); + + // load all blacklist files located in the directory + List dirlist = FileUtils.getDirListing(ListManager.listsPath, Blacklist.BLACKLIST_FILENAME_FILTER); + int blacklistCount = 0; + if (dirlist != null) { + for (final String element : dirlist) { + if (element.equals(selectedblacklist)) { + prop.put("table_blacklists_" + blacklistCount + "_selected", "selected"); + } else { + prop.put("table_blacklists_" + blacklistCount + "_selected", ""); + } + prop.putXML("table_blacklists_" + blacklistCount + "_name", element); + + blacklistCount++; + } + prop.put("table_blacklists", blacklistCount); + } } + + + + prop.put("process", tabletype.getCode()); // return rewrite properties return prop; diff --git a/source/net/yacy/repository/Blacklist.java b/source/net/yacy/repository/Blacklist.java index e4105b89c..d68999c4a 100644 --- a/source/net/yacy/repository/Blacklist.java +++ b/source/net/yacy/repository/Blacklist.java @@ -354,6 +354,53 @@ public class Blacklist { } } + /** + * appends an entry to the blacklist source file + * + * @param blacklistSourcefile name of the blacklist file (LISTS/*.black) + * @param host host or host pattern + * @param path path or path pattern + */ + public final void add (final String blacklistSourcefile, final String host, final String path) { + // TODO: check sourcefile synced with cache.ser files ? + if (host == null) { + throw new IllegalArgumentException("host may not be null"); + } + if (path == null) { + throw new IllegalArgumentException("path may not be null"); + } + + String p = (!path.isEmpty() && path.charAt(0) == '/') ? path.substring(1) : path; + + // avoid PatternSyntaxException e + final String h = ((!isMatchable(host) && !host.isEmpty() && host.charAt(0) == '*') ? "." + host : host).toLowerCase(); + if (!p.isEmpty() && p.charAt(0) == '*') { + p = "." 
+ p; + } + Pattern pattern = Pattern.compile(p, Pattern.CASE_INSENSITIVE); + // Append the line to the file. + PrintWriter pw = null; + try { + final String newEntry = h + "/" + pattern; + if (!blacklistFileContains(blacklistRootPath, blacklistSourcefile, newEntry)) { + pw = new PrintWriter(new FileWriter(new File(blacklistRootPath, blacklistSourcefile), true)); + pw.println(newEntry); + pw.close(); + } + } catch (final IOException e) { + ConcurrentLog.logException(e); + } finally { + if (pw != null) { + try { + pw.close(); + } catch (final Exception e) { + ConcurrentLog.warn("Blacklist", "could not close stream to " + + blacklistSourcefile + "! " + e.getMessage()); + } + } + } + } + public final int blacklistCacheSize() { int size = 0; final Iterator iter = this.cachedUrlHashs.keySet().iterator();