From def4253555a79181bd81b5e491813070af0b010a Mon Sep 17 00:00:00 2001
From: f1ori
Date: Sat, 30 Oct 2010 14:44:33 +0000
Subject: [PATCH] * add an option to the network definition to provide a
 domain list (syntax like in blacklists)
 * crawler and search accept only URLs that match an entry in the domain
   list (if a list is provided)
 * this may be useful to prevent dedicated networks from being "polluted"
 * FilterEngine is an improved Blacklist object; Blacklist may inherit from
   FilterEngine in the future

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7285 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
 defaults/yacy.network.readme                 |   1 +
 source/de/anomic/crawler/CrawlStacker.java   |  12 +-
 source/de/anomic/search/Switchboard.java     |  29 +-
 source/de/anomic/server/serverSwitch.java    |  48 ++++
 source/net/yacy/repository/FilterEngine.java | 287 +++++++++++++++++++
 5 files changed, 374 insertions(+), 3 deletions(-)
 create mode 100644 source/net/yacy/repository/FilterEngine.java

diff --git a/defaults/yacy.network.readme b/defaults/yacy.network.readme
index 5f2fddbef..ab65066ab 100644
--- a/defaults/yacy.network.readme
+++ b/defaults/yacy.network.readme
@@ -31,6 +31,7 @@
 # network.unit.name =
 # network.unit.description =
 # network.unit.domain = 'global'|'local'|'any'
+# network.unit.domainlist =
 # network.unit.dhtredundancy =
 # network.unit.dht.partitionExponent =
 # network.unit.bootstrap.seedlist =

diff --git a/source/de/anomic/crawler/CrawlStacker.java b/source/de/anomic/crawler/CrawlStacker.java
index 8c056ec64..d52d1c34d 100644
--- a/source/de/anomic/crawler/CrawlStacker.java
+++ b/source/de/anomic/crawler/CrawlStacker.java
@@ -40,6 +40,7 @@ import net.yacy.kelondro.logging.Log;
 import net.yacy.kelondro.order.Base64Order;
 import net.yacy.kelondro.workflow.WorkflowProcessor;
 import net.yacy.repository.Blacklist;
+import net.yacy.repository.FilterEngine;
 
 import de.anomic.crawler.retrieval.Request;
 import de.anomic.search.Segment;
@@ -58,6 +59,7 @@ public final class CrawlStacker {
     private final Segment indexSegment;
     private final yacySeedDB peers;
     private final boolean acceptLocalURLs, acceptGlobalURLs;
+    private final FilterEngine domainList;
 
     // this is the process that checks urls for double occurrences and for allowance/disallowance by robots.txt
 
@@ -67,7 +69,8 @@ public final class CrawlStacker {
             Segment indexSegment,
             yacySeedDB peers,
             boolean acceptLocalURLs,
-            boolean acceptGlobalURLs) {
+            boolean acceptGlobalURLs,
+            FilterEngine domainList) {
         this.nextQueue = cq;
         this.crawler = cs;
         this.indexSegment = indexSegment;
@@ -76,6 +79,7 @@ public final class CrawlStacker {
         this.dnsMiss = 0;
         this.acceptLocalURLs = acceptLocalURLs;
         this.acceptGlobalURLs = acceptGlobalURLs;
+        this.domainList = domainList;
 
         this.fastQueue = new WorkflowProcessor("CrawlStackerFast", "This process checks new urls before they are enqueued into the balancer (proper, double-check, correct domain, filter)", new String[]{"Balancer"}, this, "job", 10000, null, 2);
         this.slowQueue = new WorkflowProcessor("CrawlStackerSlow", "This is like CrawlStackerFast, but additionally does a DNS lookup. The CrawlStackerFast does not need this because it can use the DNS cache.", new String[]{"Balancer"}, this, "job", 1000, null, 5);
@@ -350,6 +354,12 @@ public final class CrawlStacker {
     public String urlInAcceptedDomain(final DigestURI url) {
         // returns null if the url can be accepted according to network.unit.domain, an error message otherwise
         if (url == null) return "url is null";
+        // check the domainList from the network definition
+        if (this.domainList != null) {
+            if (!this.domainList.isListed(url, null)) {
+                return "the url '" + url + "' is not in the domainList of this network";
+            }
+        }
         final boolean local = url.isLocal();
         if (this.acceptLocalURLs && local) return null;
         if (this.acceptGlobalURLs && !local) return null;
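A domain list itself uses the blacklist-style host/path syntax that FilterEngine.add() (last diff below) expects: one entry per line, lines starting with '#' are skipped, and entries without a '/' are ignored. A small illustrative list, with hypothetical hosts:

    # hosts that belong to this network
    *.example.org/.*
    www.example.net/.*
    wiki.example.net/wiki/.*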
diff --git a/source/de/anomic/search/Switchboard.java b/source/de/anomic/search/Switchboard.java
index 34c3353ca..2044f1fba 100644
--- a/source/de/anomic/search/Switchboard.java
+++ b/source/de/anomic/search/Switchboard.java
@@ -38,6 +38,7 @@ package de.anomic.search;
 
 import java.io.BufferedInputStream;
 import java.io.BufferedOutputStream;
+import java.io.BufferedReader;
 import java.io.ByteArrayInputStream;
 import java.io.ByteArrayOutputStream;
 import java.io.File;
@@ -47,6 +48,7 @@ import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.OutputStream;
+import java.io.Reader;
 import java.io.UnsupportedEncodingException;
 import java.net.MalformedURLException;
 import java.security.NoSuchAlgorithmException;
@@ -111,6 +113,7 @@ import net.yacy.kelondro.workflow.WorkflowJob;
 import net.yacy.kelondro.workflow.WorkflowProcessor;
 import net.yacy.kelondro.workflow.WorkflowThread;
 import net.yacy.repository.Blacklist;
+import net.yacy.repository.FilterEngine;
 import net.yacy.repository.LoaderDispatcher;
 
 import de.anomic.crawler.CrawlProfile;
@@ -224,6 +227,7 @@ public final class Switchboard extends serverSwitch {
     public TreeMap clusterhashes; // map of peerhash(String)/alternative-local-address as ip:port or only ip (String) or null if address in seed should be used
     public URLLicense licensedURLs;
     public List networkWhitelist, networkBlacklist;
+    public FilterEngine domainList;
     public Dispatcher dhtDispatcher;
     public LinkedBlockingQueue trail;
     public yacySeedDB peers;
@@ -553,7 +557,8 @@ public final class Switchboard extends serverSwitch {
                 this.indexSegments.segment(Segments.Process.LOCALCRAWLING),
                 this.peers,
                 isIntranetMode(),
-                isGlobalMode());
+                isGlobalMode(),
+                this.domainList); // Intranet and Global mode may both be true!
 
         // initializing dht chunk generation
         this.dhtMaxReferenceCount = (int) getConfigLong(SwitchboardConstants.INDEX_DIST_CHUNK_SIZE_START, 50);
@@ -819,6 +824,17 @@ public final class Switchboard extends serverSwitch {
         }
         */
         MultiProtocolURI.addBotInfo(getConfig(SwitchboardConstants.NETWORK_NAME, "") + (isRobinsonMode() ? "-" : "/") + getConfig("network.unit.domain", "global"));
+
+        try {
+            this.domainList = null;
+            Reader r = getConfigFileFromWebOrLocally(getConfig("network.unit.domainlist", ""), getAppPath().getAbsolutePath());
+            this.domainList = new FilterEngine();
+            this.domainList.loadList(new BufferedReader(r), null);
+        } catch (final FileNotFoundException e) {
+            // no domain list configured, or the list could not be found: run without one
+        } catch (final IOException e) {
+            // the domain list could not be read: run without one
+        }
     }
 
     public void switchNetwork(final String networkDefinition) {
@@ -902,13 +916,26 @@ public final class Switchboard extends serverSwitch {
 
         // create new web structure
         this.webStructure = new WebStructureGraph(log, rankingPath, "LOCAL/010_cr/", getConfig("CRDist0Path", CRDistribution.CR_OWN), new File(queuesRoot, "webStructure.map"));
+
+        try {
+            this.domainList = null;
+            Reader r = getConfigFileFromWebOrLocally(getConfig("network.unit.domainlist", ""), getAppPath().getAbsolutePath());
+            this.domainList = new FilterEngine();
+            this.domainList.loadList(new BufferedReader(r), null);
+        } catch (final FileNotFoundException e) {
+            // no domain list configured, or the list could not be found: run without one
+        } catch (final IOException e) {
+            // the domain list could not be read: run without one
+        }
+
         this.crawlStacker = new CrawlStacker(
                 this.crawlQueues,
                 this.crawler,
                 this.indexSegments.segment(Segments.Process.LOCALCRAWLING),
                 this.peers,
                 "local.any".indexOf(getConfig("network.unit.domain", "global")) >= 0,
-                "global.any".indexOf(getConfig("network.unit.domain", "global")) >= 0);
+                "global.any".indexOf(getConfig("network.unit.domain", "global")) >= 0,
+                this.domainList);
     }
 
     // start up crawl jobs
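In the network definition, network.unit.domainlist may name either a file (a path relative to the YaCy application directory, or an absolute path) or one or more comma-separated URLs, which getConfigFileFromWebOrLocally() in the next diff tries in order until one returns data. Illustrative values, with hypothetical locations:

    network.unit.domainlist = defaults/sciencenet.domains
    network.unit.domainlist = http://primary.example.net/domains.txt,http://mirror.example.net/domains.txt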
"-" : "/") + getConfig("network.unit.domain", "global")); + + try { + this.domainList = null; + Reader r = getConfigFileFromWebOrLocally(getConfig("network.unit.domainlist", ""), getAppPath().getAbsolutePath()); + this.domainList = new FilterEngine(); + this.domainList.loadList(new BufferedReader(r), null); + } catch (FileNotFoundException e) { + } catch (IOException e) { + } } public void switchNetwork(final String networkDefinition) { @@ -902,13 +916,24 @@ public final class Switchboard extends serverSwitch { // create new web structure this.webStructure = new WebStructureGraph(log, rankingPath, "LOCAL/010_cr/", getConfig("CRDist0Path", CRDistribution.CR_OWN), new File(queuesRoot, "webStructure.map")); + + try { + this.domainList = null; + Reader r = getConfigFileFromWebOrLocally(getConfig("network.unit.domainList", ""), getAppPath().getAbsolutePath()); + this.domainList = new FilterEngine(); + this.domainList.loadList(new BufferedReader(r), null); + } catch (FileNotFoundException e) { + } catch (IOException e) { + } + this.crawlStacker = new CrawlStacker( this.crawlQueues, this.crawler, this.indexSegments.segment(Segments.Process.LOCALCRAWLING), this.peers, "local.any".indexOf(getConfig("network.unit.domain", "global")) >= 0, - "global.any".indexOf(getConfig("network.unit.domain", "global")) >= 0); + "global.any".indexOf(getConfig("network.unit.domain", "global")) >= 0, + this.domainList); } // start up crawl jobs diff --git a/source/de/anomic/server/serverSwitch.java b/source/de/anomic/server/serverSwitch.java index 14628a1e1..d7c6f3fa0 100644 --- a/source/de/anomic/server/serverSwitch.java +++ b/source/de/anomic/server/serverSwitch.java @@ -21,8 +21,15 @@ package de.anomic.server; +import java.io.BufferedInputStream; +import java.io.BufferedReader; +import java.io.ByteArrayInputStream; import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileReader; import java.io.IOException; +import java.io.InputStreamReader; +import java.io.Reader; import java.net.InetAddress; import java.util.ArrayList; import java.util.Collection; @@ -31,12 +38,18 @@ import java.util.Map; import java.util.TreeMap; import java.util.concurrent.ConcurrentHashMap; +import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.protocol.Domains; +import net.yacy.cora.protocol.HeaderFramework; +import net.yacy.cora.protocol.RequestHeader; +import net.yacy.cora.protocol.http.HTTPClient; +import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.workflow.BusyThread; import net.yacy.kelondro.workflow.WorkflowThread; +import de.anomic.search.Switchboard; import de.anomic.server.serverAccessTracker.Track; import de.anomic.server.serverCore.Session; @@ -554,4 +567,39 @@ public class serverSwitch { return this.accessTracker.accessHosts(); } + /** + * Retrieve text data (e. g. 
diff --git a/source/net/yacy/repository/FilterEngine.java b/source/net/yacy/repository/FilterEngine.java
new file mode 100644
index 000000000..c17b978a4
--- /dev/null
+++ b/source/net/yacy/repository/FilterEngine.java
@@ -0,0 +1,287 @@
+package net.yacy.repository;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.util.EnumSet;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+import java.util.TreeSet;
+import java.util.Map.Entry;
+import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
+
+import net.yacy.cora.storage.HashARC;
+import net.yacy.kelondro.data.meta.DigestURI;
+import net.yacy.kelondro.logging.Log;
+
+/**
+ * a URL filter engine for black and white lists
+ *
+ * @TODO precompile regular expressions
+ *
+ */
+public class FilterEngine {
+
+    /** size of the URL cache */
+    protected static final int CACHE_SIZE = 100;
+
+    public static final int ERR_TWO_WILDCARDS_IN_HOST = 1;
+    public static final int ERR_SUBDOMAIN_XOR_WILDCARD = 2;
+    public static final int ERR_PATH_REGEX = 3;
+    public static final int ERR_WILDCARD_BEGIN_OR_END = 4;
+    public static final int ERR_HOST_WRONG_CHARS = 5;
+    public static final int ERR_DOUBLE_OCCURANCE = 6;
+    public static final int ERR_HOST_REGEX = 7;
+
+    protected enum listTypes { type1 };
+
+    protected class FilterEntry implements Comparable<FilterEntry> {
+        public String path;
+        public EnumSet<listTypes> types;
+
+        public FilterEntry(String path, EnumSet<listTypes> types) {
+            this.path = path;
+            this.types = types;
+        }
+
+        // entries are kept in a TreeSet, so they must be comparable
+        public int compareTo(final FilterEntry other) {
+            return this.path.compareTo(other.path);
+        }
+    }
+
+    protected HashARC<DigestURI, EnumSet<listTypes>> cachedUrlHashs = null;
+    protected HashMap<String, Set<FilterEntry>> hostpaths_matchable = null;
+    protected HashMap<String, Set<FilterEntry>> hostpaths_notmatchable = null;
+
+    public FilterEngine() {
+        // prepare the data structure
+        this.hostpaths_matchable = new HashMap<String, Set<FilterEntry>>();
+        this.hostpaths_notmatchable = new HashMap<String, Set<FilterEntry>>();
+        this.cachedUrlHashs = new HashARC<DigestURI, EnumSet<listTypes>>(CACHE_SIZE);
+    }
+
+    public void clear() {
+        this.cachedUrlHashs.clear();
+        this.hostpaths_matchable.clear();
+        this.hostpaths_notmatchable.clear();
+    }
+
+    public int size() {
+        return this.hostpaths_matchable.size() + this.hostpaths_notmatchable.size();
+    }
+
+    public void add(String entry, EnumSet<listTypes> types) {
+        assert entry != null;
+        int pos; // position between domain and path
+        if ((pos = entry.indexOf('/')) > 0) {
+            String host = entry.substring(0, pos).trim().toLowerCase();
+            String path = entry.substring(pos + 1).trim();
+
+            // avoid a PatternSyntaxException when the host is later used as a regex
+            if (!isMatchable(host) && host.length() > 0 && host.charAt(0) == '*')
+                host = "." + host;
+
+            if (isMatchable(host)) {
+                if (!hostpaths_matchable.containsKey(host))
+                    hostpaths_matchable.put(host, new TreeSet<FilterEntry>());
+                hostpaths_matchable.get(host).add(new FilterEntry(path, types));
+                // TODO: update the type if there is already an entry for this path
+            } else {
+                if (!hostpaths_notmatchable.containsKey(host))
+                    hostpaths_notmatchable.put(host, new TreeSet<FilterEntry>());
+                hostpaths_notmatchable.get(host).add(new FilterEntry(path, types));
+            }
+        }
+        // entries without a '/' are silently ignored
+    }
+
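+    // Illustrative entries (hypothetical hosts) and how add() stores them:
+    //   add("www.example.net/.*", null)             -> exact host, path regex ".*"
+    //   add("*.example.net/forum/.*", null)         -> wildcard host: all subdomains of example.net
+    //   add("example.*/.*", null)                   -> wildcard top-level domain
+    //   add("(news|blog)\\.example\\.net/.*", null) -> not 'matchable', stored as a plain regex host
+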
+    public void loadList(final BufferedReader in, EnumSet<listTypes> types) throws IOException {
+        String line;
+        while ((line = in.readLine()) != null) {
+            line = line.trim();
+            // skip empty lines and comment lines starting with '#'
+            if (line.length() > 0 && line.charAt(0) != '#')
+                this.add(line, types);
+        }
+    }
+
+    public void removeAll(final String host) {
+        assert host != null;
+        this.hostpaths_matchable.remove(host);
+        this.hostpaths_notmatchable.remove(host);
+    }
+
+    public void remove(final String listType, final String host, final String path) {
+        // TODO: removal of single entries is not implemented yet
+    }
+
+    public boolean isListed(final DigestURI url, final EnumSet<listTypes> type) {
+        // trivial answer
+        if (url.getHost() == null)
+            return false;
+
+        if (cachedUrlHashs.containsKey(url)) {
+            // cache hit; type == null means: listed for any type
+            EnumSet<listTypes> e = cachedUrlHashs.get(url);
+            return type == null || e.containsAll(type);
+        } else {
+            // cache miss; note: the cache is not filled here yet
+            return isListed(url.getHost().toLowerCase(), url.getFile(), type);
+        }
+    }
+
+    public static boolean isMatchable(final String host) {
+        try {
+            if (Pattern.matches("^[a-z0-9.-]*$", host)) // simple domain (yacy.net or www.yacy.net)
+                return true;
+            if (Pattern.matches("^\\*\\.[a-z0-9-.]*$", host)) // starts with *. (not .*, and a dot must follow the *)
+                return true;
+            if (Pattern.matches("^[a-z0-9-.]*\\.\\*$", host)) // ends with .* (not *., and a dot must precede the *)
+                return true;
+        } catch (final PatternSyntaxException e) {
+            //System.out.println(e.toString());
+            return false;
+        }
+        return false;
+    }
+
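+    // Illustrative results of isMatchable():
+    //   isMatchable("www.yacy.net") -> true  (plain host)
+    //   isMatchable("*.yacy.net")   -> true  (all subdomains)
+    //   isMatchable("yacy.*")       -> true  (any top-level domain)
+    //   isMatchable("(a|b)\\.example\\.net") -> false (treated as a plain regex by isListed())
+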
+    public boolean isListed(final String host, String path, EnumSet<listTypes> type) {
+        if (host == null) throw new NullPointerException();
+        if (path == null) throw new NullPointerException();
+
+        if (path.length() > 0 && path.charAt(0) == '/') path = path.substring(1);
+        Set<FilterEntry> app;
+
+        // try to match the complete domain
+        if ((app = hostpaths_matchable.get(host)) != null) {
+            for (FilterEntry e : app) {
+                if (e.path.indexOf("?*") > 0) {
+                    // prevent "Dangling meta character '*'" exception
+                    Log.logWarning("FilterEngine", "ignored blacklist path to prevent 'Dangling meta character' exception: " + e.path);
+                    continue;
+                }
+                if ((e.path.equals("*")) || (path.matches(e.path)))
+                    return true;
+            }
+        }
+        // next, try to match the domain with a wildcard '*'
+        // [TL] while more dots are found within the host string
+        int index = 0;
+        while ((index = host.indexOf('.', index + 1)) != -1) {
+            if ((app = hostpaths_matchable.get(host.substring(0, index + 1) + "*")) != null) {
+                for (FilterEntry e : app) {
+                    if ((e.path.equals("*")) || (path.matches(e.path)))
+                        return true;
+                }
+            }
+            if ((app = hostpaths_matchable.get(host.substring(0, index))) != null) {
+                for (FilterEntry e : app) {
+                    if ((e.path.equals("*")) || (path.matches(e.path)))
+                        return true;
+                }
+            }
+        }
+        index = host.length();
+        while ((index = host.lastIndexOf('.', index - 1)) != -1) {
+            if ((app = hostpaths_matchable.get("*" + host.substring(index, host.length()))) != null) {
+                for (FilterEntry e : app) {
+                    if ((e.path.equals("*")) || (path.matches(e.path)))
+                        return true;
+                }
+            }
+            if ((app = hostpaths_matchable.get(host.substring(index + 1, host.length()))) != null) {
+                for (FilterEntry e : app) {
+                    if ((e.path.equals("*")) || (path.matches(e.path)))
+                        return true;
+                }
+            }
+        }
+
+        // loop over all regex entries
+        for (final Entry<String, Set<FilterEntry>> entry : hostpaths_notmatchable.entrySet()) {
+            try {
+                if (Pattern.matches(entry.getKey(), host)) {
+                    app = entry.getValue();
+                    for (FilterEntry e : app) {
+                        if (Pattern.matches(e.path, path))
+                            return true;
+                    }
+                }
+            } catch (final PatternSyntaxException e) {
+                Log.logWarning("FilterEngine", "invalid regex in filter list: " + e.toString());
+            }
+        }
+        return false;
+    }
+
+    public int checkError(String element, Map<String, String> properties) {
+
+        boolean allowRegex = true;
+        int slashPos;
+        String host, path;
+
+        if (properties != null) {
+            allowRegex = "true".equalsIgnoreCase(properties.get("allowRegex"));
+        }
+
+        if ((slashPos = element.indexOf('/')) == -1) {
+            host = element;
+            path = ".*";
+        } else {
+            host = element.substring(0, slashPos);
+            path = element.substring(slashPos + 1);
+        }
+
+        if (!allowRegex || !isValidRegex(host)) {
+            final int i = host.indexOf('*');
+
+            // check whether the host begins illegally
+            if (!host.matches("([A-Za-z0-9_-]+|\\*)(\\.([A-Za-z0-9_-]+|\\*))*")) {
+                if (i == 0 && host.length() > 1 && host.charAt(1) != '.') {
+                    return ERR_SUBDOMAIN_XOR_WILDCARD;
+                }
+                return ERR_HOST_WRONG_CHARS;
+            }
+
+            // in the host part, only complete sub-domains may be wildcards
+            if (host.length() > 0 && i > -1) {
+                if (!(i == 0 || i == host.length() - 1)) {
+                    return ERR_WILDCARD_BEGIN_OR_END;
+                }
+
+                if (i == host.length() - 1 && host.length() > 1 && host.charAt(i - 1) != '.') {
+                    return ERR_SUBDOMAIN_XOR_WILDCARD;
+                }
+            }
+
+            // check for a double occurrence of "*" in the host
+            if (host.indexOf("*", i + 1) > -1) {
+                return ERR_TWO_WILDCARDS_IN_HOST;
+            }
+        } else if (allowRegex && !isValidRegex(host)) {
+            return ERR_HOST_REGEX;
+        }
+
+        // check for errors in the regex compilation of the path
+        if (!isValidRegex(path) && !path.equals("*")) {
+            return ERR_PATH_REGEX;
+        }
+
+        return 0;
+    }
+
+    /**
+     * Checks if a given expression is a valid regular expression.
+     * @param expression The expression to be checked.
+     * @return True if the expression is a valid regular expression, else false.
+     */
+    private static boolean isValidRegex(String expression) {
+        boolean ret = true;
+        try {
+            Pattern.compile(expression);
+        } catch (final PatternSyntaxException e) {
+            ret = false;
+        }
+        return ret;
+    }
+
+}
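Taken together, the new code can be exercised like this — an illustrative, self-contained sketch (hypothetical hosts; assumes the classes from this patch on the classpath):

    import java.io.BufferedReader;
    import java.io.StringReader;

    import net.yacy.kelondro.data.meta.DigestURI;
    import net.yacy.repository.FilterEngine;

    public class DomainListDemo {
        public static void main(final String[] args) throws Exception {
            // a list as it could be referenced by network.unit.domainlist
            final String list = "# demo list\n" + "*.example.org/.*\n" + "www.example.net/.*\n";
            final FilterEngine domainList = new FilterEngine();
            domainList.loadList(new BufferedReader(new StringReader(list)), null);

            // CrawlStacker.urlInAcceptedDomain() rejects URLs for which isListed() is false
            System.out.println(domainList.isListed(new DigestURI("http://wiki.example.org/index.html"), null)); // true
            System.out.println(domainList.isListed(new DigestURI("http://www.example.com/"), null));           // false
        }
    }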