* add option to network definition to provide a domainlist (syntax like in blacklists)

* crawler and search allow only urls matching one in domainlist (if list is provided)
* this may be useful to prevent dedicated networks from being "polluted"
* FilterEngine is an improved Blacklist object; Blacklist may inherit from FilterEngine in the future

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7285 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
f1ori 15 years ago
parent 2db6ca8104
commit def4253555

@ -31,6 +31,7 @@
# network.unit.name = <any word, name of network; appears in network graphic> # network.unit.name = <any word, name of network; appears in network graphic>
# network.unit.description = <any string, just informal; appears in network graphic> # network.unit.description = <any string, just informal; appears in network graphic>
# network.unit.domain = 'global'|'local'|'any' # network.unit.domain = 'global'|'local'|'any'
# network.unit.domainlist = <url or file path relative to the yacy root path: a text file in blacklist syntax; only urls matching this list are indexed and searchable>
# network.unit.dhtredundancy = <integer number, 0 means no DHT enabled> # network.unit.dhtredundancy = <integer number, 0 means no DHT enabled>
# network.unit.dht.partitionExponent = <integer number, 0 means no DHT parition, 1 is partition in two, 2 is partition in four and so on> # network.unit.dht.partitionExponent = <integer number, 0 means no DHT parition, 1 is partition in two, 2 is partition in four and so on>
# network.unit.bootstrap.seedlist<n> = <an url to a seedlists-file, which is stored by a principal peer> # network.unit.bootstrap.seedlist<n> = <an url to a seedlists-file, which is stored by a principal peer>

@ -40,6 +40,7 @@ import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Base64Order; import net.yacy.kelondro.order.Base64Order;
import net.yacy.kelondro.workflow.WorkflowProcessor; import net.yacy.kelondro.workflow.WorkflowProcessor;
import net.yacy.repository.Blacklist; import net.yacy.repository.Blacklist;
import net.yacy.repository.FilterEngine;
import de.anomic.crawler.retrieval.Request; import de.anomic.crawler.retrieval.Request;
import de.anomic.search.Segment; import de.anomic.search.Segment;
@ -58,6 +59,7 @@ public final class CrawlStacker {
private final Segment indexSegment; private final Segment indexSegment;
private final yacySeedDB peers; private final yacySeedDB peers;
private final boolean acceptLocalURLs, acceptGlobalURLs; private final boolean acceptLocalURLs, acceptGlobalURLs;
private final FilterEngine domainList;
// this is the process that checks url for double-occurrences and for allowance/disallowance by robots.txt // this is the process that checks url for double-occurrences and for allowance/disallowance by robots.txt
@ -67,7 +69,8 @@ public final class CrawlStacker {
Segment indexSegment, Segment indexSegment,
yacySeedDB peers, yacySeedDB peers,
boolean acceptLocalURLs, boolean acceptLocalURLs,
boolean acceptGlobalURLs) { boolean acceptGlobalURLs,
FilterEngine domainList) {
this.nextQueue = cq; this.nextQueue = cq;
this.crawler = cs; this.crawler = cs;
this.indexSegment = indexSegment; this.indexSegment = indexSegment;
@ -76,6 +79,7 @@ public final class CrawlStacker {
this.dnsMiss = 0; this.dnsMiss = 0;
this.acceptLocalURLs = acceptLocalURLs; this.acceptLocalURLs = acceptLocalURLs;
this.acceptGlobalURLs = acceptGlobalURLs; this.acceptGlobalURLs = acceptGlobalURLs;
this.domainList = domainList;
this.fastQueue = new WorkflowProcessor<Request>("CrawlStackerFast", "This process checks new urls before they are enqueued into the balancer (proper, double-check, correct domain, filter)", new String[]{"Balancer"}, this, "job", 10000, null, 2); this.fastQueue = new WorkflowProcessor<Request>("CrawlStackerFast", "This process checks new urls before they are enqueued into the balancer (proper, double-check, correct domain, filter)", new String[]{"Balancer"}, this, "job", 10000, null, 2);
this.slowQueue = new WorkflowProcessor<Request>("CrawlStackerSlow", "This is like CrawlStackerFast, but does additionaly a DNS lookup. The CrawlStackerFast does not need this because it can use the DNS cache.", new String[]{"Balancer"}, this, "job", 1000, null, 5); this.slowQueue = new WorkflowProcessor<Request>("CrawlStackerSlow", "This is like CrawlStackerFast, but does additionaly a DNS lookup. The CrawlStackerFast does not need this because it can use the DNS cache.", new String[]{"Balancer"}, this, "job", 1000, null, 5);
@ -350,6 +354,12 @@ public final class CrawlStacker {
public String urlInAcceptedDomain(final DigestURI url) { public String urlInAcceptedDomain(final DigestURI url) {
// returns true if the url can be accepted according to network.unit.domain // returns true if the url can be accepted according to network.unit.domain
if (url == null) return "url is null"; if (url == null) return "url is null";
// check domainList from network-definition
if(this.domainList != null) {
if(!this.domainList.isListed(url, null)) {
return "the url '" + url + "' is not in domainList of this network";
}
}
final boolean local = url.isLocal(); final boolean local = url.isLocal();
if (this.acceptLocalURLs && local) return null; if (this.acceptLocalURLs && local) return null;
if (this.acceptGlobalURLs && !local) return null; if (this.acceptGlobalURLs && !local) return null;

@ -38,6 +38,7 @@ package de.anomic.search;
import java.io.BufferedInputStream; import java.io.BufferedInputStream;
import java.io.BufferedOutputStream; import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream; import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream; import java.io.ByteArrayOutputStream;
import java.io.File; import java.io.File;
@ -47,6 +48,7 @@ import java.io.FileOutputStream;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.io.OutputStream; import java.io.OutputStream;
import java.io.Reader;
import java.io.UnsupportedEncodingException; import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException; import java.net.MalformedURLException;
import java.security.NoSuchAlgorithmException; import java.security.NoSuchAlgorithmException;
@ -111,6 +113,7 @@ import net.yacy.kelondro.workflow.WorkflowJob;
import net.yacy.kelondro.workflow.WorkflowProcessor; import net.yacy.kelondro.workflow.WorkflowProcessor;
import net.yacy.kelondro.workflow.WorkflowThread; import net.yacy.kelondro.workflow.WorkflowThread;
import net.yacy.repository.Blacklist; import net.yacy.repository.Blacklist;
import net.yacy.repository.FilterEngine;
import net.yacy.repository.LoaderDispatcher; import net.yacy.repository.LoaderDispatcher;
import de.anomic.crawler.CrawlProfile; import de.anomic.crawler.CrawlProfile;
@ -224,6 +227,7 @@ public final class Switchboard extends serverSwitch {
public TreeMap<byte[], String> clusterhashes; // map of peerhash(String)/alternative-local-address as ip:port or only ip (String) or null if address in seed should be used public TreeMap<byte[], String> clusterhashes; // map of peerhash(String)/alternative-local-address as ip:port or only ip (String) or null if address in seed should be used
public URLLicense licensedURLs; public URLLicense licensedURLs;
public List<Pattern> networkWhitelist, networkBlacklist; public List<Pattern> networkWhitelist, networkBlacklist;
public FilterEngine domainList;
public Dispatcher dhtDispatcher; public Dispatcher dhtDispatcher;
public LinkedBlockingQueue<String> trail; public LinkedBlockingQueue<String> trail;
public yacySeedDB peers; public yacySeedDB peers;
@ -553,7 +557,8 @@ public final class Switchboard extends serverSwitch {
this.indexSegments.segment(Segments.Process.LOCALCRAWLING), this.indexSegments.segment(Segments.Process.LOCALCRAWLING),
this.peers, this.peers,
isIntranetMode(), isIntranetMode(),
isGlobalMode()); // Intranet and Global mode may be both true! isGlobalMode(),
this.domainList); // Intranet and Global mode may be both true!
// initializing dht chunk generation // initializing dht chunk generation
this.dhtMaxReferenceCount = (int) getConfigLong(SwitchboardConstants.INDEX_DIST_CHUNK_SIZE_START, 50); this.dhtMaxReferenceCount = (int) getConfigLong(SwitchboardConstants.INDEX_DIST_CHUNK_SIZE_START, 50);
@ -819,6 +824,15 @@ public final class Switchboard extends serverSwitch {
} }
*/ */
MultiProtocolURI.addBotInfo(getConfig(SwitchboardConstants.NETWORK_NAME, "") + (isRobinsonMode() ? "-" : "/") + getConfig("network.unit.domain", "global")); MultiProtocolURI.addBotInfo(getConfig(SwitchboardConstants.NETWORK_NAME, "") + (isRobinsonMode() ? "-" : "/") + getConfig("network.unit.domain", "global"));
// load the network-definition domain list: if such a list is configured,
// crawler and search accept only urls matching it (best effort — any
// failure leaves domainList == null, i.e. filtering disabled)
try {
this.domainList = null;
Reader r = getConfigFileFromWebOrLocally(getConfig("network.unit.domainlist", ""), getAppPath().getAbsolutePath());
this.domainList = new FilterEngine();
this.domainList.loadList(new BufferedReader(r), null);
} catch (FileNotFoundException e) {
// no domain list configured or none of the sources reachable: keep filtering disabled
} catch (IOException e) {
// domain list could not be read: keep filtering disabled
}
} }
public void switchNetwork(final String networkDefinition) { public void switchNetwork(final String networkDefinition) {
@ -902,13 +916,24 @@ public final class Switchboard extends serverSwitch {
// create new web structure // create new web structure
this.webStructure = new WebStructureGraph(log, rankingPath, "LOCAL/010_cr/", getConfig("CRDist0Path", CRDistribution.CR_OWN), new File(queuesRoot, "webStructure.map")); this.webStructure = new WebStructureGraph(log, rankingPath, "LOCAL/010_cr/", getConfig("CRDist0Path", CRDistribution.CR_OWN), new File(queuesRoot, "webStructure.map"));
// reload the network-definition domain list for the new network (best
// effort — any failure leaves domainList == null, i.e. filtering disabled)
try {
    this.domainList = null;
    // BUGFIX: the config key is "network.unit.domainlist" (all lower-case),
    // as documented in the network definition and used at startup; the
    // previous key "network.unit.domainList" never matched, so the list was
    // never loaded when switching networks
    final Reader r = getConfigFileFromWebOrLocally(getConfig("network.unit.domainlist", ""), getAppPath().getAbsolutePath());
    this.domainList = new FilterEngine();
    this.domainList.loadList(new BufferedReader(r), null);
} catch (final FileNotFoundException e) {
    // no domain list configured or none of the sources reachable: keep filtering disabled
} catch (final IOException e) {
    // domain list could not be read: keep filtering disabled
}
this.crawlStacker = new CrawlStacker( this.crawlStacker = new CrawlStacker(
this.crawlQueues, this.crawlQueues,
this.crawler, this.crawler,
this.indexSegments.segment(Segments.Process.LOCALCRAWLING), this.indexSegments.segment(Segments.Process.LOCALCRAWLING),
this.peers, this.peers,
"local.any".indexOf(getConfig("network.unit.domain", "global")) >= 0, "local.any".indexOf(getConfig("network.unit.domain", "global")) >= 0,
"global.any".indexOf(getConfig("network.unit.domain", "global")) >= 0); "global.any".indexOf(getConfig("network.unit.domain", "global")) >= 0,
this.domainList);
} }
// start up crawl jobs // start up crawl jobs

@ -21,8 +21,15 @@
package de.anomic.server; package de.anomic.server;
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.File; import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException; import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.InetAddress; import java.net.InetAddress;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collection; import java.util.Collection;
@ -31,12 +38,18 @@ import java.util.Map;
import java.util.TreeMap; import java.util.TreeMap;
import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentHashMap;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.http.HTTPClient;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.workflow.BusyThread; import net.yacy.kelondro.workflow.BusyThread;
import net.yacy.kelondro.workflow.WorkflowThread; import net.yacy.kelondro.workflow.WorkflowThread;
import de.anomic.search.Switchboard;
import de.anomic.server.serverAccessTracker.Track; import de.anomic.server.serverAccessTracker.Track;
import de.anomic.server.serverCore.Session; import de.anomic.server.serverCore.Session;
@ -554,4 +567,39 @@ public class serverSwitch {
return this.accessTracker.accessHosts(); return this.accessTracker.accessHosts();
} }
/**
 * Retrieve text data (e.g. a config file) from the web or from a local file.
 *
 * @param uri either an http(s) url — which may be a comma-separated list of
 *        alternative urls that are tried in order — or a filename; relative
 *        filenames are resolved against rootPath, absolute ones (leading '/')
 *        are used as-is
 * @param rootPath search path for relative file names
 * @return a Reader on the first source that could be opened
 * @throws FileNotFoundException if no source could be opened
 * @throws IOException on other I/O failures
 */
public Reader getConfigFileFromWebOrLocally(String uri, String rootPath) throws IOException, FileNotFoundException {
    if (uri.startsWith("http://") || uri.startsWith("https://")) {
        // the uri may name several alternative sources, separated by comma
        final String[] uris = uri.split(",");
        for (String netdef : uris) {
            netdef = netdef.trim();
            try {
                final RequestHeader reqHeader = new RequestHeader();
                reqHeader.put(HeaderFramework.USER_AGENT, MultiProtocolURI.yacybotUserAgent);
                final HTTPClient client = new HTTPClient();
                client.setHeader(reqHeader.entrySet());
                // BUGFIX: fetch the single trimmed entry, not the whole
                // comma-joined list — previously every retry requested the
                // identical (broken) composite string
                final byte[] data = client.GETbytes(netdef);
                if (data == null || data.length == 0) continue;
                // NOTE(review): decodes with the platform default charset —
                // confirm whether the lists are required to be UTF-8
                return new InputStreamReader(new ByteArrayInputStream(data));
            } catch (final Exception e) {
                continue; // this source failed, try the next alternative
            }
        }
        throw new FileNotFoundException("no reachable source in '" + uri + "'");
    }
    final File f = (uri.length() > 0 && uri.charAt(0) == '/') ? new File(uri) : new File(rootPath, uri);
    if (!f.exists()) {
        throw new FileNotFoundException(f.toString());
    }
    return new FileReader(f);
}
} }

@ -0,0 +1,287 @@
package net.yacy.repository;
import java.io.BufferedReader;
import java.io.IOException;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.Map.Entry;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import net.yacy.cora.storage.HashARC;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
/**
 * A URL filter engine for black and white lists.
 * <p>
 * Entries use blacklist syntax: {@code host/pathregex}, where the host part is
 * either a plain domain, a wildcard domain ({@code *.example.org} or
 * {@code example.*}) or a regular expression. A bare {@code host} line
 * (no '/') matches every path on that host.
 *
 * @TODO precompile regular expressions
 */
public class FilterEngine {

    /** size of the URL lookup cache */
    protected static final int CACHE_SIZE = 100;

    public static final int ERR_TWO_WILDCARDS_IN_HOST = 1;
    public static final int ERR_SUBDOMAIN_XOR_WILDCARD = 2;
    public static final int ERR_PATH_REGEX = 3;
    public static final int ERR_WILDCARD_BEGIN_OR_END = 4;
    public static final int ERR_HOST_WRONG_CHARS = 5;
    public static final int ERR_DOUBLE_OCCURANCE = 6;
    public static final int ERR_HOST_REGEX = 7;

    protected enum listTypes { type1 };

    /**
     * One filter entry: a path pattern plus the set of list types it belongs to.
     * Implements Comparable (ordered by path) because entries are stored in a
     * TreeSet; without it every insertion threw a ClassCastException.
     */
    protected class FilterEntry implements Comparable<FilterEntry> {
        public String path;
        public EnumSet<listTypes> types;

        public FilterEntry(String path, EnumSet<listTypes> types) {
            this.path = path;
            this.types = types;
        }

        public int compareTo(final FilterEntry other) {
            return this.path.compareTo(other.path);
        }

        @Override
        public boolean equals(final Object other) {
            return other instanceof FilterEntry && this.path.equals(((FilterEntry) other).path);
        }

        @Override
        public int hashCode() {
            return this.path.hashCode();
        }
    }

    /** lookup cache; maps a URL to the list types it is listed under.
     *  NOTE(review): the cache is read in isListed() but never populated here —
     *  confirm whether a subclass fills it or whether population is still TODO */
    protected HashARC<DigestURI, EnumSet<listTypes>> cachedUrlHashs = null;
    /** entries whose host part is a plain or wildcard domain (see isMatchable) */
    protected HashMap<String, Set<FilterEntry>> hostpaths_matchable = null;
    /** entries whose host part is treated as a regular expression */
    protected HashMap<String, Set<FilterEntry>> hostpaths_notmatchable = null;

    public FilterEngine() {
        // prepare the data structure
        this.hostpaths_matchable = new HashMap<String, Set<FilterEntry>>();
        this.hostpaths_notmatchable = new HashMap<String, Set<FilterEntry>>();
        this.cachedUrlHashs = new HashARC<DigestURI, EnumSet<listTypes>>(CACHE_SIZE);
    }

    /** remove all entries and clear the lookup cache */
    public void clear() {
        this.cachedUrlHashs.clear();
        this.hostpaths_matchable.clear();
        this.hostpaths_notmatchable.clear();
    }

    /** @return the number of distinct host keys (not the number of entries) */
    public int size() {
        return this.hostpaths_matchable.size() + this.hostpaths_notmatchable.size();
    }

    /**
     * Add one filter entry.
     *
     * @param entry "host/pathregex", or a bare "host" which matches all paths
     * @param types list types this entry belongs to (may be null)
     */
    public void add(String entry, EnumSet<listTypes> types) {
        assert entry != null;
        final int pos = entry.indexOf('/'); // position between domain and path
        String host, path;
        if (pos > 0) {
            host = entry.substring(0, pos).trim().toLowerCase();
            path = entry.substring(pos + 1).trim();
        } else if (pos < 0) {
            // BUGFIX/generalization: a bare host line (the common case in a
            // domain list) was silently dropped before; treat it as "host/*"
            host = entry.trim().toLowerCase();
            path = "*";
        } else {
            return; // entry starts with '/': no host part, nothing to add
        }
        if (host.length() == 0) return;
        // avoid a PatternSyntaxException for hosts like "*foo":
        // turn the leading '*' into the regex ".*foo"
        if (!isMatchable(host) && host.charAt(0) == '*')
            host = "." + host;
        final HashMap<String, Set<FilterEntry>> target =
                isMatchable(host) ? this.hostpaths_matchable : this.hostpaths_notmatchable;
        Set<FilterEntry> entries = target.get(host);
        if (entries == null) {
            entries = new TreeSet<FilterEntry>();
            target.put(host, entries);
        }
        entries.add(new FilterEntry(path, types));
        // TODO: merge types if an entry with the same path already exists
    }

    /**
     * Load entries from a reader, one entry per line; empty lines and lines
     * starting with '#' are ignored.
     */
    public void loadList(final BufferedReader in, final EnumSet<listTypes> types) throws IOException {
        String line;
        while ((line = in.readLine()) != null) {
            line = line.trim();
            if (line.length() > 0 && line.charAt(0) != '#')
                this.add(line, types);
        }
    }

    /** remove every entry registered for the given host key */
    public void removeAll(final String host) {
        assert host != null;
        this.hostpaths_matchable.remove(host);
        this.hostpaths_notmatchable.remove(host);
    }

    /** not implemented yet */
    public void remove(final String listType, final String host, final String path) {
    }

    /**
     * Check whether a URL is matched by this filter.
     *
     * @param url  the url to check
     * @param type the list types to check against; null means "any type"
     */
    public boolean isListed(final DigestURI url, final EnumSet<listTypes> type) {
        // trivial answer
        if (url.getHost() == null)
            return false;
        final EnumSet<listTypes> cached = this.cachedUrlHashs.get(url);
        if (cached != null) {
            // cache hit; BUGFIX: a null type (meaning "any", as passed by
            // CrawlStacker) used to throw a NullPointerException here
            return type == null || cached.containsAll(type);
        }
        // cache miss
        return isListed(url.getHost().toLowerCase(), url.getFile(), type);
    }

    /**
     * @return true if the host is a plain domain or a single-wildcard domain
     *         (and can therefore be used as a hash key), false if it must be
     *         treated as a regular expression
     */
    public static boolean isMatchable(final String host) {
        try {
            if (Pattern.matches("^[a-z0-9.-]*$", host)) // simple domain (yacy.net or www.yacy.net)
                return true;
            if (Pattern.matches("^\\*\\.[a-z0-9-.]*$", host)) // starts with "*." (not ".*", and '*' must be followed by a dot)
                return true;
            if (Pattern.matches("^[a-z0-9-.]*\\.\\*$", host)) // ends with ".*" (not "*.", and '*' must be preceded by a dot)
                return true;
        } catch (final PatternSyntaxException e) {
            return false;
        }
        return false;
    }

    /**
     * Check whether a host/path pair is matched by this filter.
     * NOTE(review): the type parameter is currently unused in this overload —
     * confirm whether type-filtering should also apply here.
     */
    public boolean isListed(final String host, String path, EnumSet<listTypes> type) {
        if (host == null) throw new NullPointerException();
        if (path == null) throw new NullPointerException();
        if (path.length() > 0 && path.charAt(0) == '/') path = path.substring(1);
        Set<FilterEntry> app;

        // try to match the complete domain
        if ((app = this.hostpaths_matchable.get(host)) != null) {
            for (final FilterEntry e : app) {
                if (e.path.indexOf("?*") > 0) {
                    // prevent "Dangling meta character '*'" exception
                    Log.logWarning("FilterEngine", "ignored blacklist path to prevent 'Dangling meta character' exception: " + e);
                    continue;
                }
                if ((e.path.equals("*")) || (path.matches(e.path)))
                    return true;
            }
        }
        // try to match the domain with trailing wildcard '*':
        // for every '.' found, check "prefix.*" and the bare prefix
        int index = 0;
        while ((index = host.indexOf('.', index + 1)) != -1) {
            if ((app = this.hostpaths_matchable.get(host.substring(0, index + 1) + "*")) != null) {
                for (final FilterEntry e : app) {
                    if ((e.path.equals("*")) || (path.matches(e.path)))
                        return true;
                }
            }
            if ((app = this.hostpaths_matchable.get(host.substring(0, index))) != null) {
                for (final FilterEntry e : app) {
                    if ((e.path.equals("*")) || (path.matches(e.path)))
                        return true;
                }
            }
        }
        // try to match the domain with leading wildcard '*':
        // for every '.' found (from the right), check "*suffix" and the bare suffix
        index = host.length();
        while ((index = host.lastIndexOf('.', index - 1)) != -1) {
            if ((app = this.hostpaths_matchable.get("*" + host.substring(index, host.length()))) != null) {
                for (final FilterEntry e : app) {
                    if ((e.path.equals("*")) || (path.matches(e.path)))
                        return true;
                }
            }
            if ((app = this.hostpaths_matchable.get(host.substring(index + 1, host.length()))) != null) {
                for (final FilterEntry e : app) {
                    if ((e.path.equals("*")) || (path.matches(e.path)))
                        return true;
                }
            }
        }

        // loop over all regex entries
        for (final Entry<String, Set<FilterEntry>> entry : this.hostpaths_notmatchable.entrySet()) {
            try {
                if (Pattern.matches(entry.getKey(), host)) {
                    app = entry.getValue();
                    for (final FilterEntry e : app) {
                        if (Pattern.matches(e.path, path))
                            return true;
                    }
                }
            } catch (final PatternSyntaxException e) {
                // a malformed stored pattern must not break the whole lookup
                Log.logWarning("FilterEngine", "malformed host pattern '" + entry.getKey() + "': " + e.toString());
            }
        }
        return false;
    }

    /**
     * Validate a filter entry before it is added.
     *
     * @param element    entry in "host/path" syntax (path defaults to ".*")
     * @param properties optional settings; "allowRegex"="true" permits regex hosts
     * @return 0 if the entry is well-formed, otherwise one of the ERR_* codes
     */
    public int checkError(String element, Map<String, String> properties) {
        boolean allowRegex = true;
        int slashPos;
        String host, path;

        if (properties != null) {
            // BUGFIX: a missing "allowRegex" key used to throw a NullPointerException
            final String allow = properties.get("allowRegex");
            if (allow != null) allowRegex = allow.equalsIgnoreCase("true");
        }
        if ((slashPos = element.indexOf('/')) == -1) {
            host = element;
            path = ".*";
        } else {
            host = element.substring(0, slashPos);
            path = element.substring(slashPos + 1);
        }

        if (!allowRegex || !isValidRegex(host)) {
            final int i = host.indexOf('*');

            // check whether the host contains illegal characters
            if (!host.matches("([A-Za-z0-9_-]+|\\*)(\\.([A-Za-z0-9_-]+|\\*))*")) {
                if (i == 0 && host.length() > 1 && host.charAt(1) != '.') {
                    return ERR_SUBDOMAIN_XOR_WILDCARD;
                }
                return ERR_HOST_WRONG_CHARS;
            }

            // in the host part only full sub-domains may be wildcards
            if (host.length() > 0 && i > -1) {
                if (!(i == 0 || i == host.length() - 1)) {
                    return ERR_WILDCARD_BEGIN_OR_END;
                }
                if (i == host.length() - 1 && host.length() > 1 && host.charAt(i - 1) != '.') {
                    return ERR_SUBDOMAIN_XOR_WILDCARD;
                }
            }

            // check for a second occurrence of '*' in the host
            if (host.indexOf("*", i + 1) > -1) {
                return ERR_TWO_WILDCARDS_IN_HOST;
            }
        }
        // NOTE(review): the original "else if (allowRegex && !isValidRegex(host))
        // return ERR_HOST_REGEX;" branch was unreachable (an invalid regex always
        // entered the branch above) and has been removed; ERR_HOST_REGEX is kept
        // for API compatibility.

        // check for errors when compiling the path regex
        if (!isValidRegex(path) && !path.equals("*")) {
            return ERR_PATH_REGEX;
        }

        return 0;
    }

    /**
     * Checks if a given expression is a valid regular expression.
     *
     * @param expression the expression to be checked
     * @return true if the expression compiles as a regular expression
     */
    private static boolean isValidRegex(String expression) {
        try {
            Pattern.compile(expression);
            return true;
        } catch (final PatternSyntaxException e) {
            return false;
        }
    }
}
Loading…
Cancel
Save