From 5e8038ac4d7af374f58c7a0b64663e26bea67002 Mon Sep 17 00:00:00 2001 From: orbiter Date: Wed, 21 Oct 2009 20:14:30 +0000 Subject: [PATCH] - refactoring of blacklists - refactoring of event origin encoding git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6434 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/BlacklistCleaner_p.java | 45 +- htroot/BlacklistTest_p.java | 2 +- htroot/Blacklist_p.java | 8 +- htroot/CrawlResults.java | 27 +- htroot/IndexControlRWIs_p.java | 6 +- htroot/Supporter.java | 2 +- htroot/Surftips.java | 2 +- htroot/ViewFile.java | 2 +- htroot/api/blacklists_p.java | 5 +- htroot/sharedBlacklist_p.java | 6 +- htroot/yacy/crawlReceipt.java | 3 +- htroot/yacy/transferRWI.java | 2 +- htroot/yacy/transferURL.java | 5 +- source/de/anomic/crawler/CrawlStacker.java | 2 +- source/de/anomic/crawler/ResultURLs.java | 126 +-- .../anomic/crawler/retrieval/EventOrigin.java | 34 + .../anomic/crawler/retrieval/HTTPLoader.java | 2 +- .../de/anomic/crawler/retrieval/Response.java | 11 +- source/de/anomic/data/Blacklist.java | 102 --- source/de/anomic/data/DefaultBlacklist.java | 194 ----- source/de/anomic/data/listManager.java | 14 +- .../anomic/http/server/HTTPDProxyHandler.java | 2 +- .../http/server/servlets/transferURL.java | 5 +- .../de/anomic/search/MetadataRepository.java | 2 +- source/de/anomic/search/Segment.java | 2 +- source/de/anomic/search/Switchboard.java | 30 +- .../anomic/search/SwitchboardConstants.java | 11 - source/de/anomic/yacy/yacyClient.java | 5 +- source/de/anomic/yacy/yacyNewsPool.java | 2 +- source/de/anomic/yacy/yacySearch.java | 2 +- .../yacy/repository/Blacklist.java} | 778 +++++++++++------- source/net/yacy/repository/BlacklistFile.java | 58 ++ .../net/yacy/repository/LoaderDispatcher.java | 5 - 33 files changed, 693 insertions(+), 809 deletions(-) create mode 100644 source/de/anomic/crawler/retrieval/EventOrigin.java delete mode 100644 source/de/anomic/data/Blacklist.java delete mode 100644 source/de/anomic/data/DefaultBlacklist.java rename source/{de/anomic/data/AbstractBlacklist.java => net/yacy/repository/Blacklist.java} (65%) create mode 100644 source/net/yacy/repository/BlacklistFile.java diff --git a/htroot/BlacklistCleaner_p.java b/htroot/BlacklistCleaner_p.java index 0dea13995..c8fc5b420 100644 --- a/htroot/BlacklistCleaner_p.java +++ b/htroot/BlacklistCleaner_p.java @@ -43,17 +43,16 @@ import java.util.Map.Entry; import java.util.regex.Matcher; import java.util.regex.Pattern; -import de.anomic.data.AbstractBlacklist; -import de.anomic.data.Blacklist; -import de.anomic.data.DefaultBlacklist; import de.anomic.data.listManager; import de.anomic.http.server.RequestHeader; +import de.anomic.search.SearchEventCache; import de.anomic.search.Switchboard; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; import java.util.Set; import net.yacy.kelondro.logging.Log; +import net.yacy.repository.Blacklist; public class BlacklistCleaner_p { @@ -65,7 +64,7 @@ public class BlacklistCleaner_p { private final static String BLACKLIST_FILENAME_FILTER = "^.*\\.black$"; public static final Class[] supportedBLEngines = { - DefaultBlacklist.class + Blacklist.class }; public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) { @@ -77,7 +76,7 @@ public class BlacklistCleaner_p { String blacklistToUse = null; // get the list of supported blacklist types - final String supportedBlacklistTypesStr = AbstractBlacklist.BLACKLIST_TYPES_STRING; + final String supportedBlacklistTypesStr = 
Blacklist.BLACKLIST_TYPES_STRING; final String[] supportedBlacklistTypes = supportedBlacklistTypesStr.split(","); prop.put(DISABLED+"checked", "1"); @@ -245,26 +244,23 @@ public class BlacklistCleaner_p { final List list = listManager.getListArray(new File(listManager.listsPath, blacklistToUse)); final Map properties= new HashMap(); properties.put("allowRegex", String.valueOf(allowRegex)); - - if (blEngine instanceof AbstractBlacklist) { - int err = 0; + int err = 0; - for (String element : list) { - element = element.trim(); - - // check for double-occurance - if (legalEntries.contains(element)) { - illegalEntries.put(element, Integer.valueOf(AbstractBlacklist.ERR_DOUBLE_OCCURANCE)); - continue; - } - legalEntries.add(element); + for (String element : list) { + element = element.trim(); + + // check for double-occurance + if (legalEntries.contains(element)) { + illegalEntries.put(element, Integer.valueOf(Blacklist.ERR_DOUBLE_OCCURANCE)); + continue; + } + legalEntries.add(element); - err = blEngine.checkError(element, properties); + err = blEngine.checkError(element, properties); - if (err > 0) { - illegalEntries.put(element, err); - } + if (err > 0) { + illegalEntries.put(element, err); } } @@ -309,14 +305,14 @@ public class BlacklistCleaner_p { final String host = (s.indexOf("/") == -1) ? s : s.substring(0, s.indexOf("/")); final String path = (s.indexOf("/") == -1) ? ".*" : s.substring(s.indexOf("/") + 1); try { - Switchboard.urlBlacklist.remove(supportedBlacklistTypes[blTypes], - host,path); + Switchboard.urlBlacklist.remove(supportedBlacklistTypes[blTypes], host, path); } catch (final RuntimeException e) { //System.err.println(e.getMessage() + ": " + host + "/" + path); Log.logSevere("BLACKLIST-CLEANER", e.getMessage() + ": " + host + "/" + path); } } - } + } + SearchEventCache.cleanupEvents(true); } if (listChanged){ listManager.writeList(new File(listManager.listsPath, blacklistToUse), list.toArray(new String[list.size()])); @@ -360,6 +356,7 @@ public class BlacklistCleaner_p { path); } } + SearchEventCache.cleanupEvents(true); } pw.close(); } catch (final IOException e) { diff --git a/htroot/BlacklistTest_p.java b/htroot/BlacklistTest_p.java index e4fcf0fdd..54414bea3 100644 --- a/htroot/BlacklistTest_p.java +++ b/htroot/BlacklistTest_p.java @@ -33,8 +33,8 @@ import java.io.File; import java.net.MalformedURLException; import net.yacy.kelondro.data.meta.DigestURI; +import net.yacy.repository.Blacklist; -import de.anomic.data.Blacklist; import de.anomic.data.listManager; import de.anomic.http.server.RequestHeader; import de.anomic.search.Switchboard; diff --git a/htroot/Blacklist_p.java b/htroot/Blacklist_p.java index 015906e5f..7a9911648 100644 --- a/htroot/Blacklist_p.java +++ b/htroot/Blacklist_p.java @@ -41,11 +41,11 @@ import java.util.List; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.logging.Log; +import net.yacy.repository.Blacklist; -import de.anomic.data.AbstractBlacklist; -import de.anomic.data.Blacklist; import de.anomic.data.listManager; import de.anomic.http.server.RequestHeader; +import de.anomic.search.SearchEventCache; import de.anomic.search.Switchboard; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; @@ -66,7 +66,7 @@ public class Blacklist_p { listManager.listsPath = new File(listManager.switchboard.getRootPath(),listManager.switchboard.getConfig("listManager.listsPath", "DATA/LISTS")); // get the list of supported blacklist types - final String supportedBlacklistTypesStr = 
AbstractBlacklist.BLACKLIST_TYPES_STRING; + final String supportedBlacklistTypesStr = Blacklist.BLACKLIST_TYPES_STRING; final String[] supportedBlacklistTypes = supportedBlacklistTypesStr.split(","); // load all blacklist files located in the directory @@ -560,6 +560,7 @@ public class Blacklist_p { Switchboard.urlBlacklist.add(supportedBlacklistTypes[blTypes], newEntry.substring(0, pos), newEntry.substring(pos + 1)); } } + SearchEventCache.cleanupEvents(true); } return null; @@ -610,6 +611,7 @@ public class Blacklist_p { Switchboard.urlBlacklist.remove(supportedBlacklistTypes[blTypes],oldEntry.substring(0, pos), oldEntry.substring(pos + 1)); } } + SearchEventCache.cleanupEvents(true); return null; } diff --git a/htroot/CrawlResults.java b/htroot/CrawlResults.java index 0ddfad8a5..1f1cdaa43 100644 --- a/htroot/CrawlResults.java +++ b/htroot/CrawlResults.java @@ -34,6 +34,7 @@ import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.logging.Log; +import de.anomic.crawler.retrieval.EventOrigin; import de.anomic.http.server.RequestHeader; import de.anomic.search.Segments; import de.anomic.search.Switchboard; @@ -63,20 +64,24 @@ public class CrawlResults { } // find process number - int tabletype; + EventOrigin tabletype; try { - tabletype = Integer.parseInt(post.get("process", "0")); + tabletype = EventOrigin.getEvent(Integer.parseInt(post.get("process", "0"))); } catch (final NumberFormatException e) { - tabletype = 0; + tabletype = EventOrigin.UNKNOWN; } - if ((post != null) && (post.containsKey("autoforward")) && (tabletype == 5) && (sb.crawlResults.getStackSize(5) == 0)) { + if ( + post != null && + post.containsKey("autoforward") && + tabletype == EventOrigin.LOCAL_CRAWLING && + sb.crawlResults.getStackSize(EventOrigin.LOCAL_CRAWLING) == 0) { // the main menu does a request to the local crawler page, but in case this table is empty, the overview page is shown - tabletype = 0; + tabletype = EventOrigin.UNKNOWN; } // check if authorization is needed and/or given - if (((tabletype > 0) && (tabletype < 6)) || + if (tabletype != EventOrigin.UNKNOWN || (post != null && (post.containsKey("clearlist") || post.containsKey("deleteentry")))) { final String authorization = (header.get(RequestHeader.AUTHORIZATION, "xxxxxx")); @@ -143,7 +148,7 @@ public class CrawlResults { } // end != null // create table - if (tabletype == 0) { + if (tabletype == EventOrigin.UNKNOWN) { prop.put("table", "2"); } else if (sb.crawlResults.getStackSize(tabletype) == 0 && sb.crawlResults.getDomainListSize(tabletype) == 0) { prop.put("table", "0"); @@ -159,7 +164,7 @@ public class CrawlResults { prop.put("table_size_all", sb.crawlResults.getStackSize(tabletype)); prop.putHTML("table_feedbackpage", "CrawlResults.html"); - prop.put("table_tabletype", tabletype); + prop.put("table_tabletype", tabletype.getCode()); prop.put("table_showInit", (showInit) ? "1" : "0"); prop.put("table_showExec", (showExec) ? "1" : "0"); prop.put("table_showDate", (showDate) ? "1" : "0"); @@ -196,7 +201,7 @@ public class CrawlResults { prop.put("table_indexed_" + cnt + "_dark", (dark) ? 
"1" : "0"); prop.put("table_indexed_" + cnt + "_feedbackpage", "CrawlResults.html"); - prop.put("table_indexed_" + cnt + "_tabletype", tabletype); + prop.put("table_indexed_" + cnt + "_tabletype", tabletype.getCode()); prop.put("table_indexed_" + cnt + "_urlhash", urlHash); if (showInit) { @@ -266,7 +271,7 @@ public class CrawlResults { if (domain == null) break; prop.put("table_domains_" + cnt + "_dark", (dark) ? "1" : "0"); prop.put("table_domains_" + cnt + "_feedbackpage", "CrawlResults.html"); - prop.put("table_domains_" + cnt + "_tabletype", tabletype); + prop.put("table_domains_" + cnt + "_tabletype", tabletype.getCode()); prop.put("table_domains_" + cnt + "_domain", domain); prop.put("table_domains_" + cnt + "_hashpart", DigestURI.hosthash6(domain)); prop.put("table_domains_" + cnt + "_count", sb.crawlResults.domainCount(tabletype, domain)); @@ -275,7 +280,7 @@ public class CrawlResults { } prop.put("table_domains", cnt); } - prop.put("process", tabletype); + prop.put("process", tabletype.getCode()); // return rewrite properties return prop; } diff --git a/htroot/IndexControlRWIs_p.java b/htroot/IndexControlRWIs_p.java index 3a5c7f4d8..22ad4c7ac 100644 --- a/htroot/IndexControlRWIs_p.java +++ b/htroot/IndexControlRWIs_p.java @@ -47,9 +47,8 @@ import net.yacy.kelondro.rwi.Reference; import net.yacy.kelondro.rwi.ReferenceContainer; import net.yacy.kelondro.rwi.ReferenceContainerCache; import net.yacy.kelondro.util.DateFormatter; +import net.yacy.repository.Blacklist; -import de.anomic.data.AbstractBlacklist; -import de.anomic.data.Blacklist; import de.anomic.data.listManager; import de.anomic.http.server.RequestHeader; import de.anomic.search.QueryParams; @@ -320,6 +319,7 @@ public class IndexControlRWIs_p { url.getFile()); } } + SearchEventCache.cleanupEvents(true); } } pw.close(); @@ -330,7 +330,7 @@ public class IndexControlRWIs_p { if (post.containsKey("blacklistdomains")) { PrintWriter pw; try { - final String[] supportedBlacklistTypes = AbstractBlacklist.BLACKLIST_TYPES_STRING.split(","); + final String[] supportedBlacklistTypes = Blacklist.BLACKLIST_TYPES_STRING.split(","); pw = new PrintWriter(new FileWriter(new File(listManager.listsPath, blacklist), true)); DigestURI url; for (i = 0; i externResultStack; // 1 - remote index: retrieved by other peer - private final LinkedList searchResultStack; // 2 - partly remote/local index: result of search queries - private final LinkedList transfResultStack; // 3 - partly remote/local index: result of index transfer - private final LinkedList proxyResultStack; // 4 - local index: result of proxy fetch/prefetch - private final LinkedList lcrawlResultStack; // 5 - local index: result of local crawling - private final LinkedList gcrawlResultStack; // 6 - local index: triggered external - - private final ScoreCluster externResultDomains; - private final ScoreCluster searchResultDomains; - private final ScoreCluster transfResultDomains; - private final ScoreCluster proxyResultDomains; - private final ScoreCluster lcrawlResultDomains; - private final ScoreCluster gcrawlResultDomains; + private final Map> resultStacks; + private final Map> resultDomains; public ResultURLs() { // init result stacks - externResultStack = new LinkedList(); - searchResultStack = new LinkedList(); - transfResultStack = new LinkedList(); - proxyResultStack = new LinkedList(); - lcrawlResultStack = new LinkedList(); - gcrawlResultStack = new LinkedList(); - // init result domain statistics - externResultDomains = new ScoreCluster(); - searchResultDomains = new 
ScoreCluster(); - transfResultDomains = new ScoreCluster(); - proxyResultDomains = new ScoreCluster(); - lcrawlResultDomains = new ScoreCluster(); - gcrawlResultDomains = new ScoreCluster(); + resultStacks = new HashMap>(); + resultDomains = new HashMap>(); + for (EventOrigin origin: EventOrigin.values()) { + resultStacks.put(origin, new LinkedList()); + resultDomains.put(origin, new ScoreCluster()); + } } - public synchronized void stack(final URIMetadataRow e, final String initiatorHash, final String executorHash, final int stackType) { + public synchronized void stack(final URIMetadataRow e, final String initiatorHash, final String executorHash, final EventOrigin stackType) { assert initiatorHash != null; assert executorHash != null; if (e == null) { return; } @@ -108,27 +93,27 @@ public final class ResultURLs { } } - public synchronized int getStackSize(final int stack) { + public synchronized int getStackSize(final EventOrigin stack) { final List resultStack = getStack(stack); if (resultStack == null) return 0; return resultStack.size(); } - public synchronized int getDomainListSize(final int stack) { + public synchronized int getDomainListSize(final EventOrigin stack) { final ScoreCluster domains = getDomains(stack); if (domains == null) return 0; return domains.size(); } - public synchronized String getUrlHash(final int stack, final int pos) { + public synchronized String getUrlHash(final EventOrigin stack, final int pos) { return getHashNo(stack, pos, 0); } - public synchronized String getInitiatorHash(final int stack, final int pos) { + public synchronized String getInitiatorHash(final EventOrigin stack, final int pos) { return getHashNo(stack, pos, 1); } - public synchronized String getExecutorHash(final int stack, final int pos) { + public synchronized String getExecutorHash(final EventOrigin stack, final int pos) { return getHashNo(stack, pos, 2); } @@ -150,7 +135,7 @@ public final class ResultURLs { * @param index starting at 0 * @return */ - public synchronized String getHashNo(final int stack, final int pos, final int index) { + public synchronized String getHashNo(final EventOrigin stack, final int pos, final int index) { final String result = getResultStackAt(stack, pos); if(result != null) { if(result.length() < Word.commonHashLength * 3) { @@ -175,7 +160,7 @@ public final class ResultURLs { * @param pos * @return null if either stack or element do not exist */ - private String getResultStackAt(final int stack, final int pos) { + private String getResultStackAt(final EventOrigin stack, final int pos) { assert pos >= 0 : "precondition violated: " + pos + " >= 0"; final List resultStack = getStack(stack); @@ -196,12 +181,12 @@ public final class ResultURLs { * iterate all domains in the result domain statistic * @return iterator of domains in reverse order (downwards) */ - public Iterator domains(final int stack) { + public Iterator domains(final EventOrigin stack) { assert getDomains(stack) != null : "getDomains(" + stack + ") = null"; return getDomains(stack).scores(false); } - public int deleteDomain(final int stack, String host, String hosthash) { + public int deleteDomain(final EventOrigin stack, String host, String hosthash) { assert hosthash.length() == 6; int i = 0; while (i < getStackSize(stack)) { @@ -218,41 +203,23 @@ public final class ResultURLs { * @param domain name * @return the number of occurrences of the domain in the stack statistics */ - public int domainCount(final int stack, String domain) { + public int domainCount(final EventOrigin stack, String 
domain) { assert domain != null : "domain = null"; assert getDomains(stack) != null : "getDomains(" + stack + ") = null"; return getDomains(stack).getScore(domain); } /** - * returns the stack indentified by the id stack + * returns the stack identified by the id stack * * @param stack id of resultStack * @return null if stack does not exist (id is unknown or stack is null (which should not occur and an error is logged)) */ - private List getStack(final int stack) { - switch (stack) { - case 1: return externResultStack; - case 2: return searchResultStack; - case 3: return transfResultStack; - case 4: return proxyResultStack; - case 5: return lcrawlResultStack; - case 6: return gcrawlResultStack; - default: - return null; - } + private List getStack(final EventOrigin stack) { + return resultStacks.get(stack); } - private ScoreCluster getDomains(final int stack) { - switch (stack) { - case 1: return externResultDomains; - case 2: return searchResultDomains; - case 3: return transfResultDomains; - case 4: return proxyResultDomains; - case 5: return lcrawlResultDomains; - case 6: return gcrawlResultDomains; - default: - return null; - } + private ScoreCluster getDomains(final EventOrigin stack) { + return resultDomains.get(stack); } /** @@ -261,11 +228,11 @@ public final class ResultURLs { * @param stack * @return */ - private boolean isValidStack(final int stack) { + private boolean isValidStack(final EventOrigin stack) { return getStack(stack) != null; } - public synchronized boolean removeStack(final int stack, final int pos) { + public synchronized boolean removeStack(final EventOrigin stack, final int pos) { final List resultStack = getStack(stack); if (resultStack == null) { return false; @@ -273,7 +240,7 @@ public final class ResultURLs { return resultStack.remove(pos) != null; } - public synchronized void clearStack(final int stack) { + public synchronized void clearStack(final EventOrigin stack) { final List resultStack = getStack(stack); if (resultStack != null) resultStack.clear(); final ScoreCluster resultDomains = getDomains(stack); @@ -287,11 +254,11 @@ public final class ResultURLs { public synchronized boolean remove(final String urlHash) { if (urlHash == null) return false; String hash; - for (int stack = 1; stack <= 6; stack++) { - for (int i = getStackSize(stack) - 1; i >= 0; i--) { - hash = getUrlHash(stack, i); + for (EventOrigin origin: EventOrigin.values()) { + for (int i = getStackSize(origin) - 1; i >= 0; i--) { + hash = getUrlHash(origin, i); if (hash != null && hash.equals(urlHash)) { - removeStack(stack, i); + removeStack(origin, i); return true; } } @@ -308,7 +275,7 @@ public final class ResultURLs { try { final DigestURI url = new DigestURI("http", "www.yacy.net", 80, "/"); final URIMetadataRow urlRef = new URIMetadataRow(url, "YaCy Homepage", "", "", "", new Date(), new Date(), new Date(), "", new byte[] {}, 123, 42, '?', new Bitfield(), "de", 0, 0, 0, 0, 0, 0); - int stackNo = 1; + EventOrigin stackNo = EventOrigin.LOCAL_CRAWLING; System.out.println("valid test:\n======="); // add results.stack(urlRef, urlRef.hash(), url.hash(), stackNo); @@ -324,29 +291,6 @@ public final class ResultURLs { System.out.println("url hash:\t"+ results.getUrlHash(stackNo, 1)); System.out.println("executor hash:\t"+ results.getExecutorHash(stackNo, 1)); System.out.println("initiator hash:\t"+ results.getInitiatorHash(stackNo, 1)); - stackNo = 42; - System.out.println("size of stack:\t"+ results.getStackSize(stackNo)); - // get - System.out.println("url hash:\t"+ 
results.getUrlHash(stackNo, 0)); - System.out.println("executor hash:\t"+ results.getExecutorHash(stackNo, 0)); - System.out.println("initiator hash:\t"+ results.getInitiatorHash(stackNo, 0)); - - // benchmark - final long start = System.currentTimeMillis(); - for(int i = 0; i < 1000000; i++) { - stackNo = i % 6; - // add - results.stack(urlRef, urlRef.hash(), url.hash(), stackNo); - // size - results.getStackSize(stackNo); - // get - for(int j = 0; j < 10; j++) { - results.getUrlHash(stackNo, i / 6); - results.getExecutorHash(stackNo, i / 6); - results.getInitiatorHash(stackNo, i / 6); - } - } - System.out.println("benschmark: "+ (System.currentTimeMillis() - start) + " ms"); } catch (final MalformedURLException e) { e.printStackTrace(); } diff --git a/source/de/anomic/crawler/retrieval/EventOrigin.java b/source/de/anomic/crawler/retrieval/EventOrigin.java new file mode 100644 index 000000000..29fea6250 --- /dev/null +++ b/source/de/anomic/crawler/retrieval/EventOrigin.java @@ -0,0 +1,34 @@ +package de.anomic.crawler.retrieval; + + +public enum EventOrigin { + + // we must distinguish the following cases: resource-load was initiated by + // 1) global crawling: the index is extern, not here (not possible here) + // 2) result of search queries, some indexes are here (not possible here) + // 3) result of index transfer, some of them are here (not possible here) + // 4) proxy-load (initiator is "------------") + // 5) local prefetch/crawling (initiator is own seedHash) + // 6) local fetching for global crawling (other known or unknown initiator) + + UNKNOWN(0), + REMOTE_RECEIPTS(1), + QUERIES(2), + DHT_TRANSFER(3), + PROXY_LOAD(4), + LOCAL_CRAWLING(5), + GLOBAL_CRAWLING(6); + + protected int code; + private static final EventOrigin[] list = { + UNKNOWN, REMOTE_RECEIPTS, QUERIES, DHT_TRANSFER, PROXY_LOAD, LOCAL_CRAWLING, GLOBAL_CRAWLING}; + private EventOrigin(int code) { + this.code = code; + } + public int getCode() { + return this.code; + } + public static final EventOrigin getEvent(int key) { + return list[key]; + } +} diff --git a/source/de/anomic/crawler/retrieval/HTTPLoader.java b/source/de/anomic/crawler/retrieval/HTTPLoader.java index 14962963f..811879035 100644 --- a/source/de/anomic/crawler/retrieval/HTTPLoader.java +++ b/source/de/anomic/crawler/retrieval/HTTPLoader.java @@ -31,9 +31,9 @@ import java.util.Date; import net.yacy.document.TextParser; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.logging.Log; +import net.yacy.repository.Blacklist; import de.anomic.crawler.Latency; -import de.anomic.data.Blacklist; import de.anomic.http.client.Client; import de.anomic.http.server.HeaderFramework; import de.anomic.http.server.RequestHeader; diff --git a/source/de/anomic/crawler/retrieval/Response.java b/source/de/anomic/crawler/retrieval/Response.java index 9e7ed9ef2..e5162aa8a 100755 --- a/source/de/anomic/crawler/retrieval/Response.java +++ b/source/de/anomic/crawler/retrieval/Response.java @@ -37,7 +37,6 @@ import de.anomic.crawler.CrawlProfile; import de.anomic.http.server.HeaderFramework; import de.anomic.http.server.RequestHeader; import de.anomic.http.server.ResponseHeader; -import de.anomic.search.SwitchboardConstants; public class Response { @@ -770,7 +769,7 @@ public class Response { (requestHeader.get(HeaderFramework.X_YACY_INDEX_CONTROL)).toUpperCase().equals("NO-INDEX"); } - public int processCase(String mySeedHash) { + public EventOrigin processCase(String mySeedHash) { // we must distinguish the following cases: resource-load was initiated by // 1) 
global crawling: the index is extern, not here (not possible here) // 2) result of search queries, some indexes are here (not possible here) @@ -778,17 +777,17 @@ public class Response { // 4) proxy-load (initiator is "------------") // 5) local prefetch/crawling (initiator is own seedHash) // 6) local fetching for global crawling (other known or unknwon initiator) - int processCase = SwitchboardConstants.PROCESSCASE_0_UNKNOWN; + EventOrigin processCase = EventOrigin.UNKNOWN; // FIXME the equals seems to be incorrect: String.equals(boolean) if ((initiator() == null) || initiator().length() == 0 || initiator().equals("------------")) { // proxy-load - processCase = SwitchboardConstants.PROCESSCASE_4_PROXY_LOAD; + processCase = EventOrigin.PROXY_LOAD; } else if (initiator().equals(mySeedHash)) { // normal crawling - processCase = SwitchboardConstants.PROCESSCASE_5_LOCAL_CRAWLING; + processCase = EventOrigin.LOCAL_CRAWLING; } else { // this was done for remote peer (a global crawl) - processCase = SwitchboardConstants.PROCESSCASE_6_GLOBAL_CRAWLING; + processCase = EventOrigin.GLOBAL_CRAWLING; } return processCase; } diff --git a/source/de/anomic/data/Blacklist.java b/source/de/anomic/data/Blacklist.java deleted file mode 100644 index 87b67460c..000000000 --- a/source/de/anomic/data/Blacklist.java +++ /dev/null @@ -1,102 +0,0 @@ -// Blacklist.java -// (C) 2008 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany -// first published 26.03.2008 on http://yacy.net -// -// This is a part of YaCy, a peer-to-peer based web search engine -// -// $LastChangedDate$ -// $LastChangedRevision$ -// $LastChangedBy$ -// -// LICENSE -// -// This program is free software; you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation; either version 2 of the License, or -// (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - -package de.anomic.data; - -import java.io.File; -import java.util.Arrays; -import java.util.HashSet; -import java.util.Map; - -import net.yacy.kelondro.data.meta.DigestURI; - - -public interface Blacklist { - - public static final String BLACKLIST_DHT = "dht"; - public static final String BLACKLIST_CRAWLER = "crawler"; - public static final String BLACKLIST_PROXY = "proxy"; - public static final String BLACKLIST_SEARCH = "search"; - public static final String BLACKLIST_SURFTIPS = "surftips"; - public static final String BLACKLIST_NEWS = "news"; - - public static final class blacklistFile { - - private final String filename; - private final String type; - - public blacklistFile(final String filename, final String type) { - this.filename = filename; - this.type = type; - } - - public String getFileName() { return this.filename; } - - - /** - * Construct a unified array of file names from comma seperated file name - * list. 
- * - * @return unified String array of file names - */ - public String[] getFileNamesUnified() { - final HashSet hs = new HashSet(Arrays.asList(this.filename.split(","))); - - return hs.toArray(new String[hs.size()]); - } - - public String getType() { return this.type; } - } - - public String getEngineInfo(); - - public void setRootPath(File rootPath); - - public int blacklistCacheSize(); - - public int size(); - - public void clear(); - public void removeAll(String blacklistType, String host); - public void remove(String blacklistType, String host, String path); - public void add(String blacklistType, String host, String path); - - - public void loadList(String blacklistType, String filenames, String sep); - public void loadList(blacklistFile[] blFiles, String sep); - - - public boolean contains(String blacklistType, String host, String path); - - public boolean hashInBlacklistedCache(String blacklistType, String urlHash); - - public boolean isListed(String blacklistType, DigestURI url); - - public boolean isListed(String blacklistType, String hostlow, String path); - - public int checkError(String entry, Map properties); - -} diff --git a/source/de/anomic/data/DefaultBlacklist.java b/source/de/anomic/data/DefaultBlacklist.java deleted file mode 100644 index c5b88845e..000000000 --- a/source/de/anomic/data/DefaultBlacklist.java +++ /dev/null @@ -1,194 +0,0 @@ -// indexDefaultReference.java -// (C) 2005 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany -// first published 11.07.2005 on http://yacy.net -// -// This is a part of YaCy, a peer-to-peer based web search engine -// -// $LastChangedDate$ -// $LastChangedRevision$ -// $LastChangedBy$ -// -// LICENSE -// -// This program is free software; you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation; either version 2 of the License, or -// (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. 
-// -// You should have received a copy of the GNU General Public License -// along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - -package de.anomic.data; - -import java.io.File; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.Map; -import java.util.Map.Entry; -import java.util.regex.Pattern; -import java.util.regex.PatternSyntaxException; - -public class DefaultBlacklist extends AbstractBlacklist implements Blacklist { - - public DefaultBlacklist(final File rootPath) { - super(rootPath); - } - - public String getEngineInfo() { - return "Default YaCy Blacklist Engine"; - } - - public boolean isListed(final String blacklistType, final String hostlow, String path) { - if (hostlow == null) throw new NullPointerException(); - if (path == null) throw new NullPointerException(); - - // getting the proper blacklist - final HashMap> blacklistMapMatched = super.getBlacklistMap(blacklistType,true); - - if (path.length() > 0 && path.charAt(0) == '/') path = path.substring(1); - ArrayList app; - boolean matched = false; - String pp = ""; // path-pattern - - // try to match complete domain - if (!matched && (app = blacklistMapMatched.get(hostlow)) != null) { - for (int i=app.size()-1; !matched && i>-1; i--) { - pp = app.get(i); - matched |= ((pp.equals("*")) || (path.matches(pp))); - } - } - // first try to match the domain with wildcard '*' - // [TL] While "." are found within the string - int index = 0; - while (!matched && (index = hostlow.indexOf('.', index + 1)) != -1) { - if ((app = blacklistMapMatched.get(hostlow.substring(0, index + 1) + "*")) != null) { - for (int i=app.size()-1; !matched && i>-1; i--) { - pp = app.get(i); - matched |= ((pp.equals("*")) || (path.matches(pp))); - } - } - if ((app = blacklistMapMatched.get(hostlow.substring(0, index))) != null) { - for (int i=app.size()-1; !matched && i>-1; i--) { - pp = app.get(i); - matched |= ((pp.equals("*")) || (path.matches(pp))); - } - } - } - index = hostlow.length(); - while (!matched && (index = hostlow.lastIndexOf('.', index - 1)) != -1) { - if ((app = blacklistMapMatched.get("*" + hostlow.substring(index, hostlow.length()))) != null) { - for (int i=app.size()-1; !matched && i>-1; i--) { - pp = app.get(i); - matched |= ((pp.equals("*")) || (path.matches(pp))); - } - } - if ((app = blacklistMapMatched.get(hostlow.substring(index +1, hostlow.length()))) != null) { - for (int i=app.size()-1; !matched && i>-1; i--) { - pp = app.get(i); - matched |= ((pp.equals("*")) || (path.matches(pp))); - } - } - } - - - // loop over all Regexentrys - if(!matched) { - final HashMap> blacklistMapNotMatched = super.getBlacklistMap(blacklistType,false); - String key; - for(final Entry> entry: blacklistMapNotMatched.entrySet()) { - key = entry.getKey(); - try { - if(Pattern.matches(key, hostlow)) { - app = entry.getValue(); - for (int i=0; i properties) { - - boolean allowRegex = true; - int slashPos; - String host, path; - - if (properties != null) { - allowRegex = properties.get("allowRegex").equalsIgnoreCase("true") ? 
true : false; - } - - if ((slashPos = element.indexOf("/")) == -1) { - host = element; - path = ".*"; - } else { - host = element.substring(0, slashPos); - path = element.substring(slashPos + 1); - } - - if (!allowRegex || !isValidRegex(host)) { - final int i = host.indexOf("*"); - - // check whether host begins illegally - if (!host.matches("([A-Za-z0-9_-]+|\\*)(\\.([A-Za-z0-9_-]+|\\*))*")) { - if (i == 0 && host.length() > 1 && host.charAt(1) != '.') { - return ERR_SUBDOMAIN_XOR_WILDCARD; - } - return ERR_HOST_WRONG_CHARS; - } - - // in host-part only full sub-domains may be wildcards - if (host.length() > 0 && i > -1) { - if (!(i == 0 || i == host.length() - 1)) { - return ERR_WILDCARD_BEGIN_OR_END; - } - - if (i == host.length() - 1 && host.length() > 1 && host.charAt(i - 1) != '.') { - return ERR_SUBDOMAIN_XOR_WILDCARD; - } - } - - // check for double-occurences of "*" in host - if (host.indexOf("*", i + 1) > -1) { - return ERR_TWO_WILDCARDS_IN_HOST; - } - } else if (allowRegex && !isValidRegex(host)) { - return ERR_HOST_REGEX; - } - - // check for errors on regex-compiling path - if (!isValidRegex(path) && !path.equals("*")) { - return ERR_PATH_REGEX; - } - - return 0; - } - - /** - * Checks if a given expression is a valid regular expression. - * @param expression The expression to be checked. - * @return True if the expression is a valid regular expression, else false. - */ - private static boolean isValidRegex(String expression) { - boolean ret = true; - try { - Pattern.compile(expression); - } catch (final PatternSyntaxException e) { - - ret = false; - } - return ret; - } - -} diff --git a/source/de/anomic/data/listManager.java b/source/de/anomic/data/listManager.java index 395eca137..2c2ff5947 100644 --- a/source/de/anomic/data/listManager.java +++ b/source/de/anomic/data/listManager.java @@ -42,7 +42,10 @@ import java.util.List; import java.util.Set; import java.util.Vector; -import de.anomic.data.Blacklist.blacklistFile; +import net.yacy.repository.Blacklist; +import net.yacy.repository.BlacklistFile; + +import de.anomic.search.SearchEventCache; import de.anomic.search.Switchboard; import de.anomic.server.serverCore; @@ -391,12 +394,12 @@ public class listManager { * Load or reload all active Blacklists */ public static void reloadBlacklists(){ - final String supportedBlacklistTypesStr = AbstractBlacklist.BLACKLIST_TYPES_STRING; + final String supportedBlacklistTypesStr = Blacklist.BLACKLIST_TYPES_STRING; final String[] supportedBlacklistTypes = supportedBlacklistTypesStr.split(","); - final ArrayList blacklistFiles = new ArrayList(supportedBlacklistTypes.length); + final ArrayList blacklistFiles = new ArrayList(supportedBlacklistTypes.length); for (int i=0; i < supportedBlacklistTypes.length; i++) { - final blacklistFile blFile = new blacklistFile( + final BlacklistFile blFile = new BlacklistFile( switchboard.getConfig( supportedBlacklistTypes[i] + ".BlackLists", switchboard.getConfig("BlackLists.DefaultList", "url.default.black")), supportedBlacklistTypes[i]); @@ -405,8 +408,9 @@ public class listManager { Switchboard.urlBlacklist.clear(); Switchboard.urlBlacklist.loadList( - blacklistFiles.toArray(new blacklistFile[blacklistFiles.size()]), + blacklistFiles.toArray(new BlacklistFile[blacklistFiles.size()]), "/"); + SearchEventCache.cleanupEvents(true); // switchboard.urlBlacklist.clear(); // if (f != "") switchboard.urlBlacklist.loadLists("black", f, "/"); diff --git a/source/de/anomic/http/server/HTTPDProxyHandler.java b/source/de/anomic/http/server/HTTPDProxyHandler.java 
index e29e8a01e..41afa0d7c 100644 --- a/source/de/anomic/http/server/HTTPDProxyHandler.java +++ b/source/de/anomic/http/server/HTTPDProxyHandler.java @@ -80,11 +80,11 @@ import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.util.DateFormatter; import net.yacy.kelondro.util.Domains; import net.yacy.kelondro.util.FileUtils; +import net.yacy.repository.Blacklist; import de.anomic.crawler.retrieval.HTTPLoader; import de.anomic.crawler.retrieval.Request; import de.anomic.crawler.retrieval.Response; -import de.anomic.data.Blacklist; import de.anomic.http.client.MultiOutputStream; import de.anomic.http.client.Client; import de.anomic.http.client.RemoteProxyConfig; diff --git a/source/de/anomic/http/server/servlets/transferURL.java b/source/de/anomic/http/server/servlets/transferURL.java index 353730dea..3e8c3ca84 100644 --- a/source/de/anomic/http/server/servlets/transferURL.java +++ b/source/de/anomic/http/server/servlets/transferURL.java @@ -9,8 +9,9 @@ import net.yacy.document.content.RSSMessage; import net.yacy.document.parser.xml.RSSFeed; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.util.DateFormatter; +import net.yacy.repository.Blacklist; -import de.anomic.data.Blacklist; +import de.anomic.crawler.retrieval.EventOrigin; import de.anomic.http.server.RequestHeader; import de.anomic.search.Segments; import de.anomic.search.Switchboard; @@ -141,7 +142,7 @@ public final class transferURL { yacyCore.log.logInfo("Accepting URL " + i + "/" + urlc + " from peer " + otherPeerName + ": " + lEntry.metadata().url().toNormalform(true, false)); try { sb.indexSegments.urlMetadata(Segments.Process.DHTIN).store(lEntry); - sb.crawlResults.stack(lEntry, iam, iam, 3); + sb.crawlResults.stack(lEntry, iam, iam, EventOrigin.DHT_TRANSFER); if (yacyCore.log.isFine()) yacyCore.log.logFine("transferURL: received URL '" + metadata.url().toNormalform(false, true) + "' from peer " + otherPeerName); received++; } catch (final IOException e) { diff --git a/source/de/anomic/search/MetadataRepository.java b/source/de/anomic/search/MetadataRepository.java index 199294894..18d537db6 100644 --- a/source/de/anomic/search/MetadataRepository.java +++ b/source/de/anomic/search/MetadataRepository.java @@ -50,8 +50,8 @@ import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.order.CloneableIterator; import net.yacy.kelondro.table.SplitTable; import net.yacy.kelondro.util.ScoreCluster; +import net.yacy.repository.Blacklist; -import de.anomic.data.Blacklist; import de.anomic.http.client.Client; import de.anomic.http.client.RemoteProxyConfig; import de.anomic.http.server.ResponseContainer; diff --git a/source/de/anomic/search/Segment.java b/source/de/anomic/search/Segment.java index 32642fbeb..a340bb0b6 100644 --- a/source/de/anomic/search/Segment.java +++ b/source/de/anomic/search/Segment.java @@ -52,9 +52,9 @@ import net.yacy.kelondro.rwi.IndexCell; import net.yacy.kelondro.rwi.ReferenceContainer; import net.yacy.kelondro.rwi.ReferenceFactory; import net.yacy.kelondro.util.ISO639; +import net.yacy.repository.Blacklist; import de.anomic.crawler.retrieval.Response; -import de.anomic.data.Blacklist; public class Segment { diff --git a/source/de/anomic/search/Switchboard.java b/source/de/anomic/search/Switchboard.java index f3ae9cbdf..d3b9762c3 100644 --- a/source/de/anomic/search/Switchboard.java +++ b/source/de/anomic/search/Switchboard.java @@ -139,6 +139,7 @@ import net.yacy.kelondro.workflow.InstantBusyThread; import net.yacy.kelondro.workflow.WorkflowJob; import 
net.yacy.kelondro.workflow.WorkflowProcessor; import net.yacy.kelondro.workflow.WorkflowThread; +import net.yacy.repository.Blacklist; import net.yacy.repository.LoaderDispatcher; import de.anomic.crawler.CrawlProfile; @@ -153,11 +154,10 @@ import de.anomic.crawler.ResultURLs; import de.anomic.crawler.RobotsTxt; import de.anomic.crawler.ZURL; import de.anomic.crawler.CrawlProfile.entry; +import de.anomic.crawler.retrieval.EventOrigin; import de.anomic.crawler.retrieval.HTTPLoader; import de.anomic.crawler.retrieval.Request; import de.anomic.crawler.retrieval.Response; -import de.anomic.data.Blacklist; -import de.anomic.data.DefaultBlacklist; import de.anomic.data.LibraryProvider; import de.anomic.data.URLLicense; import de.anomic.data.blogBoard; @@ -429,7 +429,7 @@ public final class Switchboard extends serverSwitch { // load blacklist this.log.logConfig("Loading blacklist ..."); final File blacklistsPath = getConfigPath(SwitchboardConstants.LISTS_PATH, SwitchboardConstants.LISTS_PATH_DEFAULT); - urlBlacklist = new DefaultBlacklist(blacklistsPath); + urlBlacklist = new Blacklist(blacklistsPath); listManager.switchboard = this; listManager.listsPath = blacklistsPath; listManager.reloadBlacklists(); @@ -1156,7 +1156,7 @@ public final class Switchboard extends serverSwitch { // check if the document should be indexed based on proxy/crawler rules String noIndexReason = "unspecified indexing error"; - if (response.processCase(peers.mySeed().hash) == SwitchboardConstants.PROCESSCASE_4_PROXY_LOAD) { + if (response.processCase(peers.mySeed().hash) == EventOrigin.PROXY_LOAD) { // proxy-load noIndexReason = response.shallIndexCacheForProxy(); } else { @@ -1329,8 +1329,8 @@ public final class Switchboard extends serverSwitch { int c = 0; if ((crawlQueues.delegatedURL.stackSize() > 1000)) c++; if ((crawlQueues.errorURL.stackSize() > 1000)) c++; - for (int i = 1; i <= 6; i++) { - if (crawlResults.getStackSize(i) > 1000) c++; + for (EventOrigin origin: EventOrigin.values()) { + if (crawlResults.getStackSize(origin) > 1000) c++; } return c; } @@ -1410,11 +1410,11 @@ public final class Switchboard extends serverSwitch { } // clean up loadedURL stack - for (int i = 1; i <= 6; i++) { + for (EventOrigin origin: EventOrigin.values()) { checkInterruption(); - if (crawlResults.getStackSize(i) > 1000) { - if (this.log.isFine()) log.logFine("Cleaning Loaded-URLs report stack, " + crawlResults.getStackSize(i) + " entries on stack " + i); - crawlResults.clearStack(i); + if (crawlResults.getStackSize(origin) > 1000) { + if (this.log.isFine()) log.logFine("Cleaning Loaded-URLs report stack, " + crawlResults.getStackSize(origin) + " entries on stack " + origin.getCode()); + crawlResults.clearStack(origin); hasDoneSomething = true; } } @@ -1585,7 +1585,7 @@ public final class Switchboard extends serverSwitch { private Document parseDocument(Response entry) throws InterruptedException { Document document = null; - final int processCase = entry.processCase(peers.mySeed().hash); + final EventOrigin processCase = entry.processCase(peers.mySeed().hash); if (this.log.isFine()) log.logFine("processResourceStack processCase=" + processCase + ", depth=" + entry.depth() + @@ -1635,7 +1635,7 @@ public final class Switchboard extends serverSwitch { // put anchors on crawl stack final long stackStartTime = System.currentTimeMillis(); if ( - ((processCase == SwitchboardConstants.PROCESSCASE_4_PROXY_LOAD) || (processCase == SwitchboardConstants.PROCESSCASE_5_LOCAL_CRAWLING)) && + ((processCase == EventOrigin.PROXY_LOAD) || 
(processCase == EventOrigin.LOCAL_CRAWLING)) && ((entry.profile() == null) || (entry.depth() < entry.profile().depth())) ) { final Map hl = document.getHyperlinks(); @@ -1715,7 +1715,7 @@ public final class Switchboard extends serverSwitch { // CREATE INDEX final String dc_title = document.dc_title(); final DigestURI referrerURL = queueEntry.referrerURL(); - final int processCase = queueEntry.processCase(peers.mySeed().hash); + final EventOrigin processCase = queueEntry.processCase(peers.mySeed().hash); // remove stopwords log.logInfo("Excluded " + condenser.excludeWords(stopwords) + " words in URL " + queueEntry.url()); @@ -1765,7 +1765,7 @@ public final class Switchboard extends serverSwitch { MemoryTracker.update("indexed", queueEntry.url().toNormalform(true, false), false); // if this was performed for a remote crawl request, notify requester - if ((processCase == SwitchboardConstants.PROCESSCASE_6_GLOBAL_CRAWLING) && (queueEntry.initiator() != null)) { + if ((processCase == EventOrigin.GLOBAL_CRAWLING) && (queueEntry.initiator() != null)) { final yacySeed initiatorPeer = peers.get(queueEntry.initiator()); if (initiatorPeer != null) { log.logInfo("Sending crawl receipt for '" + queueEntry.url().toNormalform(false, true) + "' to " + initiatorPeer.getName()); @@ -1841,7 +1841,7 @@ public final class Switchboard extends serverSwitch { final Long resourceContentLength = (Long) resource[1]; // parse the resource - final Document document = LoaderDispatcher.parseDocument(metadata.url(), resourceContentLength.longValue(), resourceContent); + final Document document = LoaderDispatcher.parseDocument(metadata.url(), resourceContentLength.longValue(), resourceContent, null); // get the word set Set words = null; diff --git a/source/de/anomic/search/SwitchboardConstants.java b/source/de/anomic/search/SwitchboardConstants.java index 5d84e4e1d..0b1d1045d 100644 --- a/source/de/anomic/search/SwitchboardConstants.java +++ b/source/de/anomic/search/SwitchboardConstants.java @@ -385,17 +385,6 @@ public final class SwitchboardConstants { public static final String WORK_PATH = "workPath"; public static final String WORK_PATH_DEFAULT = "DATA/WORK"; - // we must distinguish the following cases: resource-load was initiated by - // 1) global crawling: the index is extern, not here (not possible here) - // 2) result of search queries, some indexes are here (not possible here) - // 3) result of index transfer, some of them are here (not possible here) - // 4) proxy-load (initiator is "------------") - // 5) local prefetch/crawling (initiator is own seedHash) - // 6) local fetching for global crawling (other known or unknown initiator) - public static final int PROCESSCASE_0_UNKNOWN = 0; - public static final int PROCESSCASE_4_PROXY_LOAD = 4; - public static final int PROCESSCASE_5_LOCAL_CRAWLING = 5; - public static final int PROCESSCASE_6_GLOBAL_CRAWLING = 6; /* * Some constants */ diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java index 399cf2cce..fe174252a 100644 --- a/source/de/anomic/yacy/yacyClient.java +++ b/source/de/anomic/yacy/yacyClient.java @@ -68,13 +68,14 @@ import net.yacy.kelondro.rwi.ReferenceContainer; import net.yacy.kelondro.rwi.ReferenceContainerCache; import net.yacy.kelondro.util.ByteBuffer; import net.yacy.kelondro.util.FileUtils; +import net.yacy.repository.Blacklist; import org.apache.commons.httpclient.methods.multipart.ByteArrayPartSource; import org.apache.commons.httpclient.methods.multipart.Part; import de.anomic.crawler.ResultURLs; +import 
de.anomic.crawler.retrieval.EventOrigin; import de.anomic.crawler.retrieval.HTTPLoader; -import de.anomic.data.Blacklist; import de.anomic.http.client.DefaultCharsetFilePart; import de.anomic.http.client.DefaultCharsetStringPart; import de.anomic.http.client.Client; @@ -576,7 +577,7 @@ public final class yacyClient { // passed all checks, store url try { indexSegment.urlMetadata().store(urlEntry); - crawlResults.stack(urlEntry, mySeed.hash, target.hash, 2); + crawlResults.stack(urlEntry, mySeed.hash, target.hash, EventOrigin.QUERIES); } catch (final IOException e) { yacyCore.log.logSevere("could not store search result", e); continue; // db-error diff --git a/source/de/anomic/yacy/yacyNewsPool.java b/source/de/anomic/yacy/yacyNewsPool.java index 024e7dd85..8ea29dd6b 100644 --- a/source/de/anomic/yacy/yacyNewsPool.java +++ b/source/de/anomic/yacy/yacyNewsPool.java @@ -51,8 +51,8 @@ import java.util.Iterator; import java.util.Map; import net.yacy.kelondro.data.meta.DigestURI; +import net.yacy.repository.Blacklist; -import de.anomic.data.Blacklist; import de.anomic.search.Switchboard; public class yacyNewsPool { diff --git a/source/de/anomic/yacy/yacySearch.java b/source/de/anomic/yacy/yacySearch.java index fac12dcea..555cea127 100644 --- a/source/de/anomic/yacy/yacySearch.java +++ b/source/de/anomic/yacy/yacySearch.java @@ -34,9 +34,9 @@ import java.util.TreeSet; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.order.Bitfield; import net.yacy.kelondro.util.ScoreCluster; +import net.yacy.repository.Blacklist; import de.anomic.crawler.ResultURLs; -import de.anomic.data.Blacklist; import de.anomic.search.QueryParams; import de.anomic.search.RankingProfile; import de.anomic.search.RankingProcess; diff --git a/source/de/anomic/data/AbstractBlacklist.java b/source/net/yacy/repository/Blacklist.java similarity index 65% rename from source/de/anomic/data/AbstractBlacklist.java rename to source/net/yacy/repository/Blacklist.java index ffc2f99d7..105e7b998 100644 --- a/source/de/anomic/data/AbstractBlacklist.java +++ b/source/net/yacy/repository/Blacklist.java @@ -1,316 +1,462 @@ -// AbstractBlacklist.java -// first published on http://www.yacy.net -// (C) 2007 by Bjoern Krombholz -// last major change: 12. August 2006 (theli) ? -// -// This is a part of YaCy, a peer-to-peer based web search engine -// -// $LastChangedDate$ -// $LastChangedRevision$ -// $LastChangedBy$ -// -// LICENSE -// -// This program is free software; you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation; either version 2 of the License, or -// (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. 
-// -// You should have received a copy of the GNU General Public License -// along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - -package de.anomic.data; - -import java.io.File; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Iterator; -import java.util.Map; -import java.util.Set; -import java.util.regex.Pattern; -import java.util.regex.PatternSyntaxException; - -import net.yacy.kelondro.data.meta.DigestURI; -import net.yacy.kelondro.util.SetTools; - -import de.anomic.search.SearchEventCache; - -public abstract class AbstractBlacklist implements Blacklist { - - public static final int ERR_TWO_WILDCARDS_IN_HOST = 1; - public static final int ERR_SUBDOMAIN_XOR_WILDCARD = 2; - public static final int ERR_PATH_REGEX = 3; - public static final int ERR_WILDCARD_BEGIN_OR_END = 4; - public static final int ERR_HOST_WRONG_CHARS = 5; - public static final int ERR_DOUBLE_OCCURANCE = 6; - public static final int ERR_HOST_REGEX = 7; - - protected static final HashSet BLACKLIST_TYPES = new HashSet(Arrays.asList(new String[]{ - Blacklist.BLACKLIST_CRAWLER, - Blacklist.BLACKLIST_PROXY, - Blacklist.BLACKLIST_DHT, - Blacklist.BLACKLIST_SEARCH, - Blacklist.BLACKLIST_SURFTIPS, - Blacklist.BLACKLIST_NEWS - })); - public static final String BLACKLIST_TYPES_STRING="proxy,crawler,dht,search,surftips,news"; - - protected File blacklistRootPath = null; - protected HashMap> cachedUrlHashs = null; - //protected HashMap>> hostpaths = null; // key=host, value=path; mapped url is http://host/path; path does not start with '/' here - protected HashMap>> hostpaths_matchable = null; // key=host, value=path; mapped url is http://host/path; path does not start with '/' here - protected HashMap>> hostpaths_notmatchable = null; // key=host, value=path; mapped url is http://host/path; path does not start with '/' here - - public AbstractBlacklist(final File rootPath) { - this.setRootPath(rootPath); - - this.blacklistRootPath = rootPath; - - // prepare the data structure - //this.hostpaths = new HashMap>>(); - this.hostpaths_matchable = new HashMap>>(); - this.hostpaths_notmatchable = new HashMap>>(); - this.cachedUrlHashs = new HashMap>(); - - final Iterator iter = BLACKLIST_TYPES.iterator(); - while (iter.hasNext()) { - final String blacklistType = iter.next(); - //this.hostpaths.put(blacklistType, new HashMap>()); - this.hostpaths_matchable.put(blacklistType, new HashMap>()); - this.hostpaths_notmatchable.put(blacklistType, new HashMap>()); - this.cachedUrlHashs.put(blacklistType, Collections.synchronizedSet(new HashSet())); - } - } - - public void setRootPath(final File rootPath) { - if (rootPath == null) - throw new NullPointerException("The blacklist root path must not be null."); - if (!rootPath.isDirectory()) - throw new IllegalArgumentException("The blacklist root path is not a directory."); - if (!rootPath.canRead()) - throw new IllegalArgumentException("The blacklist root path is not readable."); - - this.blacklistRootPath = rootPath; - } - - protected HashMap> getBlacklistMap(final String blacklistType,final boolean matchable) { - if (blacklistType == null) throw new IllegalArgumentException(); - if (!BLACKLIST_TYPES.contains(blacklistType)) throw new IllegalArgumentException("Unknown blacklist type: "+blacklistType+"."); - - return (matchable)? 
this.hostpaths_matchable.get(blacklistType) : this.hostpaths_notmatchable.get(blacklistType); - } - - protected Set getCacheUrlHashsSet(final String blacklistType) { - if (blacklistType == null) throw new IllegalArgumentException(); - if (!BLACKLIST_TYPES.contains(blacklistType)) throw new IllegalArgumentException("Unknown backlist type."); - - return this.cachedUrlHashs.get(blacklistType); - } - - public void clear() { - for(final HashMap> entry: this.hostpaths_matchable.values()) { - entry.clear(); - } - for(final HashMap> entry: this.hostpaths_notmatchable.values()) { - entry.clear(); - } - for(final Set entry: this.cachedUrlHashs.values()) { - entry.clear(); - } - - // clean up all search events in case that an old blacklist entry denied previously returned results, but does not anymore - SearchEventCache.cleanupEvents(true); - } - - public int size() { - int size = 0; - for(final String entry: this.hostpaths_matchable.keySet()) { - for(final ArrayList ientry: this.hostpaths_matchable.get(entry).values()) { - size += ientry.size(); - } - } - for(final String entry: this.hostpaths_notmatchable.keySet()) { - for(final ArrayList ientry: this.hostpaths_notmatchable.get(entry).values()) { - size += ientry.size(); - } - } - return size; - } - - public void loadList(final blacklistFile[] blFiles, final String sep) { - for (int j = 0; j < blFiles.length; j++) { - final blacklistFile blf = blFiles[j]; - loadList(blf.getType(), blf.getFileName(), sep); - } - } - - public void loadList(final blacklistFile blFile, final String sep) { - final HashMap> blacklistMapMatch = getBlacklistMap(blFile.getType(),true); - final HashMap> blacklistMapNotMatch = getBlacklistMap(blFile.getType(),false); - Set>> loadedBlacklist; - Map.Entry> loadedEntry; - ArrayList paths; - ArrayList loadedPaths; - - final String[] fileNames = blFile.getFileNamesUnified(); - if (fileNames.length > 0) { - for (int i = 0; i < fileNames.length; i++) { - // make sure all requested blacklist files exist - final File file = new File(this.blacklistRootPath, fileNames[i]); - try { - file.createNewFile(); - } catch (final IOException e) { /* */ } - - // join all blacklists from files into one internal blacklist map - loadedBlacklist = SetTools.loadMapMultiValsPerKey(file.toString(), sep).entrySet(); - for (final Iterator>> mi = loadedBlacklist.iterator(); mi.hasNext(); ) { - loadedEntry = mi.next(); - loadedPaths = loadedEntry.getValue(); - - // create new entry if host mask unknown, otherwise merge - // existing one with path patterns from blacklist file - paths = (isMatchable(loadedEntry.getKey())) ? blacklistMapMatch.get(loadedEntry.getKey()) : blacklistMapNotMatch.get(loadedEntry.getKey()); - if (paths == null) { - if(isMatchable(loadedEntry.getKey())) - blacklistMapMatch.put(loadedEntry.getKey(), loadedPaths); - else - blacklistMapNotMatch.put(loadedEntry.getKey(), loadedPaths); - } else { - // TODO check for duplicates? 
(refactor List -> Set) - paths.addAll(loadedPaths); - } - } - } - // clean up all search events in case that a (new) blacklist entry denies previously returned results - SearchEventCache.cleanupEvents(true); - } - } - - public void loadList(final String blacklistType, final String fileNames, final String sep) { - // method for not breaking older plasmaURLPattern interface - final blacklistFile blFile = new blacklistFile(fileNames, blacklistType); - - loadList(blFile, sep); - } - - public void removeAll(final String blacklistType, final String host) { - getBlacklistMap(blacklistType,true).remove(host); - getBlacklistMap(blacklistType,false).remove(host); - - // clean up all search events in case that an old blacklist entry denied previously returned results, but does not anymore - SearchEventCache.cleanupEvents(true); - } - - public void remove(final String blacklistType, final String host, final String path) { - - final HashMap> blacklistMap = getBlacklistMap(blacklistType,true); - ArrayList hostList = blacklistMap.get(host); - if(hostList != null) { - hostList.remove(path); - if (hostList.size() == 0) - blacklistMap.remove(host); - } - final HashMap> blacklistMapNotMatch = getBlacklistMap(blacklistType,false); - hostList = blacklistMapNotMatch.get(host); - if(hostList != null) { - hostList.remove(path); - if (hostList.size() == 0) - blacklistMapNotMatch.remove(host); - } - - // clean up all search events in case that an old blacklist entry denied previously returned results, but does not anymore - SearchEventCache.cleanupEvents(true); - } - - public void add(final String blacklistType, String host, String path) { - if (host == null) throw new NullPointerException(); - if (path == null) throw new NullPointerException(); - - if (path.length() > 0 && path.charAt(0) == '/') path = path.substring(1); - - HashMap> blacklistMap; - blacklistMap = (isMatchable(host)) ? getBlacklistMap(blacklistType,true) : getBlacklistMap(blacklistType,false); - - // avoid PatternSyntaxException e - if(!isMatchable(host) && host.startsWith("*")) - host = "." + host; - - ArrayList hostList = blacklistMap.get(host.toLowerCase()); - if (hostList == null) blacklistMap.put(host.toLowerCase(), (hostList = new ArrayList())); - hostList.add(path); - - // clean up all search events in case that a (new) blacklist entry denies previously returned results - SearchEventCache.cleanupEvents(true); - } - - public int blacklistCacheSize() { - int size = 0; - final Iterator iter = this.cachedUrlHashs.keySet().iterator(); - while (iter.hasNext()) { - final Set blacklistMap = this.cachedUrlHashs.get(iter.next()); - size += blacklistMap.size(); - } - return size; - } - - public boolean hashInBlacklistedCache(final String blacklistType, final String urlHash) { - final Set urlHashCache = getCacheUrlHashsSet(blacklistType); - return urlHashCache.contains(urlHash); - } - - public boolean contains(final String blacklistType, String host, String path) { - boolean ret = false; - - if (blacklistType != null && host != null && path != null) { - HashMap> blacklistMap; - blacklistMap = (isMatchable(host)) ? getBlacklistMap(blacklistType,true) : getBlacklistMap(blacklistType,false); - - // avoid PatternSyntaxException e - if(!isMatchable(host) && host.startsWith("*")) - host = "." 
-
-            ArrayList<String> hostList = blacklistMap.get(host.toLowerCase());
-            if (hostList != null) ret = hostList.contains(path);
-        }
-        return ret;
-    }
-
-    public boolean isListed(final String blacklistType, final DigestURI url) {
-
-        final Set<String> urlHashCache = getCacheUrlHashsSet(blacklistType);
-        if (!urlHashCache.contains(url.hash())) {
-            final boolean temp = isListed(blacklistType, url.getHost().toLowerCase(), url.getFile());
-            if (temp) {
-                urlHashCache.add(url.hash());
-            }
-            return temp;
-        }
-        return true;
-    }
-
-    public static boolean isMatchable (final String host) {
-        try {
-            if(Pattern.matches("^[a-z0-9.-]*$", host)) // simple Domain (yacy.net or www.yacy.net)
-                return true;
-            if(Pattern.matches("^\\*\\.[a-z0-9-.]*$", host)) // start with *. (not .* and * must follow a dot)
-                return true;
-            if(Pattern.matches("^[a-z0-9-.]*\\.\\*$", host)) // ends with .* (not *. and befor * must be a dot)
-                return true;
-        } catch (final PatternSyntaxException e) {
-            //System.out.println(e.toString());
-            return false;
-        }
-        return false;
-    }
-
-}
+// Blacklist.java
+// (C) 2005 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
+// first published 11.07.2005 on http://yacy.net
+//
+// This is a part of YaCy, a peer-to-peer based web search engine
+//
+// $LastChangedDate$
+// $LastChangedRevision$
+// $LastChangedBy$
+//
+// LICENSE
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+package net.yacy.repository;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.Set;
+import java.util.Map.Entry;
+import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
+
+import net.yacy.kelondro.data.meta.DigestURI;
+import net.yacy.kelondro.util.SetTools;
+
+public class Blacklist {
+
+    public static final String BLACKLIST_DHT = "dht";
+    public static final String BLACKLIST_CRAWLER = "crawler";
+    public static final String BLACKLIST_PROXY = "proxy";
+    public static final String BLACKLIST_SEARCH = "search";
+    public static final String BLACKLIST_SURFTIPS = "surftips";
+    public static final String BLACKLIST_NEWS = "news";
+
+    public static final int ERR_TWO_WILDCARDS_IN_HOST = 1;
+    public static final int ERR_SUBDOMAIN_XOR_WILDCARD = 2;
+    public static final int ERR_PATH_REGEX = 3;
+    public static final int ERR_WILDCARD_BEGIN_OR_END = 4;
+    public static final int ERR_HOST_WRONG_CHARS = 5;
+    public static final int ERR_DOUBLE_OCCURANCE = 6;
+    public static final int ERR_HOST_REGEX = 7;
+
+    protected static final HashSet<String> BLACKLIST_TYPES = new HashSet<String>(Arrays.asList(new String[]{
+            Blacklist.BLACKLIST_CRAWLER,
+            Blacklist.BLACKLIST_PROXY,
+            Blacklist.BLACKLIST_DHT,
+            Blacklist.BLACKLIST_SEARCH,
+            Blacklist.BLACKLIST_SURFTIPS,
+            Blacklist.BLACKLIST_NEWS
+    }));
+    public static final String BLACKLIST_TYPES_STRING = "proxy,crawler,dht,search,surftips,news";
+
+    protected File blacklistRootPath = null;
+    protected HashMap<String, Set<String>> cachedUrlHashs = null;
+    //protected HashMap<String, HashMap<String, ArrayList<String>>> hostpaths = null; // key=host, value=path; mapped url is http://host/path; path does not start with '/' here
+    protected HashMap<String, HashMap<String, ArrayList<String>>> hostpaths_matchable = null; // key=host, value=path; mapped url is http://host/path; path does not start with '/' here
+    protected HashMap<String, HashMap<String, ArrayList<String>>> hostpaths_notmatchable = null; // key=host, value=path; mapped url is http://host/path; path does not start with '/' here
+
+    public Blacklist(final File rootPath) {
+        this.setRootPath(rootPath);
+
+        this.blacklistRootPath = rootPath;
+
+        // prepare the data structure
+        //this.hostpaths = new HashMap<String, HashMap<String, ArrayList<String>>>();
+        this.hostpaths_matchable = new HashMap<String, HashMap<String, ArrayList<String>>>();
+        this.hostpaths_notmatchable = new HashMap<String, HashMap<String, ArrayList<String>>>();
+        this.cachedUrlHashs = new HashMap<String, Set<String>>();
+
+        final Iterator<String> iter = BLACKLIST_TYPES.iterator();
+        while (iter.hasNext()) {
+            final String blacklistType = iter.next();
+            //this.hostpaths.put(blacklistType, new HashMap<String, ArrayList<String>>());
+            this.hostpaths_matchable.put(blacklistType, new HashMap<String, ArrayList<String>>());
+            this.hostpaths_notmatchable.put(blacklistType, new HashMap<String, ArrayList<String>>());
+            this.cachedUrlHashs.put(blacklistType, Collections.synchronizedSet(new HashSet<String>()));
+        }
+    }
+
+    public void setRootPath(final File rootPath) {
+        if (rootPath == null)
+            throw new NullPointerException("The blacklist root path must not be null.");
+        if (!rootPath.isDirectory())
+            throw new IllegalArgumentException("The blacklist root path is not a directory.");
+        if (!rootPath.canRead())
+            throw new IllegalArgumentException("The blacklist root path is not readable.");
+
+        this.blacklistRootPath = rootPath;
+    }
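+
+    // Usage sketch (illustrative; the directory name is only an example and
+    // must exist and be readable, see setRootPath above):
+    //
+    //   Blacklist bl = new Blacklist(new File("DATA/LISTS"));
+    //   bl.add(Blacklist.BLACKLIST_CRAWLER, "*.example.com", ".*");
+    //   bl.isListed(Blacklist.BLACKLIST_CRAWLER, "www.example.com", "ads/banner.gif"); // -> true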
+
+    protected HashMap<String, ArrayList<String>> getBlacklistMap(final String blacklistType, final boolean matchable) {
+        if (blacklistType == null) throw new IllegalArgumentException();
+        if (!BLACKLIST_TYPES.contains(blacklistType)) throw new IllegalArgumentException("Unknown blacklist type: " + blacklistType + ".");
+
+        return (matchable) ? this.hostpaths_matchable.get(blacklistType) : this.hostpaths_notmatchable.get(blacklistType);
+    }
+
+    protected Set<String> getCacheUrlHashsSet(final String blacklistType) {
+        if (blacklistType == null) throw new IllegalArgumentException();
+        if (!BLACKLIST_TYPES.contains(blacklistType)) throw new IllegalArgumentException("Unknown blacklist type.");
+
+        return this.cachedUrlHashs.get(blacklistType);
+    }
+
+    public void clear() {
+        for(final HashMap<String, ArrayList<String>> entry: this.hostpaths_matchable.values()) {
+            entry.clear();
+        }
+        for(final HashMap<String, ArrayList<String>> entry: this.hostpaths_notmatchable.values()) {
+            entry.clear();
+        }
+        for(final Set<String> entry: this.cachedUrlHashs.values()) {
+            entry.clear();
+        }
+    }
+
+    public int size() {
+        int size = 0;
+        for(final String entry: this.hostpaths_matchable.keySet()) {
+            for(final ArrayList<String> ientry: this.hostpaths_matchable.get(entry).values()) {
+                size += ientry.size();
+            }
+        }
+        for(final String entry: this.hostpaths_notmatchable.keySet()) {
+            for(final ArrayList<String> ientry: this.hostpaths_notmatchable.get(entry).values()) {
+                size += ientry.size();
+            }
+        }
+        return size;
+    }
+
+    public void loadList(final BlacklistFile[] blFiles, final String sep) {
+        for (int j = 0; j < blFiles.length; j++) {
+            final BlacklistFile blf = blFiles[j];
+            loadList(blf.getType(), blf.getFileName(), sep);
+        }
+    }
+
+    public void loadList(final BlacklistFile blFile, final String sep) {
+        final HashMap<String, ArrayList<String>> blacklistMapMatch = getBlacklistMap(blFile.getType(),true);
+        final HashMap<String, ArrayList<String>> blacklistMapNotMatch = getBlacklistMap(blFile.getType(),false);
+        Set<Map.Entry<String, ArrayList<String>>> loadedBlacklist;
+        Map.Entry<String, ArrayList<String>> loadedEntry;
+        ArrayList<String> paths;
+        ArrayList<String> loadedPaths;
+
+        final String[] fileNames = blFile.getFileNamesUnified();
+        if (fileNames.length > 0) {
+            for (int i = 0; i < fileNames.length; i++) {
+                // make sure all requested blacklist files exist
+                final File file = new File(this.blacklistRootPath, fileNames[i]);
+                try {
+                    file.createNewFile();
+                } catch (final IOException e) { /* */ }
+
+                // join all blacklists from files into one internal blacklist map
+                loadedBlacklist = SetTools.loadMapMultiValsPerKey(file.toString(), sep).entrySet();
+                for (final Iterator<Map.Entry<String, ArrayList<String>>> mi = loadedBlacklist.iterator(); mi.hasNext(); ) {
+                    loadedEntry = mi.next();
+                    loadedPaths = loadedEntry.getValue();
+
+                    // create new entry if host mask unknown, otherwise merge
+                    // existing one with path patterns from blacklist file
+                    paths = (isMatchable(loadedEntry.getKey())) ? blacklistMapMatch.get(loadedEntry.getKey()) : blacklistMapNotMatch.get(loadedEntry.getKey());
+                    if (paths == null) {
+                        if(isMatchable(loadedEntry.getKey()))
+                            blacklistMapMatch.put(loadedEntry.getKey(), loadedPaths);
+                        else
+                            blacklistMapNotMatch.put(loadedEntry.getKey(), loadedPaths);
+                    } else {
+                        // TODO check for duplicates? (refactor List -> Set)
+                        paths.addAll(loadedPaths);
+                    }
+                }
+            }
+        }
+    }
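+
+    // Blacklist file format (illustrative): SetTools.loadMapMultiValsPerKey
+    // is expected to read one entry per line, where host mask and path
+    // pattern are joined by the separator 'sep' (callers typically pass "/"),
+    // e.g.:
+    //
+    //   *.doubleclick.net/.*
+    //   ads.example.com/banner/.*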
+
+    public void loadList(final String blacklistType, final String fileNames, final String sep) {
+        // method for not breaking older plasmaURLPattern interface
+        final BlacklistFile blFile = new BlacklistFile(fileNames, blacklistType);
+
+        loadList(blFile, sep);
+    }
+
+    public void removeAll(final String blacklistType, final String host) {
+        getBlacklistMap(blacklistType,true).remove(host);
+        getBlacklistMap(blacklistType,false).remove(host);
+    }
+
+    public void remove(final String blacklistType, final String host, final String path) {
+
+        final HashMap<String, ArrayList<String>> blacklistMap = getBlacklistMap(blacklistType,true);
+        ArrayList<String> hostList = blacklistMap.get(host);
+        if (hostList != null) {
+            hostList.remove(path);
+            if (hostList.size() == 0)
+                blacklistMap.remove(host);
+        }
+        final HashMap<String, ArrayList<String>> blacklistMapNotMatch = getBlacklistMap(blacklistType,false);
+        hostList = blacklistMapNotMatch.get(host);
+        if (hostList != null) {
+            hostList.remove(path);
+            if (hostList.size() == 0)
+                blacklistMapNotMatch.remove(host);
+        }
+    }
+
+    public void add(final String blacklistType, String host, String path) {
+        if (host == null) throw new NullPointerException();
+        if (path == null) throw new NullPointerException();
+
+        if (path.length() > 0 && path.charAt(0) == '/') path = path.substring(1);
+
+        HashMap<String, ArrayList<String>> blacklistMap;
+        blacklistMap = (isMatchable(host)) ? getBlacklistMap(blacklistType,true) : getBlacklistMap(blacklistType,false);
+
+        // avoid a PatternSyntaxException below
+        if (!isMatchable(host) && host.startsWith("*"))
+            host = "." + host;
+
+        ArrayList<String> hostList = blacklistMap.get(host.toLowerCase());
+        if (hostList == null) blacklistMap.put(host.toLowerCase(), (hostList = new ArrayList<String>()));
+        hostList.add(path);
+    }
+
+    public int blacklistCacheSize() {
+        int size = 0;
+        final Iterator<String> iter = this.cachedUrlHashs.keySet().iterator();
+        while (iter.hasNext()) {
+            final Set<String> blacklistMap = this.cachedUrlHashs.get(iter.next());
+            size += blacklistMap.size();
+        }
+        return size;
+    }
+
+    public boolean hashInBlacklistedCache(final String blacklistType, final String urlHash) {
+        final Set<String> urlHashCache = getCacheUrlHashsSet(blacklistType);
+        return urlHashCache.contains(urlHash);
+    }
+
+    public boolean contains(final String blacklistType, String host, String path) {
+        boolean ret = false;
+
+        if (blacklistType != null && host != null && path != null) {
+            HashMap<String, ArrayList<String>> blacklistMap;
+            blacklistMap = (isMatchable(host)) ? getBlacklistMap(blacklistType,true) : getBlacklistMap(blacklistType,false);
+
+            // avoid a PatternSyntaxException below
+            if (!isMatchable(host) && host.startsWith("*"))
+                host = "." + host;
+
+            ArrayList<String> hostList = blacklistMap.get(host.toLowerCase());
+            if (hostList != null) ret = hostList.contains(path);
+        }
+        return ret;
+    }
+
+    public boolean isListed(final String blacklistType, final DigestURI url) {
+
+        final Set<String> urlHashCache = getCacheUrlHashsSet(blacklistType);
+        if (!urlHashCache.contains(url.hash())) {
+            final boolean temp = isListed(blacklistType, url.getHost().toLowerCase(), url.getFile());
+            if (temp) {
+                urlHashCache.add(url.hash());
+            }
+            return temp;
+        }
+        return true;
+    }
+
+    public static boolean isMatchable(final String host) {
+        try {
+            if (Pattern.matches("^[a-z0-9.-]*$", host)) // simple domain (yacy.net or www.yacy.net)
+                return true;
+            if (Pattern.matches("^\\*\\.[a-z0-9-.]*$", host)) // starts with *. (not .*; the * must be followed by a dot)
+                return true;
+            if (Pattern.matches("^[a-z0-9-.]*\\.\\*$", host)) // ends with .* (not *.; the * must be preceded by a dot)
+                return true;
+        } catch (final PatternSyntaxException e) {
+            //System.out.println(e.toString());
+            return false;
+        }
+        return false;
+    }
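+
+    // Examples (illustrative) of accepted and rejected host masks:
+    //   isMatchable("www.example.com") -> true   (plain domain)
+    //   isMatchable("*.example.com")   -> true   (wildcard for leading sub-domains)
+    //   isMatchable("www.example.*")   -> true   (wildcard for the trailing part)
+    //   isMatchable("www.*.com")       -> false  (wildcard inside the host; kept as regex entry)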
+
+    public String getEngineInfo() {
+        return "Default YaCy Blacklist Engine";
+    }
+
+    public boolean isListed(final String blacklistType, final String hostlow, String path) {
+        if (hostlow == null) throw new NullPointerException();
+        if (path == null) throw new NullPointerException();
+
+        // getting the proper blacklist
+        final HashMap<String, ArrayList<String>> blacklistMapMatched = getBlacklistMap(blacklistType,true);
+
+        if (path.length() > 0 && path.charAt(0) == '/') path = path.substring(1);
+        ArrayList<String> app;
+        boolean matched = false;
+        String pp = ""; // path-pattern
+
+        // try to match the complete domain
+        if (!matched && (app = blacklistMapMatched.get(hostlow)) != null) {
+            for (int i = app.size() - 1; !matched && i > -1; i--) {
+                pp = app.get(i);
+                matched |= ((pp.equals("*")) || (path.matches(pp)));
+            }
+        }
+        // then try to match the domain with wildcard '*'
+        // [TL] while "." are found within the string
+        int index = 0;
+        while (!matched && (index = hostlow.indexOf('.', index + 1)) != -1) {
+            if ((app = blacklistMapMatched.get(hostlow.substring(0, index + 1) + "*")) != null) {
+                for (int i = app.size() - 1; !matched && i > -1; i--) {
+                    pp = app.get(i);
+                    matched |= ((pp.equals("*")) || (path.matches(pp)));
+                }
+            }
+            if ((app = blacklistMapMatched.get(hostlow.substring(0, index))) != null) {
+                for (int i = app.size() - 1; !matched && i > -1; i--) {
+                    pp = app.get(i);
+                    matched |= ((pp.equals("*")) || (path.matches(pp)));
+                }
+            }
+        }
+        index = hostlow.length();
+        while (!matched && (index = hostlow.lastIndexOf('.', index - 1)) != -1) {
+            if ((app = blacklistMapMatched.get("*" + hostlow.substring(index, hostlow.length()))) != null) {
+                for (int i = app.size() - 1; !matched && i > -1; i--) {
+                    pp = app.get(i);
+                    matched |= ((pp.equals("*")) || (path.matches(pp)));
+                }
+            }
+            if ((app = blacklistMapMatched.get(hostlow.substring(index + 1, hostlow.length()))) != null) {
+                for (int i = app.size() - 1; !matched && i > -1; i--) {
+                    pp = app.get(i);
+                    matched |= ((pp.equals("*")) || (path.matches(pp)));
+                }
+            }
+        }
+
+        // finally, loop over all regex entries
+        if (!matched) {
+            final HashMap<String, ArrayList<String>> blacklistMapNotMatched = getBlacklistMap(blacklistType,false);
+            String key;
+            for (final Entry<String, ArrayList<String>> entry: blacklistMapNotMatched.entrySet()) {
+                key = entry.getKey();
+                try {
+                    if (Pattern.matches(key, hostlow)) {
+                        app = entry.getValue();
+                        for (int i = 0; i < app.size(); i++) {
+                            pp = app.get(i);
+                            matched |= ((pp.equals("*")) || (path.matches(pp)));
+                        }
+                    }
+                } catch (final PatternSyntaxException e) {
+                    //System.out.println(e.toString());
+                }
+            }
+        }
+        return matched;
+    }
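+
+    // Lookup sketch (illustrative): for hostlow = "www.example.com" the
+    // matchable map is probed with the exact host and derived wildcard keys,
+    // e.g. "www.*", "www", "www.example.*", "www.example", "*.com", "com",
+    // "*.example.com" and "example.com", before the non-matchable entries
+    // are tried as regular expressions against the whole host name.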
+
+    public int checkError(String element, Map<String, String> properties) {
+
+        boolean allowRegex = true;
+        int slashPos;
+        String host, path;
+
+        if (properties != null) {
+            allowRegex = properties.get("allowRegex").equalsIgnoreCase("true") ? true : false;
+        }
+
+        if ((slashPos = element.indexOf("/")) == -1) {
+            host = element;
+            path = ".*";
+        } else {
+            host = element.substring(0, slashPos);
+            path = element.substring(slashPos + 1);
+        }
+
+        if (!allowRegex || !isValidRegex(host)) {
+            final int i = host.indexOf("*");
+
+            // check whether the host begins illegally
+            if (!host.matches("([A-Za-z0-9_-]+|\\*)(\\.([A-Za-z0-9_-]+|\\*))*")) {
+                if (i == 0 && host.length() > 1 && host.charAt(1) != '.') {
+                    return ERR_SUBDOMAIN_XOR_WILDCARD;
+                }
+                return ERR_HOST_WRONG_CHARS;
+            }
+
+            // in the host-part only full sub-domains may be wildcards
+            if (host.length() > 0 && i > -1) {
+                if (!(i == 0 || i == host.length() - 1)) {
+                    return ERR_WILDCARD_BEGIN_OR_END;
+                }
+
+                if (i == host.length() - 1 && host.length() > 1 && host.charAt(i - 1) != '.') {
+                    return ERR_SUBDOMAIN_XOR_WILDCARD;
+                }
+            }
+
+            // check for double occurrences of "*" in the host
+            if (host.indexOf("*", i + 1) > -1) {
+                return ERR_TWO_WILDCARDS_IN_HOST;
+            }
+        } else if (allowRegex && !isValidRegex(host)) {
+            return ERR_HOST_REGEX;
+        }
+
+        // check for errors when regex-compiling the path
+        if (!isValidRegex(path) && !path.equals("*")) {
+            return ERR_PATH_REGEX;
+        }
+
+        return 0;
+    }
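+
+    // Examples (illustrative) for checkError with allowRegex=false:
+    //   "www.example.com/.*" -> 0 (valid entry)
+    //   "www.*.com/.*"       -> ERR_WILDCARD_BEGIN_OR_END
+    //   "*example.com/.*"    -> ERR_SUBDOMAIN_XOR_WILDCARD
+    //   "exa mple.com/.*"    -> ERR_HOST_WRONG_CHARS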
+
+    /**
+     * Checks if a given expression is a valid regular expression.
+     * @param expression The expression to be checked.
+     * @return True if the expression is a valid regular expression, else false.
+     */
+    private static boolean isValidRegex(String expression) {
+        boolean ret = true;
+        try {
+            Pattern.compile(expression);
+        } catch (final PatternSyntaxException e) {
+            ret = false;
+        }
+        return ret;
+    }
+
+}
diff --git a/source/net/yacy/repository/BlacklistFile.java b/source/net/yacy/repository/BlacklistFile.java
new file mode 100644
index 000000000..9b2ff074e
--- /dev/null
+++ b/source/net/yacy/repository/BlacklistFile.java
@@ -0,0 +1,58 @@
+// BlacklistFile.java
+// (C) 2005 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
+// first published 11.07.2005 on http://yacy.net
+//
+// This is a part of YaCy, a peer-to-peer based web search engine
+//
+// $LastChangedDate: 2009-09-29 23:28:49 +0200 (Di, 29 Sep 2009) $
+// $LastChangedRevision: 6359 $
+// $LastChangedBy: low012 $
+//
+// LICENSE
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+package net.yacy.repository;
+
+import java.util.Arrays;
+import java.util.HashSet;
+
+public class BlacklistFile {
+
+    private final String filename;
+    private final String type;
+
+    public BlacklistFile(final String filename, final String type) {
+        this.filename = filename;
+        this.type = type;
+    }
+
+    public String getFileName() { return this.filename; }
+
+    /**
+     * Construct a unified array of file names from a comma-separated file
+     * name list.
+     *
+     * @return unified String array of file names
+     */
+    public String[] getFileNamesUnified() {
+        final HashSet<String> hs = new HashSet<String>(Arrays.asList(this.filename.split(",")));
+
+        return hs.toArray(new String[hs.size()]);
+    }
+
+    public String getType() { return this.type; }
+}
diff --git a/source/net/yacy/repository/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java
index de4a408cc..9059f2fce 100644
--- a/source/net/yacy/repository/LoaderDispatcher.java
+++ b/source/net/yacy/repository/LoaderDispatcher.java
@@ -421,11 +421,6 @@ public final class LoaderDispatcher {
             return null;
         }
     }
-
-    public static Document parseDocument(final DigestURI url, final long contentLength, final InputStream resourceStream) throws ParserException {
-        return parseDocument(url, contentLength, resourceStream, null);
-    }
-
     public static ContentScraper parseResource(final LoaderDispatcher loader, final DigestURI location, int cachePolicy) throws IOException {
         // load page