- refactoring of blacklists

- refactoring of event origin encoding


git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6434 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 16 years ago
parent 26fafd85a5
commit 5e8038ac4d

@ -43,17 +43,16 @@ import java.util.Map.Entry;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import de.anomic.data.AbstractBlacklist;
import de.anomic.data.Blacklist;
import de.anomic.data.DefaultBlacklist;
import de.anomic.data.listManager; import de.anomic.data.listManager;
import de.anomic.http.server.RequestHeader; import de.anomic.http.server.RequestHeader;
import de.anomic.search.SearchEventCache;
import de.anomic.search.Switchboard; import de.anomic.search.Switchboard;
import de.anomic.server.serverObjects; import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch; import de.anomic.server.serverSwitch;
import java.util.Set; import java.util.Set;
import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.logging.Log;
import net.yacy.repository.Blacklist;
public class BlacklistCleaner_p { public class BlacklistCleaner_p {
@ -65,7 +64,7 @@ public class BlacklistCleaner_p {
private final static String BLACKLIST_FILENAME_FILTER = "^.*\\.black$"; private final static String BLACKLIST_FILENAME_FILTER = "^.*\\.black$";
public static final Class<?>[] supportedBLEngines = { public static final Class<?>[] supportedBLEngines = {
DefaultBlacklist.class Blacklist.class
}; };
public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) { public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
@ -77,7 +76,7 @@ public class BlacklistCleaner_p {
String blacklistToUse = null; String blacklistToUse = null;
// get the list of supported blacklist types // get the list of supported blacklist types
final String supportedBlacklistTypesStr = AbstractBlacklist.BLACKLIST_TYPES_STRING; final String supportedBlacklistTypesStr = Blacklist.BLACKLIST_TYPES_STRING;
final String[] supportedBlacklistTypes = supportedBlacklistTypesStr.split(","); final String[] supportedBlacklistTypes = supportedBlacklistTypesStr.split(",");
prop.put(DISABLED+"checked", "1"); prop.put(DISABLED+"checked", "1");
@ -246,8 +245,6 @@ public class BlacklistCleaner_p {
final Map<String, String> properties= new HashMap<String, String>(); final Map<String, String> properties= new HashMap<String, String>();
properties.put("allowRegex", String.valueOf(allowRegex)); properties.put("allowRegex", String.valueOf(allowRegex));
if (blEngine instanceof AbstractBlacklist) {
int err = 0; int err = 0;
for (String element : list) { for (String element : list) {
@ -255,7 +252,7 @@ public class BlacklistCleaner_p {
// check for double-occurance // check for double-occurance
if (legalEntries.contains(element)) { if (legalEntries.contains(element)) {
illegalEntries.put(element, Integer.valueOf(AbstractBlacklist.ERR_DOUBLE_OCCURANCE)); illegalEntries.put(element, Integer.valueOf(Blacklist.ERR_DOUBLE_OCCURANCE));
continue; continue;
} }
legalEntries.add(element); legalEntries.add(element);
@ -266,7 +263,6 @@ public class BlacklistCleaner_p {
illegalEntries.put(element, err); illegalEntries.put(element, err);
} }
} }
}
return illegalEntries; return illegalEntries;
} }
@ -309,14 +305,14 @@ public class BlacklistCleaner_p {
final String host = (s.indexOf("/") == -1) ? s : s.substring(0, s.indexOf("/")); final String host = (s.indexOf("/") == -1) ? s : s.substring(0, s.indexOf("/"));
final String path = (s.indexOf("/") == -1) ? ".*" : s.substring(s.indexOf("/") + 1); final String path = (s.indexOf("/") == -1) ? ".*" : s.substring(s.indexOf("/") + 1);
try { try {
Switchboard.urlBlacklist.remove(supportedBlacklistTypes[blTypes], Switchboard.urlBlacklist.remove(supportedBlacklistTypes[blTypes], host, path);
host,path);
} catch (final RuntimeException e) { } catch (final RuntimeException e) {
//System.err.println(e.getMessage() + ": " + host + "/" + path); //System.err.println(e.getMessage() + ": " + host + "/" + path);
Log.logSevere("BLACKLIST-CLEANER", e.getMessage() + ": " + host + "/" + path); Log.logSevere("BLACKLIST-CLEANER", e.getMessage() + ": " + host + "/" + path);
} }
} }
} }
SearchEventCache.cleanupEvents(true);
} }
if (listChanged){ if (listChanged){
listManager.writeList(new File(listManager.listsPath, blacklistToUse), list.toArray(new String[list.size()])); listManager.writeList(new File(listManager.listsPath, blacklistToUse), list.toArray(new String[list.size()]));
@ -360,6 +356,7 @@ public class BlacklistCleaner_p {
path); path);
} }
} }
SearchEventCache.cleanupEvents(true);
} }
pw.close(); pw.close();
} catch (final IOException e) { } catch (final IOException e) {

@ -33,8 +33,8 @@ import java.io.File;
import java.net.MalformedURLException; import java.net.MalformedURLException;
import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.repository.Blacklist;
import de.anomic.data.Blacklist;
import de.anomic.data.listManager; import de.anomic.data.listManager;
import de.anomic.http.server.RequestHeader; import de.anomic.http.server.RequestHeader;
import de.anomic.search.Switchboard; import de.anomic.search.Switchboard;

@ -41,11 +41,11 @@ import java.util.List;
import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.logging.Log;
import net.yacy.repository.Blacklist;
import de.anomic.data.AbstractBlacklist;
import de.anomic.data.Blacklist;
import de.anomic.data.listManager; import de.anomic.data.listManager;
import de.anomic.http.server.RequestHeader; import de.anomic.http.server.RequestHeader;
import de.anomic.search.SearchEventCache;
import de.anomic.search.Switchboard; import de.anomic.search.Switchboard;
import de.anomic.server.serverObjects; import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch; import de.anomic.server.serverSwitch;
@ -66,7 +66,7 @@ public class Blacklist_p {
listManager.listsPath = new File(listManager.switchboard.getRootPath(),listManager.switchboard.getConfig("listManager.listsPath", "DATA/LISTS")); listManager.listsPath = new File(listManager.switchboard.getRootPath(),listManager.switchboard.getConfig("listManager.listsPath", "DATA/LISTS"));
// get the list of supported blacklist types // get the list of supported blacklist types
final String supportedBlacklistTypesStr = AbstractBlacklist.BLACKLIST_TYPES_STRING; final String supportedBlacklistTypesStr = Blacklist.BLACKLIST_TYPES_STRING;
final String[] supportedBlacklistTypes = supportedBlacklistTypesStr.split(","); final String[] supportedBlacklistTypes = supportedBlacklistTypesStr.split(",");
// load all blacklist files located in the directory // load all blacklist files located in the directory
@ -560,6 +560,7 @@ public class Blacklist_p {
Switchboard.urlBlacklist.add(supportedBlacklistTypes[blTypes], newEntry.substring(0, pos), newEntry.substring(pos + 1)); Switchboard.urlBlacklist.add(supportedBlacklistTypes[blTypes], newEntry.substring(0, pos), newEntry.substring(pos + 1));
} }
} }
SearchEventCache.cleanupEvents(true);
} }
return null; return null;
@ -610,6 +611,7 @@ public class Blacklist_p {
Switchboard.urlBlacklist.remove(supportedBlacklistTypes[blTypes],oldEntry.substring(0, pos), oldEntry.substring(pos + 1)); Switchboard.urlBlacklist.remove(supportedBlacklistTypes[blTypes],oldEntry.substring(0, pos), oldEntry.substring(pos + 1));
} }
} }
SearchEventCache.cleanupEvents(true);
return null; return null;
} }

@ -34,6 +34,7 @@ import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.logging.Log;
import de.anomic.crawler.retrieval.EventOrigin;
import de.anomic.http.server.RequestHeader; import de.anomic.http.server.RequestHeader;
import de.anomic.search.Segments; import de.anomic.search.Segments;
import de.anomic.search.Switchboard; import de.anomic.search.Switchboard;
@ -63,20 +64,24 @@ public class CrawlResults {
} }
// find process number // find process number
int tabletype; EventOrigin tabletype;
try { try {
tabletype = Integer.parseInt(post.get("process", "0")); tabletype = EventOrigin.getEvent(Integer.parseInt(post.get("process", "0")));
} catch (final NumberFormatException e) { } catch (final NumberFormatException e) {
tabletype = 0; tabletype = EventOrigin.UNKNOWN;
} }
if ((post != null) && (post.containsKey("autoforward")) && (tabletype == 5) && (sb.crawlResults.getStackSize(5) == 0)) { if (
post != null &&
post.containsKey("autoforward") &&
tabletype == EventOrigin.LOCAL_CRAWLING &&
sb.crawlResults.getStackSize(EventOrigin.LOCAL_CRAWLING) == 0) {
// the main menu does a request to the local crawler page, but in case this table is empty, the overview page is shown // the main menu does a request to the local crawler page, but in case this table is empty, the overview page is shown
tabletype = 0; tabletype = EventOrigin.UNKNOWN;
} }
// check if authorization is needed and/or given // check if authorization is needed and/or given
if (((tabletype > 0) && (tabletype < 6)) || if (tabletype != EventOrigin.UNKNOWN ||
(post != null && (post.containsKey("clearlist") || (post != null && (post.containsKey("clearlist") ||
post.containsKey("deleteentry")))) { post.containsKey("deleteentry")))) {
final String authorization = (header.get(RequestHeader.AUTHORIZATION, "xxxxxx")); final String authorization = (header.get(RequestHeader.AUTHORIZATION, "xxxxxx"));
@ -143,7 +148,7 @@ public class CrawlResults {
} // end != null } // end != null
// create table // create table
if (tabletype == 0) { if (tabletype == EventOrigin.UNKNOWN) {
prop.put("table", "2"); prop.put("table", "2");
} else if (sb.crawlResults.getStackSize(tabletype) == 0 && sb.crawlResults.getDomainListSize(tabletype) == 0) { } else if (sb.crawlResults.getStackSize(tabletype) == 0 && sb.crawlResults.getDomainListSize(tabletype) == 0) {
prop.put("table", "0"); prop.put("table", "0");
@ -159,7 +164,7 @@ public class CrawlResults {
prop.put("table_size_all", sb.crawlResults.getStackSize(tabletype)); prop.put("table_size_all", sb.crawlResults.getStackSize(tabletype));
prop.putHTML("table_feedbackpage", "CrawlResults.html"); prop.putHTML("table_feedbackpage", "CrawlResults.html");
prop.put("table_tabletype", tabletype); prop.put("table_tabletype", tabletype.getCode());
prop.put("table_showInit", (showInit) ? "1" : "0"); prop.put("table_showInit", (showInit) ? "1" : "0");
prop.put("table_showExec", (showExec) ? "1" : "0"); prop.put("table_showExec", (showExec) ? "1" : "0");
prop.put("table_showDate", (showDate) ? "1" : "0"); prop.put("table_showDate", (showDate) ? "1" : "0");
@ -196,7 +201,7 @@ public class CrawlResults {
prop.put("table_indexed_" + cnt + "_dark", (dark) ? "1" : "0"); prop.put("table_indexed_" + cnt + "_dark", (dark) ? "1" : "0");
prop.put("table_indexed_" + cnt + "_feedbackpage", "CrawlResults.html"); prop.put("table_indexed_" + cnt + "_feedbackpage", "CrawlResults.html");
prop.put("table_indexed_" + cnt + "_tabletype", tabletype); prop.put("table_indexed_" + cnt + "_tabletype", tabletype.getCode());
prop.put("table_indexed_" + cnt + "_urlhash", urlHash); prop.put("table_indexed_" + cnt + "_urlhash", urlHash);
if (showInit) { if (showInit) {
@ -266,7 +271,7 @@ public class CrawlResults {
if (domain == null) break; if (domain == null) break;
prop.put("table_domains_" + cnt + "_dark", (dark) ? "1" : "0"); prop.put("table_domains_" + cnt + "_dark", (dark) ? "1" : "0");
prop.put("table_domains_" + cnt + "_feedbackpage", "CrawlResults.html"); prop.put("table_domains_" + cnt + "_feedbackpage", "CrawlResults.html");
prop.put("table_domains_" + cnt + "_tabletype", tabletype); prop.put("table_domains_" + cnt + "_tabletype", tabletype.getCode());
prop.put("table_domains_" + cnt + "_domain", domain); prop.put("table_domains_" + cnt + "_domain", domain);
prop.put("table_domains_" + cnt + "_hashpart", DigestURI.hosthash6(domain)); prop.put("table_domains_" + cnt + "_hashpart", DigestURI.hosthash6(domain));
prop.put("table_domains_" + cnt + "_count", sb.crawlResults.domainCount(tabletype, domain)); prop.put("table_domains_" + cnt + "_count", sb.crawlResults.domainCount(tabletype, domain));
@ -275,7 +280,7 @@ public class CrawlResults {
} }
prop.put("table_domains", cnt); prop.put("table_domains", cnt);
} }
prop.put("process", tabletype); prop.put("process", tabletype.getCode());
// return rewrite properties // return rewrite properties
return prop; return prop;
} }

@ -47,9 +47,8 @@ import net.yacy.kelondro.rwi.Reference;
import net.yacy.kelondro.rwi.ReferenceContainer; import net.yacy.kelondro.rwi.ReferenceContainer;
import net.yacy.kelondro.rwi.ReferenceContainerCache; import net.yacy.kelondro.rwi.ReferenceContainerCache;
import net.yacy.kelondro.util.DateFormatter; import net.yacy.kelondro.util.DateFormatter;
import net.yacy.repository.Blacklist;
import de.anomic.data.AbstractBlacklist;
import de.anomic.data.Blacklist;
import de.anomic.data.listManager; import de.anomic.data.listManager;
import de.anomic.http.server.RequestHeader; import de.anomic.http.server.RequestHeader;
import de.anomic.search.QueryParams; import de.anomic.search.QueryParams;
@ -320,6 +319,7 @@ public class IndexControlRWIs_p {
url.getFile()); url.getFile());
} }
} }
SearchEventCache.cleanupEvents(true);
} }
} }
pw.close(); pw.close();
@ -330,7 +330,7 @@ public class IndexControlRWIs_p {
if (post.containsKey("blacklistdomains")) { if (post.containsKey("blacklistdomains")) {
PrintWriter pw; PrintWriter pw;
try { try {
final String[] supportedBlacklistTypes = AbstractBlacklist.BLACKLIST_TYPES_STRING.split(","); final String[] supportedBlacklistTypes = Blacklist.BLACKLIST_TYPES_STRING.split(",");
pw = new PrintWriter(new FileWriter(new File(listManager.listsPath, blacklist), true)); pw = new PrintWriter(new FileWriter(new File(listManager.listsPath, blacklist), true));
DigestURI url; DigestURI url;
for (i = 0; i<urlx.length; i++) { for (i = 0; i<urlx.length; i++) {

@ -37,8 +37,8 @@ import net.yacy.kelondro.index.Row.Entry;
import net.yacy.kelondro.order.NaturalOrder; import net.yacy.kelondro.order.NaturalOrder;
import net.yacy.kelondro.util.DateFormatter; import net.yacy.kelondro.util.DateFormatter;
import net.yacy.kelondro.util.ScoreCluster; import net.yacy.kelondro.util.ScoreCluster;
import net.yacy.repository.Blacklist;
import de.anomic.data.Blacklist;
import de.anomic.http.server.RequestHeader; import de.anomic.http.server.RequestHeader;
import de.anomic.search.Switchboard; import de.anomic.search.Switchboard;
import de.anomic.server.serverObjects; import de.anomic.server.serverObjects;

@ -37,8 +37,8 @@ import net.yacy.kelondro.index.Row.Entry;
import net.yacy.kelondro.order.NaturalOrder; import net.yacy.kelondro.order.NaturalOrder;
import net.yacy.kelondro.util.DateFormatter; import net.yacy.kelondro.util.DateFormatter;
import net.yacy.kelondro.util.ScoreCluster; import net.yacy.kelondro.util.ScoreCluster;
import net.yacy.repository.Blacklist;
import de.anomic.data.Blacklist;
import de.anomic.http.server.RequestHeader; import de.anomic.http.server.RequestHeader;
import de.anomic.search.Switchboard; import de.anomic.search.Switchboard;
import de.anomic.server.serverObjects; import de.anomic.server.serverObjects;

@ -267,7 +267,7 @@ public class ViewFile {
// parsing the resource content // parsing the resource content
Document document = null; Document document = null;
try { try {
document = LoaderDispatcher.parseDocument(url, resourceLength, resource); document = LoaderDispatcher.parseDocument(url, resourceLength, resource, null);
if (document == null) { if (document == null) {
prop.put("error", "5"); prop.put("error", "5");
prop.put("error_errorText", "Unknown error"); prop.put("error_errorText", "Unknown error");

@ -2,7 +2,8 @@
import java.io.File; import java.io.File;
import java.util.List; import java.util.List;
import de.anomic.data.AbstractBlacklist; import net.yacy.repository.Blacklist;
import de.anomic.data.listManager; import de.anomic.data.listManager;
import de.anomic.http.server.RequestHeader; import de.anomic.http.server.RequestHeader;
import de.anomic.server.serverObjects; import de.anomic.server.serverObjects;
@ -34,7 +35,7 @@ public class blacklists_p {
prop.put("lists_" + blacklistCount + "_shared", "0"); prop.put("lists_" + blacklistCount + "_shared", "0");
} }
final String[] types = AbstractBlacklist.BLACKLIST_TYPES_STRING.split(","); final String[] types = Blacklist.BLACKLIST_TYPES_STRING.split(",");
for (int j=0; j<types.length; j++) { for (int j=0; j<types.length; j++) {
prop.putXML("lists_" + blacklistCount + "_types_" + j + "_name", types[j]); prop.putXML("lists_" + blacklistCount + "_types_" + j + "_name", types[j]);
prop.put("lists_" + blacklistCount + "_types_" + j + "_value", prop.put("lists_" + blacklistCount + "_types_" + j + "_value",

@ -40,13 +40,13 @@ import java.util.Iterator;
import java.util.List; import java.util.List;
import de.anomic.crawler.retrieval.HTTPLoader; import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.data.AbstractBlacklist;
import de.anomic.data.listManager; import de.anomic.data.listManager;
import de.anomic.data.list.ListAccumulator; import de.anomic.data.list.ListAccumulator;
import de.anomic.data.list.XMLBlacklistImporter; import de.anomic.data.list.XMLBlacklistImporter;
import de.anomic.http.client.Client; import de.anomic.http.client.Client;
import de.anomic.http.server.HeaderFramework; import de.anomic.http.server.HeaderFramework;
import de.anomic.http.server.RequestHeader; import de.anomic.http.server.RequestHeader;
import de.anomic.search.SearchEventCache;
import de.anomic.search.Switchboard; import de.anomic.search.Switchboard;
import de.anomic.server.serverObjects; import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch; import de.anomic.server.serverSwitch;
@ -54,6 +54,7 @@ import de.anomic.yacy.yacySeed;
import net.yacy.document.parser.html.CharacterCoding; import net.yacy.document.parser.html.CharacterCoding;
import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.FileUtils;
import net.yacy.repository.Blacklist;
import org.xml.sax.SAXException; import org.xml.sax.SAXException;
@ -249,7 +250,7 @@ public class sharedBlacklist_p {
count++; count++;
if (Switchboard.urlBlacklist != null) { if (Switchboard.urlBlacklist != null) {
final String supportedBlacklistTypesStr = AbstractBlacklist.BLACKLIST_TYPES_STRING; final String supportedBlacklistTypesStr = Blacklist.BLACKLIST_TYPES_STRING;
final String[] supportedBlacklistTypes = supportedBlacklistTypesStr.split(","); final String[] supportedBlacklistTypes = supportedBlacklistTypesStr.split(",");
for (int blTypes=0; blTypes < supportedBlacklistTypes.length; blTypes++) { for (int blTypes=0; blTypes < supportedBlacklistTypes.length; blTypes++) {
@ -257,6 +258,7 @@ public class sharedBlacklist_p {
Switchboard.urlBlacklist.add(supportedBlacklistTypes[blTypes],newItem.substring(0, pos), newItem.substring(pos + 1)); Switchboard.urlBlacklist.add(supportedBlacklistTypes[blTypes],newItem.substring(0, pos), newItem.substring(pos + 1));
} }
} }
SearchEventCache.cleanupEvents(true);
} }
} }
} }

@ -33,6 +33,7 @@ import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.logging.Log;
import de.anomic.crawler.ZURL; import de.anomic.crawler.ZURL;
import de.anomic.crawler.retrieval.EventOrigin;
import de.anomic.http.server.RequestHeader; import de.anomic.http.server.RequestHeader;
import de.anomic.search.Segments; import de.anomic.search.Segments;
import de.anomic.search.Switchboard; import de.anomic.search.Switchboard;
@ -139,7 +140,7 @@ public final class crawlReceipt {
if (result.equals("fill")) try { if (result.equals("fill")) try {
// put new entry into database // put new entry into database
sb.indexSegments.urlMetadata(Segments.Process.RECEIPTS).store(entry); sb.indexSegments.urlMetadata(Segments.Process.RECEIPTS).store(entry);
sb.crawlResults.stack(entry, youare, iam, 1); sb.crawlResults.stack(entry, youare, iam, EventOrigin.REMOTE_RECEIPTS);
sb.crawlQueues.delegatedURL.remove(entry.hash()); // the delegated work has been done sb.crawlQueues.delegatedURL.remove(entry.hash()); // the delegated work has been done
log.logInfo("crawlReceipt: RECEIVED RECEIPT from " + otherPeerName + " for URL " + entry.hash() + ":" + metadata.url().toNormalform(false, true)); log.logInfo("crawlReceipt: RECEIVED RECEIPT from " + otherPeerName + " for URL " + entry.hash() + ":" + metadata.url().toNormalform(false, true));

@ -37,8 +37,8 @@ import net.yacy.document.parser.xml.RSSFeed;
import net.yacy.kelondro.data.word.WordReferenceRow; import net.yacy.kelondro.data.word.WordReferenceRow;
import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.FileUtils;
import net.yacy.repository.Blacklist;
import de.anomic.data.Blacklist;
import de.anomic.http.server.RequestHeader; import de.anomic.http.server.RequestHeader;
import de.anomic.search.Segments; import de.anomic.search.Segments;
import de.anomic.search.Switchboard; import de.anomic.search.Switchboard;

@ -33,8 +33,9 @@ import net.yacy.document.content.RSSMessage;
import net.yacy.document.parser.xml.RSSFeed; import net.yacy.document.parser.xml.RSSFeed;
import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.util.DateFormatter; import net.yacy.kelondro.util.DateFormatter;
import net.yacy.repository.Blacklist;
import de.anomic.data.Blacklist; import de.anomic.crawler.retrieval.EventOrigin;
import de.anomic.http.server.RequestHeader; import de.anomic.http.server.RequestHeader;
import de.anomic.search.Segments; import de.anomic.search.Segments;
import de.anomic.search.Switchboard; import de.anomic.search.Switchboard;
@ -141,7 +142,7 @@ public final class transferURL {
yacyCore.log.logInfo("Accepting URL " + i + "/" + urlc + " from peer " + otherPeerName + ": " + lEntry.metadata().url().toNormalform(true, false)); yacyCore.log.logInfo("Accepting URL " + i + "/" + urlc + " from peer " + otherPeerName + ": " + lEntry.metadata().url().toNormalform(true, false));
try { try {
sb.indexSegments.urlMetadata(Segments.Process.DHTIN).store(lEntry); sb.indexSegments.urlMetadata(Segments.Process.DHTIN).store(lEntry);
sb.crawlResults.stack(lEntry, iam, iam, 3); sb.crawlResults.stack(lEntry, iam, iam, EventOrigin.DHT_TRANSFER);
if (yacyCore.log.isFine()) yacyCore.log.logFine("transferURL: received URL '" + metadata.url().toNormalform(false, true) + "' from peer " + otherPeerName); if (yacyCore.log.isFine()) yacyCore.log.logFine("transferURL: received URL '" + metadata.url().toNormalform(false, true) + "' from peer " + otherPeerName);
received++; received++;
} catch (final IOException e) { } catch (final IOException e) {

@ -36,9 +36,9 @@ import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.Domains; import net.yacy.kelondro.util.Domains;
import net.yacy.kelondro.workflow.WorkflowProcessor; import net.yacy.kelondro.workflow.WorkflowProcessor;
import net.yacy.repository.Blacklist;
import de.anomic.crawler.retrieval.Request; import de.anomic.crawler.retrieval.Request;
import de.anomic.data.Blacklist;
import de.anomic.search.Segment; import de.anomic.search.Segment;
import de.anomic.search.Switchboard; import de.anomic.search.Switchboard;
import de.anomic.yacy.yacySeedDB; import de.anomic.yacy.yacySeedDB;

@ -35,9 +35,11 @@ package de.anomic.crawler;
import java.net.MalformedURLException; import java.net.MalformedURLException;
import java.util.Date; import java.util.Date;
import java.util.HashMap;
import java.util.Iterator; import java.util.Iterator;
import java.util.LinkedList; import java.util.LinkedList;
import java.util.List; import java.util.List;
import java.util.Map;
import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.meta.URIMetadataRow;
@ -46,6 +48,7 @@ import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Bitfield; import net.yacy.kelondro.order.Bitfield;
import net.yacy.kelondro.util.ScoreCluster; import net.yacy.kelondro.util.ScoreCluster;
import de.anomic.crawler.retrieval.EventOrigin;
import de.anomic.yacy.yacySeedDB; import de.anomic.yacy.yacySeedDB;
public final class ResultURLs { public final class ResultURLs {
@ -53,38 +56,20 @@ public final class ResultURLs {
// result stacks; // result stacks;
// these have all entries of form // these have all entries of form
// strings: urlHash + initiatorHash + ExecutorHash // strings: urlHash + initiatorHash + ExecutorHash
private final LinkedList<String> externResultStack; // 1 - remote index: retrieved by other peer private final Map<EventOrigin, LinkedList<String>> resultStacks;
private final LinkedList<String> searchResultStack; // 2 - partly remote/local index: result of search queries private final Map<EventOrigin, ScoreCluster<String>> resultDomains;
private final LinkedList<String> transfResultStack; // 3 - partly remote/local index: result of index transfer
private final LinkedList<String> proxyResultStack; // 4 - local index: result of proxy fetch/prefetch
private final LinkedList<String> lcrawlResultStack; // 5 - local index: result of local crawling
private final LinkedList<String> gcrawlResultStack; // 6 - local index: triggered external
private final ScoreCluster<String> externResultDomains;
private final ScoreCluster<String> searchResultDomains;
private final ScoreCluster<String> transfResultDomains;
private final ScoreCluster<String> proxyResultDomains;
private final ScoreCluster<String> lcrawlResultDomains;
private final ScoreCluster<String> gcrawlResultDomains;
public ResultURLs() { public ResultURLs() {
// init result stacks // init result stacks
externResultStack = new LinkedList<String>(); resultStacks = new HashMap<EventOrigin, LinkedList<String>>();
searchResultStack = new LinkedList<String>(); resultDomains = new HashMap<EventOrigin, ScoreCluster<String>>();
transfResultStack = new LinkedList<String>(); for (EventOrigin origin: EventOrigin.values()) {
proxyResultStack = new LinkedList<String>(); resultStacks.put(origin, new LinkedList<String>());
lcrawlResultStack = new LinkedList<String>(); resultDomains.put(origin, new ScoreCluster<String>());
gcrawlResultStack = new LinkedList<String>(); }
// init result domain statistics
externResultDomains = new ScoreCluster<String>();
searchResultDomains = new ScoreCluster<String>();
transfResultDomains = new ScoreCluster<String>();
proxyResultDomains = new ScoreCluster<String>();
lcrawlResultDomains = new ScoreCluster<String>();
gcrawlResultDomains = new ScoreCluster<String>();
} }
public synchronized void stack(final URIMetadataRow e, final String initiatorHash, final String executorHash, final int stackType) { public synchronized void stack(final URIMetadataRow e, final String initiatorHash, final String executorHash, final EventOrigin stackType) {
assert initiatorHash != null; assert initiatorHash != null;
assert executorHash != null; assert executorHash != null;
if (e == null) { return; } if (e == null) { return; }
@ -108,27 +93,27 @@ public final class ResultURLs {
} }
} }
public synchronized int getStackSize(final int stack) { public synchronized int getStackSize(final EventOrigin stack) {
final List<String> resultStack = getStack(stack); final List<String> resultStack = getStack(stack);
if (resultStack == null) return 0; if (resultStack == null) return 0;
return resultStack.size(); return resultStack.size();
} }
public synchronized int getDomainListSize(final int stack) { public synchronized int getDomainListSize(final EventOrigin stack) {
final ScoreCluster<String> domains = getDomains(stack); final ScoreCluster<String> domains = getDomains(stack);
if (domains == null) return 0; if (domains == null) return 0;
return domains.size(); return domains.size();
} }
public synchronized String getUrlHash(final int stack, final int pos) { public synchronized String getUrlHash(final EventOrigin stack, final int pos) {
return getHashNo(stack, pos, 0); return getHashNo(stack, pos, 0);
} }
public synchronized String getInitiatorHash(final int stack, final int pos) { public synchronized String getInitiatorHash(final EventOrigin stack, final int pos) {
return getHashNo(stack, pos, 1); return getHashNo(stack, pos, 1);
} }
public synchronized String getExecutorHash(final int stack, final int pos) { public synchronized String getExecutorHash(final EventOrigin stack, final int pos) {
return getHashNo(stack, pos, 2); return getHashNo(stack, pos, 2);
} }
@ -150,7 +135,7 @@ public final class ResultURLs {
* @param index starting at 0 * @param index starting at 0
* @return * @return
*/ */
public synchronized String getHashNo(final int stack, final int pos, final int index) { public synchronized String getHashNo(final EventOrigin stack, final int pos, final int index) {
final String result = getResultStackAt(stack, pos); final String result = getResultStackAt(stack, pos);
if(result != null) { if(result != null) {
if(result.length() < Word.commonHashLength * 3) { if(result.length() < Word.commonHashLength * 3) {
@ -175,7 +160,7 @@ public final class ResultURLs {
* @param pos * @param pos
* @return null if either stack or element do not exist * @return null if either stack or element do not exist
*/ */
private String getResultStackAt(final int stack, final int pos) { private String getResultStackAt(final EventOrigin stack, final int pos) {
assert pos >= 0 : "precondition violated: " + pos + " >= 0"; assert pos >= 0 : "precondition violated: " + pos + " >= 0";
final List<String> resultStack = getStack(stack); final List<String> resultStack = getStack(stack);
@ -196,12 +181,12 @@ public final class ResultURLs {
* iterate all domains in the result domain statistic * iterate all domains in the result domain statistic
* @return iterator of domains in reverse order (downwards) * @return iterator of domains in reverse order (downwards)
*/ */
public Iterator<String> domains(final int stack) { public Iterator<String> domains(final EventOrigin stack) {
assert getDomains(stack) != null : "getDomains(" + stack + ") = null"; assert getDomains(stack) != null : "getDomains(" + stack + ") = null";
return getDomains(stack).scores(false); return getDomains(stack).scores(false);
} }
public int deleteDomain(final int stack, String host, String hosthash) { public int deleteDomain(final EventOrigin stack, String host, String hosthash) {
assert hosthash.length() == 6; assert hosthash.length() == 6;
int i = 0; int i = 0;
while (i < getStackSize(stack)) { while (i < getStackSize(stack)) {
@ -218,41 +203,23 @@ public final class ResultURLs {
* @param domain name * @param domain name
* @return the number of occurrences of the domain in the stack statistics * @return the number of occurrences of the domain in the stack statistics
*/ */
public int domainCount(final int stack, String domain) { public int domainCount(final EventOrigin stack, String domain) {
assert domain != null : "domain = null"; assert domain != null : "domain = null";
assert getDomains(stack) != null : "getDomains(" + stack + ") = null"; assert getDomains(stack) != null : "getDomains(" + stack + ") = null";
return getDomains(stack).getScore(domain); return getDomains(stack).getScore(domain);
} }
/** /**
* returns the stack indentified by the id <em>stack</em> * returns the stack identified by the id <em>stack</em>
* *
* @param stack id of resultStack * @param stack id of resultStack
* @return null if stack does not exist (id is unknown or stack is null (which should not occur and an error is logged)) * @return null if stack does not exist (id is unknown or stack is null (which should not occur and an error is logged))
*/ */
private List<String> getStack(final int stack) { private List<String> getStack(final EventOrigin stack) {
switch (stack) { return resultStacks.get(stack);
case 1: return externResultStack;
case 2: return searchResultStack;
case 3: return transfResultStack;
case 4: return proxyResultStack;
case 5: return lcrawlResultStack;
case 6: return gcrawlResultStack;
default:
return null;
}
}
private ScoreCluster<String> getDomains(final int stack) {
switch (stack) {
case 1: return externResultDomains;
case 2: return searchResultDomains;
case 3: return transfResultDomains;
case 4: return proxyResultDomains;
case 5: return lcrawlResultDomains;
case 6: return gcrawlResultDomains;
default:
return null;
} }
private ScoreCluster<String> getDomains(final EventOrigin stack) {
return resultDomains.get(stack);
} }
/** /**
@ -261,11 +228,11 @@ public final class ResultURLs {
* @param stack * @param stack
* @return * @return
*/ */
private boolean isValidStack(final int stack) { private boolean isValidStack(final EventOrigin stack) {
return getStack(stack) != null; return getStack(stack) != null;
} }
public synchronized boolean removeStack(final int stack, final int pos) { public synchronized boolean removeStack(final EventOrigin stack, final int pos) {
final List<String> resultStack = getStack(stack); final List<String> resultStack = getStack(stack);
if (resultStack == null) { if (resultStack == null) {
return false; return false;
@ -273,7 +240,7 @@ public final class ResultURLs {
return resultStack.remove(pos) != null; return resultStack.remove(pos) != null;
} }
public synchronized void clearStack(final int stack) { public synchronized void clearStack(final EventOrigin stack) {
final List<String> resultStack = getStack(stack); final List<String> resultStack = getStack(stack);
if (resultStack != null) resultStack.clear(); if (resultStack != null) resultStack.clear();
final ScoreCluster<String> resultDomains = getDomains(stack); final ScoreCluster<String> resultDomains = getDomains(stack);
@ -287,11 +254,11 @@ public final class ResultURLs {
public synchronized boolean remove(final String urlHash) { public synchronized boolean remove(final String urlHash) {
if (urlHash == null) return false; if (urlHash == null) return false;
String hash; String hash;
for (int stack = 1; stack <= 6; stack++) { for (EventOrigin origin: EventOrigin.values()) {
for (int i = getStackSize(stack) - 1; i >= 0; i--) { for (int i = getStackSize(origin) - 1; i >= 0; i--) {
hash = getUrlHash(stack, i); hash = getUrlHash(origin, i);
if (hash != null && hash.equals(urlHash)) { if (hash != null && hash.equals(urlHash)) {
removeStack(stack, i); removeStack(origin, i);
return true; return true;
} }
} }
@ -308,7 +275,7 @@ public final class ResultURLs {
try { try {
final DigestURI url = new DigestURI("http", "www.yacy.net", 80, "/"); final DigestURI url = new DigestURI("http", "www.yacy.net", 80, "/");
final URIMetadataRow urlRef = new URIMetadataRow(url, "YaCy Homepage", "", "", "", new Date(), new Date(), new Date(), "", new byte[] {}, 123, 42, '?', new Bitfield(), "de", 0, 0, 0, 0, 0, 0); final URIMetadataRow urlRef = new URIMetadataRow(url, "YaCy Homepage", "", "", "", new Date(), new Date(), new Date(), "", new byte[] {}, 123, 42, '?', new Bitfield(), "de", 0, 0, 0, 0, 0, 0);
int stackNo = 1; EventOrigin stackNo = EventOrigin.LOCAL_CRAWLING;
System.out.println("valid test:\n======="); System.out.println("valid test:\n=======");
// add // add
results.stack(urlRef, urlRef.hash(), url.hash(), stackNo); results.stack(urlRef, urlRef.hash(), url.hash(), stackNo);
@ -324,29 +291,6 @@ public final class ResultURLs {
System.out.println("url hash:\t"+ results.getUrlHash(stackNo, 1)); System.out.println("url hash:\t"+ results.getUrlHash(stackNo, 1));
System.out.println("executor hash:\t"+ results.getExecutorHash(stackNo, 1)); System.out.println("executor hash:\t"+ results.getExecutorHash(stackNo, 1));
System.out.println("initiator hash:\t"+ results.getInitiatorHash(stackNo, 1)); System.out.println("initiator hash:\t"+ results.getInitiatorHash(stackNo, 1));
stackNo = 42;
System.out.println("size of stack:\t"+ results.getStackSize(stackNo));
// get
System.out.println("url hash:\t"+ results.getUrlHash(stackNo, 0));
System.out.println("executor hash:\t"+ results.getExecutorHash(stackNo, 0));
System.out.println("initiator hash:\t"+ results.getInitiatorHash(stackNo, 0));
// benchmark
final long start = System.currentTimeMillis();
for(int i = 0; i < 1000000; i++) {
stackNo = i % 6;
// add
results.stack(urlRef, urlRef.hash(), url.hash(), stackNo);
// size
results.getStackSize(stackNo);
// get
for(int j = 0; j < 10; j++) {
results.getUrlHash(stackNo, i / 6);
results.getExecutorHash(stackNo, i / 6);
results.getInitiatorHash(stackNo, i / 6);
}
}
System.out.println("benschmark: "+ (System.currentTimeMillis() - start) + " ms");
} catch (final MalformedURLException e) { } catch (final MalformedURLException e) {
e.printStackTrace(); e.printStackTrace();
} }

@ -0,0 +1,34 @@
package de.anomic.crawler.retrieval;
/**
 * Identifies where a crawl/index event originated. Each origin carries a
 * stable numeric code (used by legacy int-based APIs and persisted data);
 * the codes equal the declaration ordinals, so int-to-enum lookup is a
 * direct array index.
 */
public enum EventOrigin {

    // we must distinguish the following cases: resource-load was initiated by
    // 1) global crawling: the index is extern, not here (not possible here)
    // 2) result of search queries, some indexes are here (not possible here)
    // 3) result of index transfer, some of them are here (not possible here)
    // 4) proxy-load (initiator is "------------")
    // 5) local prefetch/crawling (initiator is own seedHash)
    // 6) local fetching for global crawling (other known or unknown initiator)
    UNKNOWN(0),
    REMOTE_RECEIPTS(1),
    QUERIES(2),
    DHT_TRANSFER(3),
    PROXY_LOAD(4),
    LOCAL_CRAWLING(5),
    GLOBAL_CRAWLING(6);

    // numeric code for legacy int-based call sites; equals ordinal()
    protected final int code;

    // cached values() array: avoids both the per-call clone of values() and
    // the previous hand-maintained constant list, which could drift out of
    // sync if a constant were added or reordered
    private static final EventOrigin[] list = values();

    private EventOrigin(final int code) {
        this.code = code;
    }

    /**
     * @return the stable numeric code of this origin
     */
    public int getCode() {
        return this.code;
    }

    /**
     * Resolve a numeric code back to its enum constant.
     * @param key the numeric code (0..6)
     * @return the matching origin
     * @throws ArrayIndexOutOfBoundsException if key is out of range
     *         (same behavior as the original lookup table)
     */
    public static final EventOrigin getEvent(final int key) {
        return list[key];
    }
}

@ -31,9 +31,9 @@ import java.util.Date;
import net.yacy.document.TextParser; import net.yacy.document.TextParser;
import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.logging.Log;
import net.yacy.repository.Blacklist;
import de.anomic.crawler.Latency; import de.anomic.crawler.Latency;
import de.anomic.data.Blacklist;
import de.anomic.http.client.Client; import de.anomic.http.client.Client;
import de.anomic.http.server.HeaderFramework; import de.anomic.http.server.HeaderFramework;
import de.anomic.http.server.RequestHeader; import de.anomic.http.server.RequestHeader;

@ -37,7 +37,6 @@ import de.anomic.crawler.CrawlProfile;
import de.anomic.http.server.HeaderFramework; import de.anomic.http.server.HeaderFramework;
import de.anomic.http.server.RequestHeader; import de.anomic.http.server.RequestHeader;
import de.anomic.http.server.ResponseHeader; import de.anomic.http.server.ResponseHeader;
import de.anomic.search.SwitchboardConstants;
public class Response { public class Response {
@ -770,7 +769,7 @@ public class Response {
(requestHeader.get(HeaderFramework.X_YACY_INDEX_CONTROL)).toUpperCase().equals("NO-INDEX"); (requestHeader.get(HeaderFramework.X_YACY_INDEX_CONTROL)).toUpperCase().equals("NO-INDEX");
} }
public int processCase(String mySeedHash) { public EventOrigin processCase(String mySeedHash) {
// we must distinguish the following cases: resource-load was initiated by // we must distinguish the following cases: resource-load was initiated by
// 1) global crawling: the index is extern, not here (not possible here) // 1) global crawling: the index is extern, not here (not possible here)
// 2) result of search queries, some indexes are here (not possible here) // 2) result of search queries, some indexes are here (not possible here)
@ -778,17 +777,17 @@ public class Response {
// 4) proxy-load (initiator is "------------") // 4) proxy-load (initiator is "------------")
// 5) local prefetch/crawling (initiator is own seedHash) // 5) local prefetch/crawling (initiator is own seedHash)
// 6) local fetching for global crawling (other known or unknown initiator) // 6) local fetching for global crawling (other known or unknown initiator)
int processCase = SwitchboardConstants.PROCESSCASE_0_UNKNOWN; EventOrigin processCase = EventOrigin.UNKNOWN;
// FIXME the equals seems to be incorrect: String.equals(boolean) // FIXME the equals seems to be incorrect: String.equals(boolean)
if ((initiator() == null) || initiator().length() == 0 || initiator().equals("------------")) { if ((initiator() == null) || initiator().length() == 0 || initiator().equals("------------")) {
// proxy-load // proxy-load
processCase = SwitchboardConstants.PROCESSCASE_4_PROXY_LOAD; processCase = EventOrigin.PROXY_LOAD;
} else if (initiator().equals(mySeedHash)) { } else if (initiator().equals(mySeedHash)) {
// normal crawling // normal crawling
processCase = SwitchboardConstants.PROCESSCASE_5_LOCAL_CRAWLING; processCase = EventOrigin.LOCAL_CRAWLING;
} else { } else {
// this was done for remote peer (a global crawl) // this was done for remote peer (a global crawl)
processCase = SwitchboardConstants.PROCESSCASE_6_GLOBAL_CRAWLING; processCase = EventOrigin.GLOBAL_CRAWLING;
} }
return processCase; return processCase;
} }

@ -1,102 +0,0 @@
// Blacklist.java
// (C) 2008 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 26.03.2008 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.data;
import java.io.File;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Map;
import net.yacy.kelondro.data.meta.DigestURI;
/**
 * Contract for a URL blacklist engine: holds per-type (dht, crawler, proxy,
 * search, surftips, news) lists of host/path patterns, supports loading them
 * from files, and answers whether a given host/path or URL is blocked.
 */
public interface Blacklist {

    // identifiers of the supported blacklist types; used as keys when
    // loading, adding, removing and querying entries
    public static final String BLACKLIST_DHT = "dht";
    public static final String BLACKLIST_CRAWLER = "crawler";
    public static final String BLACKLIST_PROXY = "proxy";
    public static final String BLACKLIST_SEARCH = "search";
    public static final String BLACKLIST_SURFTIPS = "surftips";
    public static final String BLACKLIST_NEWS = "news";

    /**
     * Pairs a comma-separated list of blacklist file names with the
     * blacklist type they belong to.
     */
    public static final class blacklistFile {

        private final String filename;
        private final String type;

        public blacklistFile(final String filename, final String type) {
            this.filename = filename;
            this.type = type;
        }

        /** @return the raw (possibly comma-separated) file name string */
        public String getFileName() { return this.filename; }


        /**
         * Construct a unified array of file names from comma separated file name
         * list.
         *
         * @return unified String array of file names
         */
        public String[] getFileNamesUnified() {
            // HashSet removes duplicate entries from the split list
            final HashSet<String> hs = new HashSet<String>(Arrays.asList(this.filename.split(",")));

            return hs.toArray(new String[hs.size()]);
        }

        /** @return the blacklist type this file belongs to */
        public String getType() { return this.type; }
    }

    /** @return a human-readable description of the engine implementation */
    public String getEngineInfo();

    /** Set the base directory under which blacklist files are resolved. */
    public void setRootPath(File rootPath);

    /** @return number of entries in the hash-lookup cache */
    public int blacklistCacheSize();

    /** @return total number of blacklist entries over all types */
    public int size();

    /** Remove all entries from all blacklist types. */
    public void clear();

    /** Remove every path entry stored for the given host in the given type. */
    public void removeAll(String blacklistType, String host);

    /** Remove a single host/path entry from the given type. */
    public void remove(String blacklistType, String host, String path);

    /** Add a host/path entry to the given type. */
    public void add(String blacklistType, String host, String path);

    /** Load entries for one type from a comma-separated file name list. */
    public void loadList(String blacklistType, String filenames, String sep);

    /** Load entries from a set of typed blacklist files. */
    public void loadList(blacklistFile[] blFiles, String sep);

    /** @return true if the exact host/path entry exists in the given type */
    public boolean contains(String blacklistType, String host, String path);

    /** @return true if the URL hash is in the blacklisted-hash cache */
    public boolean hashInBlacklistedCache(String blacklistType, String urlHash);

    /** @return true if the URL matches any entry of the given type */
    public boolean isListed(String blacklistType, DigestURI url);

    /** @return true if the lowercase host plus path matches any entry */
    public boolean isListed(String blacklistType, String hostlow, String path);

    /**
     * Validate a candidate blacklist entry.
     * @return 0 if valid, otherwise an implementation-defined error code
     */
    public int checkError(String entry, Map<String, String> properties);
}

@ -1,194 +0,0 @@
// indexDefaultReference.java
// (C) 2005 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 11.07.2005 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.data;
import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
/**
 * Default blacklist engine: matches a host/path pair against stored entries,
 * supporting full-domain entries, sub-domain wildcards ("*.host", "host.*")
 * and free regular-expression entries.
 */
public class DefaultBlacklist extends AbstractBlacklist implements Blacklist {

    public DefaultBlacklist(final File rootPath) {
        super(rootPath);
    }

    public String getEngineInfo() {
        return "Default YaCy Blacklist Engine";
    }

    /**
     * Check whether the given lowercase host and path are blocked by any
     * entry of the given blacklist type.
     *
     * @param blacklistType one of the BLACKLIST_* type identifiers
     * @param hostlow host name, expected in lowercase
     * @param path URL path; a single leading '/' is stripped before matching
     * @return true if any entry matches
     * @throws NullPointerException if hostlow or path is null
     */
    public boolean isListed(final String blacklistType, final String hostlow, String path) {
        if (hostlow == null) throw new NullPointerException();
        if (path == null) throw new NullPointerException();

        // getting the proper blacklist (entries whose paths can be matched directly)
        final HashMap<String, ArrayList<String>> blacklistMapMatched = super.getBlacklistMap(blacklistType, true);

        // stored path patterns have no leading slash, so strip exactly one
        if (path.length() > 0 && path.charAt(0) == '/') path = path.substring(1);

        // try to match the complete domain first
        boolean matched = matchesPathPattern(blacklistMapMatched.get(hostlow), path);

        // then try domains with a trailing wildcard: "prefix.*" and each prefix
        // [TL] while "." are found within the string
        int index = 0;
        while (!matched && (index = hostlow.indexOf('.', index + 1)) != -1) {
            matched = matchesPathPattern(blacklistMapMatched.get(hostlow.substring(0, index + 1) + "*"), path)
                   || matchesPathPattern(blacklistMapMatched.get(hostlow.substring(0, index)), path);
        }

        // then domains with a leading wildcard: "*.suffix" and each suffix
        index = hostlow.length();
        while (!matched && (index = hostlow.lastIndexOf('.', index - 1)) != -1) {
            matched = matchesPathPattern(blacklistMapMatched.get("*" + hostlow.substring(index, hostlow.length())), path)
                   || matchesPathPattern(blacklistMapMatched.get(hostlow.substring(index + 1, hostlow.length())), path);
        }

        // finally loop over all regex entries (host part is itself a regex)
        if (!matched) {
            final HashMap<String, ArrayList<String>> blacklistMapNotMatched = super.getBlacklistMap(blacklistType, false);
            for (final Entry<String, ArrayList<String>> entry : blacklistMapNotMatched.entrySet()) {
                try {
                    if (Pattern.matches(entry.getKey(), hostlow)) {
                        final ArrayList<String> pathPatterns = entry.getValue();
                        for (int i = 0; i < pathPatterns.size(); i++) {
                            if (Pattern.matches(pathPatterns.get(i), path)) return true;
                        }
                    }
                } catch (final PatternSyntaxException e) {
                    // broken user-supplied pattern: skip this entry silently
                    // (same best-effort behavior as before)
                }
            }
        }
        return matched;
    }

    /**
     * Test the path against a list of path patterns; "*" matches everything.
     * Iterates in reverse order, as the original inline loops did.
     *
     * @param patterns list of path regex patterns, may be null
     * @param path the path to test
     * @return true if any pattern matches
     */
    private static boolean matchesPathPattern(final ArrayList<String> patterns, final String path) {
        if (patterns == null) return false;
        for (int i = patterns.size() - 1; i > -1; i--) {
            final String pp = patterns.get(i); // path-pattern
            if (pp.equals("*") || path.matches(pp)) return true;
        }
        return false;
    }

    /**
     * Validate a candidate blacklist entry of the form "host" or "host/path".
     *
     * @param element the entry to validate
     * @param properties optional settings; key "allowRegex" ("true"/"false")
     *        controls whether the host part may be a free regex (default true)
     * @return 0 if valid, otherwise one of the ERR_* codes from AbstractBlacklist
     */
    public int checkError(String element, Map<String, String> properties) {

        boolean allowRegex = true;
        int slashPos;
        String host, path;

        if (properties != null) {
            // defensive: a missing "allowRegex" key previously caused a
            // NullPointerException; absent now means "keep the default"
            final String allowRegexStr = properties.get("allowRegex");
            if (allowRegexStr != null) allowRegex = allowRegexStr.equalsIgnoreCase("true");
        }

        if ((slashPos = element.indexOf("/")) == -1) {
            host = element;
            path = ".*"; // no path given: block every path on the host
        } else {
            host = element.substring(0, slashPos);
            path = element.substring(slashPos + 1);
        }

        if (!allowRegex || !isValidRegex(host)) {
            final int i = host.indexOf("*");

            // check whether host begins illegally
            if (!host.matches("([A-Za-z0-9_-]+|\\*)(\\.([A-Za-z0-9_-]+|\\*))*")) {
                if (i == 0 && host.length() > 1 && host.charAt(1) != '.') {
                    return ERR_SUBDOMAIN_XOR_WILDCARD;
                }
                return ERR_HOST_WRONG_CHARS;
            }

            // in host-part only full sub-domains may be wildcards
            if (host.length() > 0 && i > -1) {
                if (!(i == 0 || i == host.length() - 1)) {
                    return ERR_WILDCARD_BEGIN_OR_END;
                }

                if (i == host.length() - 1 && host.length() > 1 && host.charAt(i - 1) != '.') {
                    return ERR_SUBDOMAIN_XOR_WILDCARD;
                }
            }

            // check for double-occurrences of "*" in host
            if (host.indexOf("*", i + 1) > -1) {
                return ERR_TWO_WILDCARDS_IN_HOST;
            }
        }
        // NOTE(review): the original code had an
        // "else if (allowRegex && !isValidRegex(host)) return ERR_HOST_REGEX;"
        // branch here, which is unreachable: the if-condition above fails only
        // when allowRegex is true AND isValidRegex(host) is true. It has been
        // removed; ERR_HOST_REGEX is therefore never produced (as before).
        // Possibly the first condition was meant to be just "!allowRegex" —
        // TODO confirm intended semantics before changing it.

        // check for errors on regex-compiling path
        if (!isValidRegex(path) && !path.equals("*")) {
            return ERR_PATH_REGEX;
        }

        return 0;
    }

    /**
     * Checks if a given expression is a valid regular expression.
     * @param expression The expression to be checked.
     * @return True if the expression is a valid regular expression, else false.
     */
    private static boolean isValidRegex(String expression) {
        boolean ret = true;
        try {
            Pattern.compile(expression);
        } catch (final PatternSyntaxException e) {
            ret = false;
        }
        return ret;
    }
}

@ -42,7 +42,10 @@ import java.util.List;
import java.util.Set; import java.util.Set;
import java.util.Vector; import java.util.Vector;
import de.anomic.data.Blacklist.blacklistFile; import net.yacy.repository.Blacklist;
import net.yacy.repository.BlacklistFile;
import de.anomic.search.SearchEventCache;
import de.anomic.search.Switchboard; import de.anomic.search.Switchboard;
import de.anomic.server.serverCore; import de.anomic.server.serverCore;
@ -391,12 +394,12 @@ public class listManager {
* Load or reload all active Blacklists * Load or reload all active Blacklists
*/ */
public static void reloadBlacklists(){ public static void reloadBlacklists(){
final String supportedBlacklistTypesStr = AbstractBlacklist.BLACKLIST_TYPES_STRING; final String supportedBlacklistTypesStr = Blacklist.BLACKLIST_TYPES_STRING;
final String[] supportedBlacklistTypes = supportedBlacklistTypesStr.split(","); final String[] supportedBlacklistTypes = supportedBlacklistTypesStr.split(",");
final ArrayList<blacklistFile> blacklistFiles = new ArrayList<blacklistFile>(supportedBlacklistTypes.length); final ArrayList<BlacklistFile> blacklistFiles = new ArrayList<BlacklistFile>(supportedBlacklistTypes.length);
for (int i=0; i < supportedBlacklistTypes.length; i++) { for (int i=0; i < supportedBlacklistTypes.length; i++) {
final blacklistFile blFile = new blacklistFile( final BlacklistFile blFile = new BlacklistFile(
switchboard.getConfig( switchboard.getConfig(
supportedBlacklistTypes[i] + ".BlackLists", switchboard.getConfig("BlackLists.DefaultList", "url.default.black")), supportedBlacklistTypes[i] + ".BlackLists", switchboard.getConfig("BlackLists.DefaultList", "url.default.black")),
supportedBlacklistTypes[i]); supportedBlacklistTypes[i]);
@ -405,8 +408,9 @@ public class listManager {
Switchboard.urlBlacklist.clear(); Switchboard.urlBlacklist.clear();
Switchboard.urlBlacklist.loadList( Switchboard.urlBlacklist.loadList(
blacklistFiles.toArray(new blacklistFile[blacklistFiles.size()]), blacklistFiles.toArray(new BlacklistFile[blacklistFiles.size()]),
"/"); "/");
SearchEventCache.cleanupEvents(true);
// switchboard.urlBlacklist.clear(); // switchboard.urlBlacklist.clear();
// if (f != "") switchboard.urlBlacklist.loadLists("black", f, "/"); // if (f != "") switchboard.urlBlacklist.loadLists("black", f, "/");

@ -80,11 +80,11 @@ import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.DateFormatter; import net.yacy.kelondro.util.DateFormatter;
import net.yacy.kelondro.util.Domains; import net.yacy.kelondro.util.Domains;
import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.FileUtils;
import net.yacy.repository.Blacklist;
import de.anomic.crawler.retrieval.HTTPLoader; import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.crawler.retrieval.Request; import de.anomic.crawler.retrieval.Request;
import de.anomic.crawler.retrieval.Response; import de.anomic.crawler.retrieval.Response;
import de.anomic.data.Blacklist;
import de.anomic.http.client.MultiOutputStream; import de.anomic.http.client.MultiOutputStream;
import de.anomic.http.client.Client; import de.anomic.http.client.Client;
import de.anomic.http.client.RemoteProxyConfig; import de.anomic.http.client.RemoteProxyConfig;

@ -9,8 +9,9 @@ import net.yacy.document.content.RSSMessage;
import net.yacy.document.parser.xml.RSSFeed; import net.yacy.document.parser.xml.RSSFeed;
import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.util.DateFormatter; import net.yacy.kelondro.util.DateFormatter;
import net.yacy.repository.Blacklist;
import de.anomic.data.Blacklist; import de.anomic.crawler.retrieval.EventOrigin;
import de.anomic.http.server.RequestHeader; import de.anomic.http.server.RequestHeader;
import de.anomic.search.Segments; import de.anomic.search.Segments;
import de.anomic.search.Switchboard; import de.anomic.search.Switchboard;
@ -141,7 +142,7 @@ public final class transferURL {
yacyCore.log.logInfo("Accepting URL " + i + "/" + urlc + " from peer " + otherPeerName + ": " + lEntry.metadata().url().toNormalform(true, false)); yacyCore.log.logInfo("Accepting URL " + i + "/" + urlc + " from peer " + otherPeerName + ": " + lEntry.metadata().url().toNormalform(true, false));
try { try {
sb.indexSegments.urlMetadata(Segments.Process.DHTIN).store(lEntry); sb.indexSegments.urlMetadata(Segments.Process.DHTIN).store(lEntry);
sb.crawlResults.stack(lEntry, iam, iam, 3); sb.crawlResults.stack(lEntry, iam, iam, EventOrigin.DHT_TRANSFER);
if (yacyCore.log.isFine()) yacyCore.log.logFine("transferURL: received URL '" + metadata.url().toNormalform(false, true) + "' from peer " + otherPeerName); if (yacyCore.log.isFine()) yacyCore.log.logFine("transferURL: received URL '" + metadata.url().toNormalform(false, true) + "' from peer " + otherPeerName);
received++; received++;
} catch (final IOException e) { } catch (final IOException e) {

@ -50,8 +50,8 @@ import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.CloneableIterator; import net.yacy.kelondro.order.CloneableIterator;
import net.yacy.kelondro.table.SplitTable; import net.yacy.kelondro.table.SplitTable;
import net.yacy.kelondro.util.ScoreCluster; import net.yacy.kelondro.util.ScoreCluster;
import net.yacy.repository.Blacklist;
import de.anomic.data.Blacklist;
import de.anomic.http.client.Client; import de.anomic.http.client.Client;
import de.anomic.http.client.RemoteProxyConfig; import de.anomic.http.client.RemoteProxyConfig;
import de.anomic.http.server.ResponseContainer; import de.anomic.http.server.ResponseContainer;

@ -52,9 +52,9 @@ import net.yacy.kelondro.rwi.IndexCell;
import net.yacy.kelondro.rwi.ReferenceContainer; import net.yacy.kelondro.rwi.ReferenceContainer;
import net.yacy.kelondro.rwi.ReferenceFactory; import net.yacy.kelondro.rwi.ReferenceFactory;
import net.yacy.kelondro.util.ISO639; import net.yacy.kelondro.util.ISO639;
import net.yacy.repository.Blacklist;
import de.anomic.crawler.retrieval.Response; import de.anomic.crawler.retrieval.Response;
import de.anomic.data.Blacklist;
public class Segment { public class Segment {

@ -139,6 +139,7 @@ import net.yacy.kelondro.workflow.InstantBusyThread;
import net.yacy.kelondro.workflow.WorkflowJob; import net.yacy.kelondro.workflow.WorkflowJob;
import net.yacy.kelondro.workflow.WorkflowProcessor; import net.yacy.kelondro.workflow.WorkflowProcessor;
import net.yacy.kelondro.workflow.WorkflowThread; import net.yacy.kelondro.workflow.WorkflowThread;
import net.yacy.repository.Blacklist;
import net.yacy.repository.LoaderDispatcher; import net.yacy.repository.LoaderDispatcher;
import de.anomic.crawler.CrawlProfile; import de.anomic.crawler.CrawlProfile;
@ -153,11 +154,10 @@ import de.anomic.crawler.ResultURLs;
import de.anomic.crawler.RobotsTxt; import de.anomic.crawler.RobotsTxt;
import de.anomic.crawler.ZURL; import de.anomic.crawler.ZURL;
import de.anomic.crawler.CrawlProfile.entry; import de.anomic.crawler.CrawlProfile.entry;
import de.anomic.crawler.retrieval.EventOrigin;
import de.anomic.crawler.retrieval.HTTPLoader; import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.crawler.retrieval.Request; import de.anomic.crawler.retrieval.Request;
import de.anomic.crawler.retrieval.Response; import de.anomic.crawler.retrieval.Response;
import de.anomic.data.Blacklist;
import de.anomic.data.DefaultBlacklist;
import de.anomic.data.LibraryProvider; import de.anomic.data.LibraryProvider;
import de.anomic.data.URLLicense; import de.anomic.data.URLLicense;
import de.anomic.data.blogBoard; import de.anomic.data.blogBoard;
@ -429,7 +429,7 @@ public final class Switchboard extends serverSwitch {
// load blacklist // load blacklist
this.log.logConfig("Loading blacklist ..."); this.log.logConfig("Loading blacklist ...");
final File blacklistsPath = getConfigPath(SwitchboardConstants.LISTS_PATH, SwitchboardConstants.LISTS_PATH_DEFAULT); final File blacklistsPath = getConfigPath(SwitchboardConstants.LISTS_PATH, SwitchboardConstants.LISTS_PATH_DEFAULT);
urlBlacklist = new DefaultBlacklist(blacklistsPath); urlBlacklist = new Blacklist(blacklistsPath);
listManager.switchboard = this; listManager.switchboard = this;
listManager.listsPath = blacklistsPath; listManager.listsPath = blacklistsPath;
listManager.reloadBlacklists(); listManager.reloadBlacklists();
@ -1156,7 +1156,7 @@ public final class Switchboard extends serverSwitch {
// check if the document should be indexed based on proxy/crawler rules // check if the document should be indexed based on proxy/crawler rules
String noIndexReason = "unspecified indexing error"; String noIndexReason = "unspecified indexing error";
if (response.processCase(peers.mySeed().hash) == SwitchboardConstants.PROCESSCASE_4_PROXY_LOAD) { if (response.processCase(peers.mySeed().hash) == EventOrigin.PROXY_LOAD) {
// proxy-load // proxy-load
noIndexReason = response.shallIndexCacheForProxy(); noIndexReason = response.shallIndexCacheForProxy();
} else { } else {
@ -1329,8 +1329,8 @@ public final class Switchboard extends serverSwitch {
int c = 0; int c = 0;
if ((crawlQueues.delegatedURL.stackSize() > 1000)) c++; if ((crawlQueues.delegatedURL.stackSize() > 1000)) c++;
if ((crawlQueues.errorURL.stackSize() > 1000)) c++; if ((crawlQueues.errorURL.stackSize() > 1000)) c++;
for (int i = 1; i <= 6; i++) { for (EventOrigin origin: EventOrigin.values()) {
if (crawlResults.getStackSize(i) > 1000) c++; if (crawlResults.getStackSize(origin) > 1000) c++;
} }
return c; return c;
} }
@ -1410,11 +1410,11 @@ public final class Switchboard extends serverSwitch {
} }
// clean up loadedURL stack // clean up loadedURL stack
for (int i = 1; i <= 6; i++) { for (EventOrigin origin: EventOrigin.values()) {
checkInterruption(); checkInterruption();
if (crawlResults.getStackSize(i) > 1000) { if (crawlResults.getStackSize(origin) > 1000) {
if (this.log.isFine()) log.logFine("Cleaning Loaded-URLs report stack, " + crawlResults.getStackSize(i) + " entries on stack " + i); if (this.log.isFine()) log.logFine("Cleaning Loaded-URLs report stack, " + crawlResults.getStackSize(origin) + " entries on stack " + origin.getCode());
crawlResults.clearStack(i); crawlResults.clearStack(origin);
hasDoneSomething = true; hasDoneSomething = true;
} }
} }
@ -1585,7 +1585,7 @@ public final class Switchboard extends serverSwitch {
private Document parseDocument(Response entry) throws InterruptedException { private Document parseDocument(Response entry) throws InterruptedException {
Document document = null; Document document = null;
final int processCase = entry.processCase(peers.mySeed().hash); final EventOrigin processCase = entry.processCase(peers.mySeed().hash);
if (this.log.isFine()) log.logFine("processResourceStack processCase=" + processCase + if (this.log.isFine()) log.logFine("processResourceStack processCase=" + processCase +
", depth=" + entry.depth() + ", depth=" + entry.depth() +
@ -1635,7 +1635,7 @@ public final class Switchboard extends serverSwitch {
// put anchors on crawl stack // put anchors on crawl stack
final long stackStartTime = System.currentTimeMillis(); final long stackStartTime = System.currentTimeMillis();
if ( if (
((processCase == SwitchboardConstants.PROCESSCASE_4_PROXY_LOAD) || (processCase == SwitchboardConstants.PROCESSCASE_5_LOCAL_CRAWLING)) && ((processCase == EventOrigin.PROXY_LOAD) || (processCase == EventOrigin.LOCAL_CRAWLING)) &&
((entry.profile() == null) || (entry.depth() < entry.profile().depth())) ((entry.profile() == null) || (entry.depth() < entry.profile().depth()))
) { ) {
final Map<DigestURI, String> hl = document.getHyperlinks(); final Map<DigestURI, String> hl = document.getHyperlinks();
@ -1715,7 +1715,7 @@ public final class Switchboard extends serverSwitch {
// CREATE INDEX // CREATE INDEX
final String dc_title = document.dc_title(); final String dc_title = document.dc_title();
final DigestURI referrerURL = queueEntry.referrerURL(); final DigestURI referrerURL = queueEntry.referrerURL();
final int processCase = queueEntry.processCase(peers.mySeed().hash); final EventOrigin processCase = queueEntry.processCase(peers.mySeed().hash);
// remove stopwords // remove stopwords
log.logInfo("Excluded " + condenser.excludeWords(stopwords) + " words in URL " + queueEntry.url()); log.logInfo("Excluded " + condenser.excludeWords(stopwords) + " words in URL " + queueEntry.url());
@ -1765,7 +1765,7 @@ public final class Switchboard extends serverSwitch {
MemoryTracker.update("indexed", queueEntry.url().toNormalform(true, false), false); MemoryTracker.update("indexed", queueEntry.url().toNormalform(true, false), false);
// if this was performed for a remote crawl request, notify requester // if this was performed for a remote crawl request, notify requester
if ((processCase == SwitchboardConstants.PROCESSCASE_6_GLOBAL_CRAWLING) && (queueEntry.initiator() != null)) { if ((processCase == EventOrigin.GLOBAL_CRAWLING) && (queueEntry.initiator() != null)) {
final yacySeed initiatorPeer = peers.get(queueEntry.initiator()); final yacySeed initiatorPeer = peers.get(queueEntry.initiator());
if (initiatorPeer != null) { if (initiatorPeer != null) {
log.logInfo("Sending crawl receipt for '" + queueEntry.url().toNormalform(false, true) + "' to " + initiatorPeer.getName()); log.logInfo("Sending crawl receipt for '" + queueEntry.url().toNormalform(false, true) + "' to " + initiatorPeer.getName());
@ -1841,7 +1841,7 @@ public final class Switchboard extends serverSwitch {
final Long resourceContentLength = (Long) resource[1]; final Long resourceContentLength = (Long) resource[1];
// parse the resource // parse the resource
final Document document = LoaderDispatcher.parseDocument(metadata.url(), resourceContentLength.longValue(), resourceContent); final Document document = LoaderDispatcher.parseDocument(metadata.url(), resourceContentLength.longValue(), resourceContent, null);
// get the word set // get the word set
Set<String> words = null; Set<String> words = null;

@ -385,17 +385,6 @@ public final class SwitchboardConstants {
public static final String WORK_PATH = "workPath"; public static final String WORK_PATH = "workPath";
public static final String WORK_PATH_DEFAULT = "DATA/WORK"; public static final String WORK_PATH_DEFAULT = "DATA/WORK";
// we must distinguish the following cases: resource-load was initiated by
// 1) global crawling: the index is extern, not here (not possible here)
// 2) result of search queries, some indexes are here (not possible here)
// 3) result of index transfer, some of them are here (not possible here)
// 4) proxy-load (initiator is "------------")
// 5) local prefetch/crawling (initiator is own seedHash)
// 6) local fetching for global crawling (other known or unknown initiator)
public static final int PROCESSCASE_0_UNKNOWN = 0;
public static final int PROCESSCASE_4_PROXY_LOAD = 4;
public static final int PROCESSCASE_5_LOCAL_CRAWLING = 5;
public static final int PROCESSCASE_6_GLOBAL_CRAWLING = 6;
/* /*
* Some constants * Some constants
*/ */

@ -68,13 +68,14 @@ import net.yacy.kelondro.rwi.ReferenceContainer;
import net.yacy.kelondro.rwi.ReferenceContainerCache; import net.yacy.kelondro.rwi.ReferenceContainerCache;
import net.yacy.kelondro.util.ByteBuffer; import net.yacy.kelondro.util.ByteBuffer;
import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.FileUtils;
import net.yacy.repository.Blacklist;
import org.apache.commons.httpclient.methods.multipart.ByteArrayPartSource; import org.apache.commons.httpclient.methods.multipart.ByteArrayPartSource;
import org.apache.commons.httpclient.methods.multipart.Part; import org.apache.commons.httpclient.methods.multipart.Part;
import de.anomic.crawler.ResultURLs; import de.anomic.crawler.ResultURLs;
import de.anomic.crawler.retrieval.EventOrigin;
import de.anomic.crawler.retrieval.HTTPLoader; import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.data.Blacklist;
import de.anomic.http.client.DefaultCharsetFilePart; import de.anomic.http.client.DefaultCharsetFilePart;
import de.anomic.http.client.DefaultCharsetStringPart; import de.anomic.http.client.DefaultCharsetStringPart;
import de.anomic.http.client.Client; import de.anomic.http.client.Client;
@ -576,7 +577,7 @@ public final class yacyClient {
// passed all checks, store url // passed all checks, store url
try { try {
indexSegment.urlMetadata().store(urlEntry); indexSegment.urlMetadata().store(urlEntry);
crawlResults.stack(urlEntry, mySeed.hash, target.hash, 2); crawlResults.stack(urlEntry, mySeed.hash, target.hash, EventOrigin.QUERIES);
} catch (final IOException e) { } catch (final IOException e) {
yacyCore.log.logSevere("could not store search result", e); yacyCore.log.logSevere("could not store search result", e);
continue; // db-error continue; // db-error

@ -51,8 +51,8 @@ import java.util.Iterator;
import java.util.Map; import java.util.Map;
import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.repository.Blacklist;
import de.anomic.data.Blacklist;
import de.anomic.search.Switchboard; import de.anomic.search.Switchboard;
public class yacyNewsPool { public class yacyNewsPool {

@ -34,9 +34,9 @@ import java.util.TreeSet;
import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Bitfield; import net.yacy.kelondro.order.Bitfield;
import net.yacy.kelondro.util.ScoreCluster; import net.yacy.kelondro.util.ScoreCluster;
import net.yacy.repository.Blacklist;
import de.anomic.crawler.ResultURLs; import de.anomic.crawler.ResultURLs;
import de.anomic.data.Blacklist;
import de.anomic.search.QueryParams; import de.anomic.search.QueryParams;
import de.anomic.search.RankingProfile; import de.anomic.search.RankingProfile;
import de.anomic.search.RankingProcess; import de.anomic.search.RankingProcess;

@ -1,7 +1,6 @@
// AbstractBlacklist.java // indexDefaultReference.java
// first published on http://www.yacy.net // (C) 2005 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// (C) 2007 by Bjoern Krombholz // first published 11.07.2005 on http://yacy.net
// last major change: 12. August 2006 (theli) ?
// //
// This is a part of YaCy, a peer-to-peer based web search engine // This is a part of YaCy, a peer-to-peer based web search engine
// //
@ -25,7 +24,7 @@
// along with this program; if not, write to the Free Software // along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.data; package net.yacy.repository;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
@ -37,15 +36,23 @@ import java.util.HashSet;
import java.util.Iterator; import java.util.Iterator;
import java.util.Map; import java.util.Map;
import java.util.Set; import java.util.Set;
import java.util.Map.Entry;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException; import java.util.regex.PatternSyntaxException;
import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.util.SetTools; import net.yacy.kelondro.util.SetTools;
import de.anomic.search.SearchEventCache; public class Blacklist {
public abstract class AbstractBlacklist implements Blacklist { public static final String BLACKLIST_DHT = "dht";
public static final String BLACKLIST_CRAWLER = "crawler";
public static final String BLACKLIST_PROXY = "proxy";
public static final String BLACKLIST_SEARCH = "search";
public static final String BLACKLIST_SURFTIPS = "surftips";
public static final String BLACKLIST_NEWS = "news";
public static final int ERR_TWO_WILDCARDS_IN_HOST = 1; public static final int ERR_TWO_WILDCARDS_IN_HOST = 1;
public static final int ERR_SUBDOMAIN_XOR_WILDCARD = 2; public static final int ERR_SUBDOMAIN_XOR_WILDCARD = 2;
@ -71,7 +78,7 @@ public abstract class AbstractBlacklist implements Blacklist {
protected HashMap<String, HashMap<String, ArrayList<String>>> hostpaths_matchable = null; // key=host, value=path; mapped url is http://host/path; path does not start with '/' here protected HashMap<String, HashMap<String, ArrayList<String>>> hostpaths_matchable = null; // key=host, value=path; mapped url is http://host/path; path does not start with '/' here
protected HashMap<String, HashMap<String, ArrayList<String>>> hostpaths_notmatchable = null; // key=host, value=path; mapped url is http://host/path; path does not start with '/' here protected HashMap<String, HashMap<String, ArrayList<String>>> hostpaths_notmatchable = null; // key=host, value=path; mapped url is http://host/path; path does not start with '/' here
public AbstractBlacklist(final File rootPath) { public Blacklist(final File rootPath) {
this.setRootPath(rootPath); this.setRootPath(rootPath);
this.blacklistRootPath = rootPath; this.blacklistRootPath = rootPath;
@ -92,6 +99,8 @@ public abstract class AbstractBlacklist implements Blacklist {
} }
} }
public void setRootPath(final File rootPath) { public void setRootPath(final File rootPath) {
if (rootPath == null) if (rootPath == null)
throw new NullPointerException("The blacklist root path must not be null."); throw new NullPointerException("The blacklist root path must not be null.");
@ -127,9 +136,6 @@ public abstract class AbstractBlacklist implements Blacklist {
for(final Set<String> entry: this.cachedUrlHashs.values()) { for(final Set<String> entry: this.cachedUrlHashs.values()) {
entry.clear(); entry.clear();
} }
// clean up all search events in case that an old blacklist entry denied previously returned results, but does not anymore
SearchEventCache.cleanupEvents(true);
} }
public int size() { public int size() {
@ -147,14 +153,14 @@ public abstract class AbstractBlacklist implements Blacklist {
return size; return size;
} }
public void loadList(final blacklistFile[] blFiles, final String sep) { public void loadList(final BlacklistFile[] blFiles, final String sep) {
for (int j = 0; j < blFiles.length; j++) { for (int j = 0; j < blFiles.length; j++) {
final blacklistFile blf = blFiles[j]; final BlacklistFile blf = blFiles[j];
loadList(blf.getType(), blf.getFileName(), sep); loadList(blf.getType(), blf.getFileName(), sep);
} }
} }
public void loadList(final blacklistFile blFile, final String sep) { public void loadList(final BlacklistFile blFile, final String sep) {
final HashMap<String, ArrayList<String>> blacklistMapMatch = getBlacklistMap(blFile.getType(),true); final HashMap<String, ArrayList<String>> blacklistMapMatch = getBlacklistMap(blFile.getType(),true);
final HashMap<String, ArrayList<String>> blacklistMapNotMatch = getBlacklistMap(blFile.getType(),false); final HashMap<String, ArrayList<String>> blacklistMapNotMatch = getBlacklistMap(blFile.getType(),false);
Set<Map.Entry<String, ArrayList<String>>> loadedBlacklist; Set<Map.Entry<String, ArrayList<String>>> loadedBlacklist;
@ -191,14 +197,12 @@ public abstract class AbstractBlacklist implements Blacklist {
} }
} }
} }
// clean up all search events in case that a (new) blacklist entry denies previously returned results
SearchEventCache.cleanupEvents(true);
} }
} }
public void loadList(final String blacklistType, final String fileNames, final String sep) { public void loadList(final String blacklistType, final String fileNames, final String sep) {
// method for not breaking older plasmaURLPattern interface // method for not breaking older plasmaURLPattern interface
final blacklistFile blFile = new blacklistFile(fileNames, blacklistType); final BlacklistFile blFile = new BlacklistFile(fileNames, blacklistType);
loadList(blFile, sep); loadList(blFile, sep);
} }
@ -206,9 +210,6 @@ public abstract class AbstractBlacklist implements Blacklist {
public void removeAll(final String blacklistType, final String host) { public void removeAll(final String blacklistType, final String host) {
getBlacklistMap(blacklistType,true).remove(host); getBlacklistMap(blacklistType,true).remove(host);
getBlacklistMap(blacklistType,false).remove(host); getBlacklistMap(blacklistType,false).remove(host);
// clean up all search events in case that an old blacklist entry denied previously returned results, but does not anymore
SearchEventCache.cleanupEvents(true);
} }
public void remove(final String blacklistType, final String host, final String path) { public void remove(final String blacklistType, final String host, final String path) {
@ -222,14 +223,11 @@ public abstract class AbstractBlacklist implements Blacklist {
} }
final HashMap<String, ArrayList<String>> blacklistMapNotMatch = getBlacklistMap(blacklistType,false); final HashMap<String, ArrayList<String>> blacklistMapNotMatch = getBlacklistMap(blacklistType,false);
hostList = blacklistMapNotMatch.get(host); hostList = blacklistMapNotMatch.get(host);
if(hostList != null) { if (hostList != null) {
hostList.remove(path); hostList.remove(path);
if (hostList.size() == 0) if (hostList.size() == 0)
blacklistMapNotMatch.remove(host); blacklistMapNotMatch.remove(host);
} }
// clean up all search events in case that an old blacklist entry denied previously returned results, but does not anymore
SearchEventCache.cleanupEvents(true);
} }
public void add(final String blacklistType, String host, String path) { public void add(final String blacklistType, String host, String path) {
@ -248,9 +246,6 @@ public abstract class AbstractBlacklist implements Blacklist {
ArrayList<String> hostList = blacklistMap.get(host.toLowerCase()); ArrayList<String> hostList = blacklistMap.get(host.toLowerCase());
if (hostList == null) blacklistMap.put(host.toLowerCase(), (hostList = new ArrayList<String>())); if (hostList == null) blacklistMap.put(host.toLowerCase(), (hostList = new ArrayList<String>()));
hostList.add(path); hostList.add(path);
// clean up all search events in case that a (new) blacklist entry denies previously returned results
SearchEventCache.cleanupEvents(true);
} }
public int blacklistCacheSize() { public int blacklistCacheSize() {
@ -313,4 +308,155 @@ public abstract class AbstractBlacklist implements Blacklist {
return false; return false;
} }
public String getEngineInfo() {
return "Default YaCy Blacklist Engine";
}
public boolean isListed(final String blacklistType, final String hostlow, String path) {
if (hostlow == null) throw new NullPointerException();
if (path == null) throw new NullPointerException();
// getting the proper blacklist
final HashMap<String, ArrayList<String>> blacklistMapMatched = getBlacklistMap(blacklistType,true);
if (path.length() > 0 && path.charAt(0) == '/') path = path.substring(1);
ArrayList<String> app;
boolean matched = false;
String pp = ""; // path-pattern
// try to match complete domain
if (!matched && (app = blacklistMapMatched.get(hostlow)) != null) {
for (int i=app.size()-1; !matched && i>-1; i--) {
pp = app.get(i);
matched |= ((pp.equals("*")) || (path.matches(pp)));
}
}
// first try to match the domain with wildcard '*'
// [TL] While "." are found within the string
int index = 0;
while (!matched && (index = hostlow.indexOf('.', index + 1)) != -1) {
if ((app = blacklistMapMatched.get(hostlow.substring(0, index + 1) + "*")) != null) {
for (int i=app.size()-1; !matched && i>-1; i--) {
pp = app.get(i);
matched |= ((pp.equals("*")) || (path.matches(pp)));
}
}
if ((app = blacklistMapMatched.get(hostlow.substring(0, index))) != null) {
for (int i=app.size()-1; !matched && i>-1; i--) {
pp = app.get(i);
matched |= ((pp.equals("*")) || (path.matches(pp)));
}
}
}
index = hostlow.length();
while (!matched && (index = hostlow.lastIndexOf('.', index - 1)) != -1) {
if ((app = blacklistMapMatched.get("*" + hostlow.substring(index, hostlow.length()))) != null) {
for (int i=app.size()-1; !matched && i>-1; i--) {
pp = app.get(i);
matched |= ((pp.equals("*")) || (path.matches(pp)));
}
}
if ((app = blacklistMapMatched.get(hostlow.substring(index +1, hostlow.length()))) != null) {
for (int i=app.size()-1; !matched && i>-1; i--) {
pp = app.get(i);
matched |= ((pp.equals("*")) || (path.matches(pp)));
}
}
}
// loop over all Regexentrys
if(!matched) {
final HashMap<String, ArrayList<String>> blacklistMapNotMatched = getBlacklistMap(blacklistType,false);
String key;
for(final Entry<String, ArrayList<String>> entry: blacklistMapNotMatched.entrySet()) {
key = entry.getKey();
try {
if(Pattern.matches(key, hostlow)) {
app = entry.getValue();
for (int i=0; i<app.size(); i++) {
if(Pattern.matches(app.get(i), path))
return true;
}
}
} catch (final PatternSyntaxException e) {
//System.out.println(e.toString());
}
}
}
return matched;
}
public int checkError(String element, Map<String, String> properties) {
boolean allowRegex = true;
int slashPos;
String host, path;
if (properties != null) {
allowRegex = properties.get("allowRegex").equalsIgnoreCase("true") ? true : false;
}
if ((slashPos = element.indexOf("/")) == -1) {
host = element;
path = ".*";
} else {
host = element.substring(0, slashPos);
path = element.substring(slashPos + 1);
}
if (!allowRegex || !isValidRegex(host)) {
final int i = host.indexOf("*");
// check whether host begins illegally
if (!host.matches("([A-Za-z0-9_-]+|\\*)(\\.([A-Za-z0-9_-]+|\\*))*")) {
if (i == 0 && host.length() > 1 && host.charAt(1) != '.') {
return ERR_SUBDOMAIN_XOR_WILDCARD;
}
return ERR_HOST_WRONG_CHARS;
}
// in host-part only full sub-domains may be wildcards
if (host.length() > 0 && i > -1) {
if (!(i == 0 || i == host.length() - 1)) {
return ERR_WILDCARD_BEGIN_OR_END;
}
if (i == host.length() - 1 && host.length() > 1 && host.charAt(i - 1) != '.') {
return ERR_SUBDOMAIN_XOR_WILDCARD;
}
}
// check for double-occurences of "*" in host
if (host.indexOf("*", i + 1) > -1) {
return ERR_TWO_WILDCARDS_IN_HOST;
}
} else if (allowRegex && !isValidRegex(host)) {
return ERR_HOST_REGEX;
}
// check for errors on regex-compiling path
if (!isValidRegex(path) && !path.equals("*")) {
return ERR_PATH_REGEX;
}
return 0;
}
/**
* Checks if a given expression is a valid regular expression.
* @param expression The expression to be checked.
* @return True if the expression is a valid regular expression, else false.
*/
private static boolean isValidRegex(String expression) {
boolean ret = true;
try {
Pattern.compile(expression);
} catch (final PatternSyntaxException e) {
ret = false;
}
return ret;
}
} }

@ -0,0 +1,58 @@
// BlacklistFile.java
// (C) 2005 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 11.07.2005 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2009-09-29 23:28:49 +0200 (Di, 29 Sep 2009) $
// $LastChangedRevision: 6359 $
// $LastChangedBy: low012 $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package net.yacy.repository;
import java.util.Arrays;
import java.util.HashSet;
public class BlacklistFile {
private final String filename;
private final String type;
public BlacklistFile(final String filename, final String type) {
this.filename = filename;
this.type = type;
}
public String getFileName() { return this.filename; }
/**
* Construct a unified array of file names from comma seperated file name
* list.
*
* @return unified String array of file names
*/
public String[] getFileNamesUnified() {
final HashSet<String> hs = new HashSet<String>(Arrays.asList(this.filename.split(",")));
return hs.toArray(new String[hs.size()]);
}
public String getType() { return this.type; }
}

@ -422,11 +422,6 @@ public final class LoaderDispatcher {
} }
} }
public static Document parseDocument(final DigestURI url, final long contentLength, final InputStream resourceStream) throws ParserException {
return parseDocument(url, contentLength, resourceStream, null);
}
public static ContentScraper parseResource(final LoaderDispatcher loader, final DigestURI location, int cachePolicy) throws IOException { public static ContentScraper parseResource(final LoaderDispatcher loader, final DigestURI location, int cachePolicy) throws IOException {
// load page // load page
Response r = loader.load(location, true, false, cachePolicy); Response r = loader.load(location, true, false, cachePolicy);

Loading…
Cancel
Save