- refactoring of blacklists

- refactoring of event origin encoding


git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6434 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 16 years ago
parent 26fafd85a5
commit 5e8038ac4d
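
In outline: the blacklist change merges the old de.anomic.data Blacklist/AbstractBlacklist/DefaultBlacklist triple into a single net.yacy.repository.Blacklist and flushes the search-event cache whenever a list changes; the event-origin change replaces raw integer process-case/stack codes with the new EventOrigin enum while keeping the numeric values stable. A condensed sketch of the enum migration (all names as in the diff below):

// before: magic numbers at every call site
//   sb.crawlResults.stack(entry, youare, iam, 1);
//   if (processCase == SwitchboardConstants.PROCESSCASE_4_PROXY_LOAD) ...
// after: typed constants (see the new EventOrigin class below)
//   sb.crawlResults.stack(entry, youare, iam, EventOrigin.REMOTE_RECEIPTS);
//   if (processCase == EventOrigin.PROXY_LOAD) ...
// the int survives only at serialization boundaries:
final int code = EventOrigin.LOCAL_CRAWLING.getCode(); // 5, e.g. for HTML templates
final EventOrigin origin = EventOrigin.getEvent(code); // and back to the enum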

@@ -43,17 +43,16 @@ import java.util.Map.Entry;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import de.anomic.data.AbstractBlacklist;
import de.anomic.data.Blacklist;
import de.anomic.data.DefaultBlacklist;
import de.anomic.data.listManager;
import de.anomic.http.server.RequestHeader;
import de.anomic.search.SearchEventCache;
import de.anomic.search.Switchboard;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import java.util.Set;
import net.yacy.kelondro.logging.Log;
import net.yacy.repository.Blacklist;
public class BlacklistCleaner_p {
@@ -65,7 +64,7 @@ public class BlacklistCleaner_p {
private final static String BLACKLIST_FILENAME_FILTER = "^.*\\.black$";
public static final Class<?>[] supportedBLEngines = {
DefaultBlacklist.class
Blacklist.class
};
public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
@@ -77,7 +76,7 @@ public class BlacklistCleaner_p {
String blacklistToUse = null;
// get the list of supported blacklist types
final String supportedBlacklistTypesStr = AbstractBlacklist.BLACKLIST_TYPES_STRING;
final String supportedBlacklistTypesStr = Blacklist.BLACKLIST_TYPES_STRING;
final String[] supportedBlacklistTypes = supportedBlacklistTypesStr.split(",");
prop.put(DISABLED+"checked", "1");
@@ -245,26 +244,23 @@ public class BlacklistCleaner_p {
final List<String> list = listManager.getListArray(new File(listManager.listsPath, blacklistToUse));
final Map<String, String> properties= new HashMap<String, String>();
properties.put("allowRegex", String.valueOf(allowRegex));
if (blEngine instanceof AbstractBlacklist) {
int err = 0;
int err = 0;
for (String element : list) {
element = element.trim();
// check for double occurrence
if (legalEntries.contains(element)) {
illegalEntries.put(element, Integer.valueOf(AbstractBlacklist.ERR_DOUBLE_OCCURANCE));
continue;
}
legalEntries.add(element);
for (String element : list) {
element = element.trim();
// check for double occurrence
if (legalEntries.contains(element)) {
illegalEntries.put(element, Integer.valueOf(Blacklist.ERR_DOUBLE_OCCURANCE));
continue;
}
legalEntries.add(element);
err = blEngine.checkError(element, properties);
err = blEngine.checkError(element, properties);
if (err > 0) {
illegalEntries.put(element, err);
}
if (err > 0) {
illegalEntries.put(element, err);
}
}
@@ -309,14 +305,14 @@ public class BlacklistCleaner_p {
final String host = (s.indexOf("/") == -1) ? s : s.substring(0, s.indexOf("/"));
final String path = (s.indexOf("/") == -1) ? ".*" : s.substring(s.indexOf("/") + 1);
try {
Switchboard.urlBlacklist.remove(supportedBlacklistTypes[blTypes],
host,path);
Switchboard.urlBlacklist.remove(supportedBlacklistTypes[blTypes], host, path);
} catch (final RuntimeException e) {
//System.err.println(e.getMessage() + ": " + host + "/" + path);
Log.logSevere("BLACKLIST-CLEANER", e.getMessage() + ": " + host + "/" + path);
}
}
}
}
SearchEventCache.cleanupEvents(true);
}
if (listChanged){
listManager.writeList(new File(listManager.listsPath, blacklistToUse), list.toArray(new String[list.size()]));
@@ -360,6 +356,7 @@ public class BlacklistCleaner_p {
path);
}
}
SearchEventCache.cleanupEvents(true);
}
pw.close();
} catch (final IOException e) {

@@ -33,8 +33,8 @@ import java.io.File;
import java.net.MalformedURLException;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.repository.Blacklist;
import de.anomic.data.Blacklist;
import de.anomic.data.listManager;
import de.anomic.http.server.RequestHeader;
import de.anomic.search.Switchboard;

@@ -41,11 +41,11 @@ import java.util.List;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import net.yacy.repository.Blacklist;
import de.anomic.data.AbstractBlacklist;
import de.anomic.data.Blacklist;
import de.anomic.data.listManager;
import de.anomic.http.server.RequestHeader;
import de.anomic.search.SearchEventCache;
import de.anomic.search.Switchboard;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
@@ -66,7 +66,7 @@ public class Blacklist_p {
listManager.listsPath = new File(listManager.switchboard.getRootPath(),listManager.switchboard.getConfig("listManager.listsPath", "DATA/LISTS"));
// get the list of supported blacklist types
final String supportedBlacklistTypesStr = AbstractBlacklist.BLACKLIST_TYPES_STRING;
final String supportedBlacklistTypesStr = Blacklist.BLACKLIST_TYPES_STRING;
final String[] supportedBlacklistTypes = supportedBlacklistTypesStr.split(",");
// load all blacklist files located in the directory
@@ -560,6 +560,7 @@ public class Blacklist_p {
Switchboard.urlBlacklist.add(supportedBlacklistTypes[blTypes], newEntry.substring(0, pos), newEntry.substring(pos + 1));
}
}
SearchEventCache.cleanupEvents(true);
}
return null;
@@ -610,6 +611,7 @@ public class Blacklist_p {
Switchboard.urlBlacklist.remove(supportedBlacklistTypes[blTypes],oldEntry.substring(0, pos), oldEntry.substring(pos + 1));
}
}
SearchEventCache.cleanupEvents(true);
return null;
}

@@ -34,6 +34,7 @@ import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.logging.Log;
import de.anomic.crawler.retrieval.EventOrigin;
import de.anomic.http.server.RequestHeader;
import de.anomic.search.Segments;
import de.anomic.search.Switchboard;
@@ -63,20 +64,24 @@ public class CrawlResults {
}
// find process number
int tabletype;
EventOrigin tabletype;
try {
tabletype = Integer.parseInt(post.get("process", "0"));
tabletype = EventOrigin.getEvent(Integer.parseInt(post.get("process", "0")));
} catch (final NumberFormatException e) {
tabletype = 0;
tabletype = EventOrigin.UNKNOWN;
}
if ((post != null) && (post.containsKey("autoforward")) && (tabletype == 5) && (sb.crawlResults.getStackSize(5) == 0)) {
if (
post != null &&
post.containsKey("autoforward") &&
tabletype == EventOrigin.LOCAL_CRAWLING &&
sb.crawlResults.getStackSize(EventOrigin.LOCAL_CRAWLING) == 0) {
// the main menu does a request to the local crawler page, but in case this table is empty, the overview page is shown
tabletype = 0;
tabletype = EventOrigin.UNKNOWN;
}
// check if authorization is needed and/or given
if (((tabletype > 0) && (tabletype < 6)) ||
if (tabletype != EventOrigin.UNKNOWN ||
(post != null && (post.containsKey("clearlist") ||
post.containsKey("deleteentry")))) {
final String authorization = (header.get(RequestHeader.AUTHORIZATION, "xxxxxx"));
@@ -143,7 +148,7 @@ public class CrawlResults {
} // end != null
// create table
if (tabletype == 0) {
if (tabletype == EventOrigin.UNKNOWN) {
prop.put("table", "2");
} else if (sb.crawlResults.getStackSize(tabletype) == 0 && sb.crawlResults.getDomainListSize(tabletype) == 0) {
prop.put("table", "0");
@@ -159,7 +164,7 @@ public class CrawlResults {
prop.put("table_size_all", sb.crawlResults.getStackSize(tabletype));
prop.putHTML("table_feedbackpage", "CrawlResults.html");
prop.put("table_tabletype", tabletype);
prop.put("table_tabletype", tabletype.getCode());
prop.put("table_showInit", (showInit) ? "1" : "0");
prop.put("table_showExec", (showExec) ? "1" : "0");
prop.put("table_showDate", (showDate) ? "1" : "0");
@@ -196,7 +201,7 @@ public class CrawlResults {
prop.put("table_indexed_" + cnt + "_dark", (dark) ? "1" : "0");
prop.put("table_indexed_" + cnt + "_feedbackpage", "CrawlResults.html");
prop.put("table_indexed_" + cnt + "_tabletype", tabletype);
prop.put("table_indexed_" + cnt + "_tabletype", tabletype.getCode());
prop.put("table_indexed_" + cnt + "_urlhash", urlHash);
if (showInit) {
@@ -266,7 +271,7 @@ public class CrawlResults {
if (domain == null) break;
prop.put("table_domains_" + cnt + "_dark", (dark) ? "1" : "0");
prop.put("table_domains_" + cnt + "_feedbackpage", "CrawlResults.html");
prop.put("table_domains_" + cnt + "_tabletype", tabletype);
prop.put("table_domains_" + cnt + "_tabletype", tabletype.getCode());
prop.put("table_domains_" + cnt + "_domain", domain);
prop.put("table_domains_" + cnt + "_hashpart", DigestURI.hosthash6(domain));
prop.put("table_domains_" + cnt + "_count", sb.crawlResults.domainCount(tabletype, domain));
@@ -275,7 +280,7 @@ public class CrawlResults {
}
prop.put("table_domains", cnt);
}
prop.put("process", tabletype);
prop.put("process", tabletype.getCode());
// return rewrite properties
return prop;
}

@@ -47,9 +47,8 @@ import net.yacy.kelondro.rwi.Reference;
import net.yacy.kelondro.rwi.ReferenceContainer;
import net.yacy.kelondro.rwi.ReferenceContainerCache;
import net.yacy.kelondro.util.DateFormatter;
import net.yacy.repository.Blacklist;
import de.anomic.data.AbstractBlacklist;
import de.anomic.data.Blacklist;
import de.anomic.data.listManager;
import de.anomic.http.server.RequestHeader;
import de.anomic.search.QueryParams;
@@ -320,6 +319,7 @@ public class IndexControlRWIs_p {
url.getFile());
}
}
SearchEventCache.cleanupEvents(true);
}
}
pw.close();
@@ -330,7 +330,7 @@ public class IndexControlRWIs_p {
if (post.containsKey("blacklistdomains")) {
PrintWriter pw;
try {
final String[] supportedBlacklistTypes = AbstractBlacklist.BLACKLIST_TYPES_STRING.split(",");
final String[] supportedBlacklistTypes = Blacklist.BLACKLIST_TYPES_STRING.split(",");
pw = new PrintWriter(new FileWriter(new File(listManager.listsPath, blacklist), true));
DigestURI url;
for (i = 0; i<urlx.length; i++) {

@@ -37,8 +37,8 @@ import net.yacy.kelondro.index.Row.Entry;
import net.yacy.kelondro.order.NaturalOrder;
import net.yacy.kelondro.util.DateFormatter;
import net.yacy.kelondro.util.ScoreCluster;
import net.yacy.repository.Blacklist;
import de.anomic.data.Blacklist;
import de.anomic.http.server.RequestHeader;
import de.anomic.search.Switchboard;
import de.anomic.server.serverObjects;

@@ -37,8 +37,8 @@ import net.yacy.kelondro.index.Row.Entry;
import net.yacy.kelondro.order.NaturalOrder;
import net.yacy.kelondro.util.DateFormatter;
import net.yacy.kelondro.util.ScoreCluster;
import net.yacy.repository.Blacklist;
import de.anomic.data.Blacklist;
import de.anomic.http.server.RequestHeader;
import de.anomic.search.Switchboard;
import de.anomic.server.serverObjects;

@@ -267,7 +267,7 @@ public class ViewFile {
// parsing the resource content
Document document = null;
try {
document = LoaderDispatcher.parseDocument(url, resourceLength, resource);
document = LoaderDispatcher.parseDocument(url, resourceLength, resource, null);
if (document == null) {
prop.put("error", "5");
prop.put("error_errorText", "Unknown error");

@@ -2,7 +2,8 @@
import java.io.File;
import java.util.List;
import de.anomic.data.AbstractBlacklist;
import net.yacy.repository.Blacklist;
import de.anomic.data.listManager;
import de.anomic.http.server.RequestHeader;
import de.anomic.server.serverObjects;
@@ -34,7 +35,7 @@ public class blacklists_p {
prop.put("lists_" + blacklistCount + "_shared", "0");
}
final String[] types = AbstractBlacklist.BLACKLIST_TYPES_STRING.split(",");
final String[] types = Blacklist.BLACKLIST_TYPES_STRING.split(",");
for (int j=0; j<types.length; j++) {
prop.putXML("lists_" + blacklistCount + "_types_" + j + "_name", types[j]);
prop.put("lists_" + blacklistCount + "_types_" + j + "_value",

@@ -40,13 +40,13 @@ import java.util.Iterator;
import java.util.List;
import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.data.AbstractBlacklist;
import de.anomic.data.listManager;
import de.anomic.data.list.ListAccumulator;
import de.anomic.data.list.XMLBlacklistImporter;
import de.anomic.http.client.Client;
import de.anomic.http.server.HeaderFramework;
import de.anomic.http.server.RequestHeader;
import de.anomic.search.SearchEventCache;
import de.anomic.search.Switchboard;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
@@ -54,6 +54,7 @@ import de.anomic.yacy.yacySeed;
import net.yacy.document.parser.html.CharacterCoding;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.repository.Blacklist;
import org.xml.sax.SAXException;
@@ -249,7 +250,7 @@ public class sharedBlacklist_p {
count++;
if (Switchboard.urlBlacklist != null) {
final String supportedBlacklistTypesStr = AbstractBlacklist.BLACKLIST_TYPES_STRING;
final String supportedBlacklistTypesStr = Blacklist.BLACKLIST_TYPES_STRING;
final String[] supportedBlacklistTypes = supportedBlacklistTypesStr.split(",");
for (int blTypes=0; blTypes < supportedBlacklistTypes.length; blTypes++) {
@@ -257,6 +258,7 @@ public class sharedBlacklist_p {
Switchboard.urlBlacklist.add(supportedBlacklistTypes[blTypes],newItem.substring(0, pos), newItem.substring(pos + 1));
}
}
SearchEventCache.cleanupEvents(true);
}
}
}

@@ -33,6 +33,7 @@ import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.logging.Log;
import de.anomic.crawler.ZURL;
import de.anomic.crawler.retrieval.EventOrigin;
import de.anomic.http.server.RequestHeader;
import de.anomic.search.Segments;
import de.anomic.search.Switchboard;
@@ -139,7 +140,7 @@ public final class crawlReceipt {
if (result.equals("fill")) try {
// put new entry into database
sb.indexSegments.urlMetadata(Segments.Process.RECEIPTS).store(entry);
sb.crawlResults.stack(entry, youare, iam, 1);
sb.crawlResults.stack(entry, youare, iam, EventOrigin.REMOTE_RECEIPTS);
sb.crawlQueues.delegatedURL.remove(entry.hash()); // the delegated work has been done
log.logInfo("crawlReceipt: RECEIVED RECEIPT from " + otherPeerName + " for URL " + entry.hash() + ":" + metadata.url().toNormalform(false, true));

@@ -37,8 +37,8 @@ import net.yacy.document.parser.xml.RSSFeed;
import net.yacy.kelondro.data.word.WordReferenceRow;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.repository.Blacklist;
import de.anomic.data.Blacklist;
import de.anomic.http.server.RequestHeader;
import de.anomic.search.Segments;
import de.anomic.search.Switchboard;

@@ -33,8 +33,9 @@ import net.yacy.document.content.RSSMessage;
import net.yacy.document.parser.xml.RSSFeed;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.util.DateFormatter;
import net.yacy.repository.Blacklist;
import de.anomic.data.Blacklist;
import de.anomic.crawler.retrieval.EventOrigin;
import de.anomic.http.server.RequestHeader;
import de.anomic.search.Segments;
import de.anomic.search.Switchboard;
@@ -141,7 +142,7 @@ public final class transferURL {
yacyCore.log.logInfo("Accepting URL " + i + "/" + urlc + " from peer " + otherPeerName + ": " + lEntry.metadata().url().toNormalform(true, false));
try {
sb.indexSegments.urlMetadata(Segments.Process.DHTIN).store(lEntry);
sb.crawlResults.stack(lEntry, iam, iam, 3);
sb.crawlResults.stack(lEntry, iam, iam, EventOrigin.DHT_TRANSFER);
if (yacyCore.log.isFine()) yacyCore.log.logFine("transferURL: received URL '" + metadata.url().toNormalform(false, true) + "' from peer " + otherPeerName);
received++;
} catch (final IOException e) {

@@ -36,9 +36,9 @@ import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.Domains;
import net.yacy.kelondro.workflow.WorkflowProcessor;
import net.yacy.repository.Blacklist;
import de.anomic.crawler.retrieval.Request;
import de.anomic.data.Blacklist;
import de.anomic.search.Segment;
import de.anomic.search.Switchboard;
import de.anomic.yacy.yacySeedDB;

@@ -35,9 +35,11 @@ package de.anomic.crawler;
import java.net.MalformedURLException;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
@@ -46,6 +48,7 @@ import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Bitfield;
import net.yacy.kelondro.util.ScoreCluster;
import de.anomic.crawler.retrieval.EventOrigin;
import de.anomic.yacy.yacySeedDB;
public final class ResultURLs {
@@ -53,38 +56,20 @@ public final class ResultURLs {
// result stacks;
// these have all entries of form
// strings: urlHash + initiatorHash + ExecutorHash
private final LinkedList<String> externResultStack; // 1 - remote index: retrieved by other peer
private final LinkedList<String> searchResultStack; // 2 - partly remote/local index: result of search queries
private final LinkedList<String> transfResultStack; // 3 - partly remote/local index: result of index transfer
private final LinkedList<String> proxyResultStack; // 4 - local index: result of proxy fetch/prefetch
private final LinkedList<String> lcrawlResultStack; // 5 - local index: result of local crawling
private final LinkedList<String> gcrawlResultStack; // 6 - local index: triggered external
private final ScoreCluster<String> externResultDomains;
private final ScoreCluster<String> searchResultDomains;
private final ScoreCluster<String> transfResultDomains;
private final ScoreCluster<String> proxyResultDomains;
private final ScoreCluster<String> lcrawlResultDomains;
private final ScoreCluster<String> gcrawlResultDomains;
private final Map<EventOrigin, LinkedList<String>> resultStacks;
private final Map<EventOrigin, ScoreCluster<String>> resultDomains;
public ResultURLs() {
// init result stacks
externResultStack = new LinkedList<String>();
searchResultStack = new LinkedList<String>();
transfResultStack = new LinkedList<String>();
proxyResultStack = new LinkedList<String>();
lcrawlResultStack = new LinkedList<String>();
gcrawlResultStack = new LinkedList<String>();
// init result domain statistics
externResultDomains = new ScoreCluster<String>();
searchResultDomains = new ScoreCluster<String>();
transfResultDomains = new ScoreCluster<String>();
proxyResultDomains = new ScoreCluster<String>();
lcrawlResultDomains = new ScoreCluster<String>();
gcrawlResultDomains = new ScoreCluster<String>();
resultStacks = new HashMap<EventOrigin, LinkedList<String>>();
resultDomains = new HashMap<EventOrigin, ScoreCluster<String>>();
for (EventOrigin origin: EventOrigin.values()) {
resultStacks.put(origin, new LinkedList<String>());
resultDomains.put(origin, new ScoreCluster<String>());
}
}
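Since the keys are enum constants, java.util.EnumMap would be a drop-in alternative to the HashMap used here; a sketch of that variant (same fields and types as above; EnumMap is a standard JDK class, so this is a design note, not part of the commit):

// requires: import java.util.EnumMap;
resultStacks = new EnumMap<EventOrigin, LinkedList<String>>(EventOrigin.class);
resultDomains = new EnumMap<EventOrigin, ScoreCluster<String>>(EventOrigin.class);
for (final EventOrigin origin : EventOrigin.values()) {
    resultStacks.put(origin, new LinkedList<String>());
    resultDomains.put(origin, new ScoreCluster<String>());
}
// EnumMap is array-indexed by the enum's ordinals, so get/put avoid
// hashing; the field type can stay Map<EventOrigin, ...> either way.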
public synchronized void stack(final URIMetadataRow e, final String initiatorHash, final String executorHash, final int stackType) {
public synchronized void stack(final URIMetadataRow e, final String initiatorHash, final String executorHash, final EventOrigin stackType) {
assert initiatorHash != null;
assert executorHash != null;
if (e == null) { return; }
@@ -108,27 +93,27 @@ public final class ResultURLs {
}
}
public synchronized int getStackSize(final int stack) {
public synchronized int getStackSize(final EventOrigin stack) {
final List<String> resultStack = getStack(stack);
if (resultStack == null) return 0;
return resultStack.size();
}
public synchronized int getDomainListSize(final int stack) {
public synchronized int getDomainListSize(final EventOrigin stack) {
final ScoreCluster<String> domains = getDomains(stack);
if (domains == null) return 0;
return domains.size();
}
public synchronized String getUrlHash(final int stack, final int pos) {
public synchronized String getUrlHash(final EventOrigin stack, final int pos) {
return getHashNo(stack, pos, 0);
}
public synchronized String getInitiatorHash(final int stack, final int pos) {
public synchronized String getInitiatorHash(final EventOrigin stack, final int pos) {
return getHashNo(stack, pos, 1);
}
public synchronized String getExecutorHash(final int stack, final int pos) {
public synchronized String getExecutorHash(final EventOrigin stack, final int pos) {
return getHashNo(stack, pos, 2);
}
@@ -150,7 +135,7 @@ public final class ResultURLs {
* @param index starting at 0
* @return
*/
public synchronized String getHashNo(final int stack, final int pos, final int index) {
public synchronized String getHashNo(final EventOrigin stack, final int pos, final int index) {
final String result = getResultStackAt(stack, pos);
if(result != null) {
if(result.length() < Word.commonHashLength * 3) {
@@ -175,7 +160,7 @@ public final class ResultURLs {
* @param pos
* @return null if either stack or element do not exist
*/
private String getResultStackAt(final int stack, final int pos) {
private String getResultStackAt(final EventOrigin stack, final int pos) {
assert pos >= 0 : "precondition violated: " + pos + " >= 0";
final List<String> resultStack = getStack(stack);
@@ -196,12 +181,12 @@ public final class ResultURLs {
* iterate all domains in the result domain statistic
* @return iterator of domains in reverse order (downwards)
*/
public Iterator<String> domains(final int stack) {
public Iterator<String> domains(final EventOrigin stack) {
assert getDomains(stack) != null : "getDomains(" + stack + ") = null";
return getDomains(stack).scores(false);
}
public int deleteDomain(final int stack, String host, String hosthash) {
public int deleteDomain(final EventOrigin stack, String host, String hosthash) {
assert hosthash.length() == 6;
int i = 0;
while (i < getStackSize(stack)) {
@@ -218,41 +203,23 @@ public final class ResultURLs {
* @param domain name
* @return the number of occurrences of the domain in the stack statistics
*/
public int domainCount(final int stack, String domain) {
public int domainCount(final EventOrigin stack, String domain) {
assert domain != null : "domain = null";
assert getDomains(stack) != null : "getDomains(" + stack + ") = null";
return getDomains(stack).getScore(domain);
}
/**
* returns the stack indentified by the id <em>stack</em>
* returns the stack identified by the id <em>stack</em>
*
* @param stack id of resultStack
* @return null if stack does not exist (id is unknown or stack is null (which should not occur and an error is logged))
*/
private List<String> getStack(final int stack) {
switch (stack) {
case 1: return externResultStack;
case 2: return searchResultStack;
case 3: return transfResultStack;
case 4: return proxyResultStack;
case 5: return lcrawlResultStack;
case 6: return gcrawlResultStack;
default:
return null;
}
private List<String> getStack(final EventOrigin stack) {
return resultStacks.get(stack);
}
private ScoreCluster<String> getDomains(final int stack) {
switch (stack) {
case 1: return externResultDomains;
case 2: return searchResultDomains;
case 3: return transfResultDomains;
case 4: return proxyResultDomains;
case 5: return lcrawlResultDomains;
case 6: return gcrawlResultDomains;
default:
return null;
}
private ScoreCluster<String> getDomains(final EventOrigin stack) {
return resultDomains.get(stack);
}
/**
@@ -261,11 +228,11 @@ public final class ResultURLs {
* @param stack
* @return
*/
private boolean isValidStack(final int stack) {
private boolean isValidStack(final EventOrigin stack) {
return getStack(stack) != null;
}
public synchronized boolean removeStack(final int stack, final int pos) {
public synchronized boolean removeStack(final EventOrigin stack, final int pos) {
final List<String> resultStack = getStack(stack);
if (resultStack == null) {
return false;
@@ -273,7 +240,7 @@ public final class ResultURLs {
return resultStack.remove(pos) != null;
}
public synchronized void clearStack(final int stack) {
public synchronized void clearStack(final EventOrigin stack) {
final List<String> resultStack = getStack(stack);
if (resultStack != null) resultStack.clear();
final ScoreCluster<String> resultDomains = getDomains(stack);
@@ -287,11 +254,11 @@ public final class ResultURLs {
public synchronized boolean remove(final String urlHash) {
if (urlHash == null) return false;
String hash;
for (int stack = 1; stack <= 6; stack++) {
for (int i = getStackSize(stack) - 1; i >= 0; i--) {
hash = getUrlHash(stack, i);
for (EventOrigin origin: EventOrigin.values()) {
for (int i = getStackSize(origin) - 1; i >= 0; i--) {
hash = getUrlHash(origin, i);
if (hash != null && hash.equals(urlHash)) {
removeStack(stack, i);
removeStack(origin, i);
return true;
}
}
@@ -308,7 +275,7 @@ public final class ResultURLs {
try {
final DigestURI url = new DigestURI("http", "www.yacy.net", 80, "/");
final URIMetadataRow urlRef = new URIMetadataRow(url, "YaCy Homepage", "", "", "", new Date(), new Date(), new Date(), "", new byte[] {}, 123, 42, '?', new Bitfield(), "de", 0, 0, 0, 0, 0, 0);
int stackNo = 1;
EventOrigin stackNo = EventOrigin.LOCAL_CRAWLING;
System.out.println("valid test:\n=======");
// add
results.stack(urlRef, urlRef.hash(), url.hash(), stackNo);
@@ -324,29 +291,6 @@ public final class ResultURLs {
System.out.println("url hash:\t"+ results.getUrlHash(stackNo, 1));
System.out.println("executor hash:\t"+ results.getExecutorHash(stackNo, 1));
System.out.println("initiator hash:\t"+ results.getInitiatorHash(stackNo, 1));
stackNo = 42;
System.out.println("size of stack:\t"+ results.getStackSize(stackNo));
// get
System.out.println("url hash:\t"+ results.getUrlHash(stackNo, 0));
System.out.println("executor hash:\t"+ results.getExecutorHash(stackNo, 0));
System.out.println("initiator hash:\t"+ results.getInitiatorHash(stackNo, 0));
// benchmark
final long start = System.currentTimeMillis();
for(int i = 0; i < 1000000; i++) {
stackNo = i % 6;
// add
results.stack(urlRef, urlRef.hash(), url.hash(), stackNo);
// size
results.getStackSize(stackNo);
// get
for(int j = 0; j < 10; j++) {
results.getUrlHash(stackNo, i / 6);
results.getExecutorHash(stackNo, i / 6);
results.getInitiatorHash(stackNo, i / 6);
}
}
System.out.println("benschmark: "+ (System.currentTimeMillis() - start) + " ms");
} catch (final MalformedURLException e) {
e.printStackTrace();
}

@@ -0,0 +1,34 @@
package de.anomic.crawler.retrieval;
public enum EventOrigin {
// we must distinguish the following cases: resource-load was initiated by
// 1) global crawling: the index is extern, not here (not possible here)
// 2) result of search queries, some indexes are here (not possible here)
// 3) result of index transfer, some of them are here (not possible here)
// 4) proxy-load (initiator is "------------")
// 5) local prefetch/crawling (initiator is own seedHash)
// 6) local fetching for global crawling (other known or unknown initiator)
UNKNOWN(0),
REMOTE_RECEIPTS(1),
QUERIES(2),
DHT_TRANSFER(3),
PROXY_LOAD(4),
LOCAL_CRAWLING(5),
GLOBAL_CRAWLING(6);
protected int code;
private static final EventOrigin[] list = {
UNKNOWN, REMOTE_RECEIPTS, QUERIES, DHT_TRANSFER, PROXY_LOAD, LOCAL_CRAWLING, GLOBAL_CRAWLING};
private EventOrigin(int code) {
this.code = code;
}
public int getCode() {
return this.code;
}
public static final EventOrigin getEvent(int key) {
return list[key];
}
}
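
Note that getEvent indexes a fixed array, so a key outside 0..6 now throws ArrayIndexOutOfBoundsException instead of selecting an empty default stack; that is why the invalid-stack test (stackNo = 42) is removed from ResultURLs above. A defensive variant, as a hypothetical helper inside the enum:

public static EventOrigin getEvent(final int key, final EventOrigin dflt) {
    return (key >= 0 && key < list.length) ? list[key] : dflt;
}
// e.g. EventOrigin.getEvent(42, EventOrigin.UNKNOWN) yields UNKNOWN

Since the codes match the declaration order, the static list array could also be replaced by the implicit values() array.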

@@ -31,9 +31,9 @@ import java.util.Date;
import net.yacy.document.TextParser;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import net.yacy.repository.Blacklist;
import de.anomic.crawler.Latency;
import de.anomic.data.Blacklist;
import de.anomic.http.client.Client;
import de.anomic.http.server.HeaderFramework;
import de.anomic.http.server.RequestHeader;

@@ -37,7 +37,6 @@ import de.anomic.crawler.CrawlProfile;
import de.anomic.http.server.HeaderFramework;
import de.anomic.http.server.RequestHeader;
import de.anomic.http.server.ResponseHeader;
import de.anomic.search.SwitchboardConstants;
public class Response {
@@ -770,7 +769,7 @@ public class Response {
(requestHeader.get(HeaderFramework.X_YACY_INDEX_CONTROL)).toUpperCase().equals("NO-INDEX");
}
public int processCase(String mySeedHash) {
public EventOrigin processCase(String mySeedHash) {
// we must distinguish the following cases: resource-load was initiated by
// 1) global crawling: the index is extern, not here (not possible here)
// 2) result of search queries, some indexes are here (not possible here)
@@ -778,17 +777,17 @@ public class Response {
// 4) proxy-load (initiator is "------------")
// 5) local prefetch/crawling (initiator is own seedHash)
// 6) local fetching for global crawling (other known or unknown initiator)
int processCase = SwitchboardConstants.PROCESSCASE_0_UNKNOWN;
EventOrigin processCase = EventOrigin.UNKNOWN;
// FIXME the equals seems to be incorrect: String.equals(boolean)
if ((initiator() == null) || initiator().length() == 0 || initiator().equals("------------")) {
// proxy-load
processCase = SwitchboardConstants.PROCESSCASE_4_PROXY_LOAD;
processCase = EventOrigin.PROXY_LOAD;
} else if (initiator().equals(mySeedHash)) {
// normal crawling
processCase = SwitchboardConstants.PROCESSCASE_5_LOCAL_CRAWLING;
processCase = EventOrigin.LOCAL_CRAWLING;
} else {
// this was done for remote peer (a global crawl)
processCase = SwitchboardConstants.PROCESSCASE_6_GLOBAL_CRAWLING;
processCase = EventOrigin.GLOBAL_CRAWLING;
}
return processCase;
}
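
Condensed: an empty or all-dashes initiator means the resource came through the proxy, the peer's own seed hash means local crawling, and any other initiator means the fetch was done for a remote peer's global crawl. The same mapping as a standalone function (hypothetical, e.g. for a quick unit test):

static EventOrigin classify(final String initiator, final String mySeedHash) {
    if (initiator == null || initiator.length() == 0 || "------------".equals(initiator))
        return EventOrigin.PROXY_LOAD;      // proxy-load
    if (initiator.equals(mySeedHash))
        return EventOrigin.LOCAL_CRAWLING;  // own prefetch/crawl
    return EventOrigin.GLOBAL_CRAWLING;     // crawl on behalf of a remote peer
}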

@@ -1,102 +0,0 @@
// Blacklist.java
// (C) 2008 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 26.03.2008 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.data;
import java.io.File;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Map;
import net.yacy.kelondro.data.meta.DigestURI;
public interface Blacklist {
public static final String BLACKLIST_DHT = "dht";
public static final String BLACKLIST_CRAWLER = "crawler";
public static final String BLACKLIST_PROXY = "proxy";
public static final String BLACKLIST_SEARCH = "search";
public static final String BLACKLIST_SURFTIPS = "surftips";
public static final String BLACKLIST_NEWS = "news";
public static final class blacklistFile {
private final String filename;
private final String type;
public blacklistFile(final String filename, final String type) {
this.filename = filename;
this.type = type;
}
public String getFileName() { return this.filename; }
/**
* Construct a unified array of file names from a comma-separated file name
* list.
*
* @return unified String array of file names
*/
public String[] getFileNamesUnified() {
final HashSet<String> hs = new HashSet<String>(Arrays.asList(this.filename.split(",")));
return hs.toArray(new String[hs.size()]);
}
public String getType() { return this.type; }
}
public String getEngineInfo();
public void setRootPath(File rootPath);
public int blacklistCacheSize();
public int size();
public void clear();
public void removeAll(String blacklistType, String host);
public void remove(String blacklistType, String host, String path);
public void add(String blacklistType, String host, String path);
public void loadList(String blacklistType, String filenames, String sep);
public void loadList(blacklistFile[] blFiles, String sep);
public boolean contains(String blacklistType, String host, String path);
public boolean hashInBlacklistedCache(String blacklistType, String urlHash);
public boolean isListed(String blacklistType, DigestURI url);
public boolean isListed(String blacklistType, String hostlow, String path);
public int checkError(String entry, Map<String, String> properties);
}

@@ -1,194 +0,0 @@
// indexDefaultReference.java
// (C) 2005 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 11.07.2005 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.data;
import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
public class DefaultBlacklist extends AbstractBlacklist implements Blacklist {
public DefaultBlacklist(final File rootPath) {
super(rootPath);
}
public String getEngineInfo() {
return "Default YaCy Blacklist Engine";
}
public boolean isListed(final String blacklistType, final String hostlow, String path) {
if (hostlow == null) throw new NullPointerException();
if (path == null) throw new NullPointerException();
// getting the proper blacklist
final HashMap<String, ArrayList<String>> blacklistMapMatched = super.getBlacklistMap(blacklistType,true);
if (path.length() > 0 && path.charAt(0) == '/') path = path.substring(1);
ArrayList<String> app;
boolean matched = false;
String pp = ""; // path-pattern
// try to match complete domain
if (!matched && (app = blacklistMapMatched.get(hostlow)) != null) {
for (int i=app.size()-1; !matched && i>-1; i--) {
pp = app.get(i);
matched |= ((pp.equals("*")) || (path.matches(pp)));
}
}
// first try to match the domain with wildcard '*'
// [TL] While "." are found within the string
int index = 0;
while (!matched && (index = hostlow.indexOf('.', index + 1)) != -1) {
if ((app = blacklistMapMatched.get(hostlow.substring(0, index + 1) + "*")) != null) {
for (int i=app.size()-1; !matched && i>-1; i--) {
pp = app.get(i);
matched |= ((pp.equals("*")) || (path.matches(pp)));
}
}
if ((app = blacklistMapMatched.get(hostlow.substring(0, index))) != null) {
for (int i=app.size()-1; !matched && i>-1; i--) {
pp = app.get(i);
matched |= ((pp.equals("*")) || (path.matches(pp)));
}
}
}
index = hostlow.length();
while (!matched && (index = hostlow.lastIndexOf('.', index - 1)) != -1) {
if ((app = blacklistMapMatched.get("*" + hostlow.substring(index, hostlow.length()))) != null) {
for (int i=app.size()-1; !matched && i>-1; i--) {
pp = app.get(i);
matched |= ((pp.equals("*")) || (path.matches(pp)));
}
}
if ((app = blacklistMapMatched.get(hostlow.substring(index +1, hostlow.length()))) != null) {
for (int i=app.size()-1; !matched && i>-1; i--) {
pp = app.get(i);
matched |= ((pp.equals("*")) || (path.matches(pp)));
}
}
}
// loop over all regex entries
if(!matched) {
final HashMap<String, ArrayList<String>> blacklistMapNotMatched = super.getBlacklistMap(blacklistType,false);
String key;
for(final Entry<String, ArrayList<String>> entry: blacklistMapNotMatched.entrySet()) {
key = entry.getKey();
try {
if(Pattern.matches(key, hostlow)) {
app = entry.getValue();
for (int i=0; i<app.size(); i++) {
if(Pattern.matches(app.get(i), path))
return true;
}
}
} catch (final PatternSyntaxException e) {
//System.out.println(e.toString());
}
}
}
return matched;
}
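// Illustration (hypothetical helper, not in the original file): the exact-host
// check and the two wildcard loops above probe the matchable map with a fixed
// key sequence, reproduced here:
private static java.util.List<String> probeKeys(final String hostlow) {
    final java.util.List<String> keys = new java.util.ArrayList<String>();
    keys.add(hostlow);                                    // complete domain
    int i = 0;
    while ((i = hostlow.indexOf('.', i + 1)) != -1) {     // leading labels
        keys.add(hostlow.substring(0, i + 1) + "*");
        keys.add(hostlow.substring(0, i));
    }
    i = hostlow.length();
    while ((i = hostlow.lastIndexOf('.', i - 1)) != -1) { // trailing labels
        keys.add("*" + hostlow.substring(i));
        keys.add(hostlow.substring(i + 1));
    }
    // for "sub.example.com": [sub.example.com, sub.*, sub, sub.example.*,
    // sub.example, *.com, com, *.example.com, example.com] -- note that a
    // plain "example.com" entry therefore also covers its subdomains
    return keys;
}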
public int checkError(String element, Map<String, String> properties) {
boolean allowRegex = true;
int slashPos;
String host, path;
if (properties != null) {
allowRegex = properties.get("allowRegex").equalsIgnoreCase("true") ? true : false;
}
if ((slashPos = element.indexOf("/")) == -1) {
host = element;
path = ".*";
} else {
host = element.substring(0, slashPos);
path = element.substring(slashPos + 1);
}
if (!allowRegex || !isValidRegex(host)) {
final int i = host.indexOf("*");
// check whether host begins illegally
if (!host.matches("([A-Za-z0-9_-]+|\\*)(\\.([A-Za-z0-9_-]+|\\*))*")) {
if (i == 0 && host.length() > 1 && host.charAt(1) != '.') {
return ERR_SUBDOMAIN_XOR_WILDCARD;
}
return ERR_HOST_WRONG_CHARS;
}
// in host-part only full sub-domains may be wildcards
if (host.length() > 0 && i > -1) {
if (!(i == 0 || i == host.length() - 1)) {
return ERR_WILDCARD_BEGIN_OR_END;
}
if (i == host.length() - 1 && host.length() > 1 && host.charAt(i - 1) != '.') {
return ERR_SUBDOMAIN_XOR_WILDCARD;
}
}
// check for double occurrences of "*" in host
if (host.indexOf("*", i + 1) > -1) {
return ERR_TWO_WILDCARDS_IN_HOST;
}
} else if (allowRegex && !isValidRegex(host)) {
return ERR_HOST_REGEX;
}
// check for errors on regex-compiling path
if (!isValidRegex(path) && !path.equals("*")) {
return ERR_PATH_REGEX;
}
return 0;
}
/**
* Checks if a given expression is a valid regular expression.
* @param expression The expression to be checked.
* @return True if the expression is a valid regular expression, else false.
*/
private static boolean isValidRegex(String expression) {
boolean ret = true;
try {
Pattern.compile(expression);
} catch (final PatternSyntaxException e) {
ret = false;
}
return ret;
}
}
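
In terms of concrete inputs, checkError accepts a wildcard only as a complete leading or trailing host label; a hypothetical smoke test of the rules above (error constants as defined in AbstractBlacklist; assumes these de.anomic.data classes are on the classpath):

import java.io.File;
import java.util.HashMap;
import java.util.Map;

public class CheckErrorDemo {
    public static void main(final String[] args) {
        final DefaultBlacklist bl = new DefaultBlacklist(new File(".")); // any readable dir
        final Map<String, String> props = new HashMap<String, String>();
        props.put("allowRegex", "false"); // force the wildcard rules, not regex mode
        System.out.println(bl.checkError("*.example.com/.*", props)); // 0: full leading label is fine
        System.out.println(bl.checkError("www.*.com/.*", props));     // ERR_WILDCARD_BEGIN_OR_END
        System.out.println(bl.checkError("*.*.com/.*", props));       // ERR_TWO_WILDCARDS_IN_HOST
        System.out.println(bl.checkError("example.com/[", props));    // ERR_PATH_REGEX
    }
}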

@@ -42,7 +42,10 @@ import java.util.List;
import java.util.Set;
import java.util.Vector;
import de.anomic.data.Blacklist.blacklistFile;
import net.yacy.repository.Blacklist;
import net.yacy.repository.BlacklistFile;
import de.anomic.search.SearchEventCache;
import de.anomic.search.Switchboard;
import de.anomic.server.serverCore;
@@ -391,12 +394,12 @@ public class listManager {
* Load or reload all active Blacklists
*/
public static void reloadBlacklists(){
final String supportedBlacklistTypesStr = AbstractBlacklist.BLACKLIST_TYPES_STRING;
final String supportedBlacklistTypesStr = Blacklist.BLACKLIST_TYPES_STRING;
final String[] supportedBlacklistTypes = supportedBlacklistTypesStr.split(",");
final ArrayList<blacklistFile> blacklistFiles = new ArrayList<blacklistFile>(supportedBlacklistTypes.length);
final ArrayList<BlacklistFile> blacklistFiles = new ArrayList<BlacklistFile>(supportedBlacklistTypes.length);
for (int i=0; i < supportedBlacklistTypes.length; i++) {
final blacklistFile blFile = new blacklistFile(
final BlacklistFile blFile = new BlacklistFile(
switchboard.getConfig(
supportedBlacklistTypes[i] + ".BlackLists", switchboard.getConfig("BlackLists.DefaultList", "url.default.black")),
supportedBlacklistTypes[i]);
@@ -405,8 +408,9 @@
Switchboard.urlBlacklist.clear();
Switchboard.urlBlacklist.loadList(
blacklistFiles.toArray(new blacklistFile[blacklistFiles.size()]),
blacklistFiles.toArray(new BlacklistFile[blacklistFiles.size()]),
"/");
SearchEventCache.cleanupEvents(true);
// switchboard.urlBlacklist.clear();
// if (f != "") switchboard.urlBlacklist.loadLists("black", f, "/");

@@ -80,11 +80,11 @@ import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.DateFormatter;
import net.yacy.kelondro.util.Domains;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.repository.Blacklist;
import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.crawler.retrieval.Request;
import de.anomic.crawler.retrieval.Response;
import de.anomic.data.Blacklist;
import de.anomic.http.client.MultiOutputStream;
import de.anomic.http.client.Client;
import de.anomic.http.client.RemoteProxyConfig;

@@ -50,8 +50,8 @@ import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.CloneableIterator;
import net.yacy.kelondro.table.SplitTable;
import net.yacy.kelondro.util.ScoreCluster;
import net.yacy.repository.Blacklist;
import de.anomic.data.Blacklist;
import de.anomic.http.client.Client;
import de.anomic.http.client.RemoteProxyConfig;
import de.anomic.http.server.ResponseContainer;

@@ -52,9 +52,9 @@ import net.yacy.kelondro.rwi.IndexCell;
import net.yacy.kelondro.rwi.ReferenceContainer;
import net.yacy.kelondro.rwi.ReferenceFactory;
import net.yacy.kelondro.util.ISO639;
import net.yacy.repository.Blacklist;
import de.anomic.crawler.retrieval.Response;
import de.anomic.data.Blacklist;
public class Segment {

@@ -139,6 +139,7 @@ import net.yacy.kelondro.workflow.InstantBusyThread;
import net.yacy.kelondro.workflow.WorkflowJob;
import net.yacy.kelondro.workflow.WorkflowProcessor;
import net.yacy.kelondro.workflow.WorkflowThread;
import net.yacy.repository.Blacklist;
import net.yacy.repository.LoaderDispatcher;
import de.anomic.crawler.CrawlProfile;
@@ -153,11 +154,10 @@ import de.anomic.crawler.ResultURLs;
import de.anomic.crawler.RobotsTxt;
import de.anomic.crawler.ZURL;
import de.anomic.crawler.CrawlProfile.entry;
import de.anomic.crawler.retrieval.EventOrigin;
import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.crawler.retrieval.Request;
import de.anomic.crawler.retrieval.Response;
import de.anomic.data.Blacklist;
import de.anomic.data.DefaultBlacklist;
import de.anomic.data.LibraryProvider;
import de.anomic.data.URLLicense;
import de.anomic.data.blogBoard;
@@ -429,7 +429,7 @@ public final class Switchboard extends serverSwitch {
// load blacklist
this.log.logConfig("Loading blacklist ...");
final File blacklistsPath = getConfigPath(SwitchboardConstants.LISTS_PATH, SwitchboardConstants.LISTS_PATH_DEFAULT);
urlBlacklist = new DefaultBlacklist(blacklistsPath);
urlBlacklist = new Blacklist(blacklistsPath);
listManager.switchboard = this;
listManager.listsPath = blacklistsPath;
listManager.reloadBlacklists();
@@ -1156,7 +1156,7 @@ public final class Switchboard extends serverSwitch {
// check if the document should be indexed based on proxy/crawler rules
String noIndexReason = "unspecified indexing error";
if (response.processCase(peers.mySeed().hash) == SwitchboardConstants.PROCESSCASE_4_PROXY_LOAD) {
if (response.processCase(peers.mySeed().hash) == EventOrigin.PROXY_LOAD) {
// proxy-load
noIndexReason = response.shallIndexCacheForProxy();
} else {
@@ -1329,8 +1329,8 @@ public final class Switchboard extends serverSwitch {
int c = 0;
if ((crawlQueues.delegatedURL.stackSize() > 1000)) c++;
if ((crawlQueues.errorURL.stackSize() > 1000)) c++;
for (int i = 1; i <= 6; i++) {
if (crawlResults.getStackSize(i) > 1000) c++;
for (EventOrigin origin: EventOrigin.values()) {
if (crawlResults.getStackSize(origin) > 1000) c++;
}
return c;
}
@@ -1410,11 +1410,11 @@ public final class Switchboard extends serverSwitch {
}
// clean up loadedURL stack
for (int i = 1; i <= 6; i++) {
for (EventOrigin origin: EventOrigin.values()) {
checkInterruption();
if (crawlResults.getStackSize(i) > 1000) {
if (this.log.isFine()) log.logFine("Cleaning Loaded-URLs report stack, " + crawlResults.getStackSize(i) + " entries on stack " + i);
crawlResults.clearStack(i);
if (crawlResults.getStackSize(origin) > 1000) {
if (this.log.isFine()) log.logFine("Cleaning Loaded-URLs report stack, " + crawlResults.getStackSize(origin) + " entries on stack " + origin.getCode());
crawlResults.clearStack(origin);
hasDoneSomething = true;
}
}
@@ -1585,7 +1585,7 @@ public final class Switchboard extends serverSwitch {
private Document parseDocument(Response entry) throws InterruptedException {
Document document = null;
final int processCase = entry.processCase(peers.mySeed().hash);
final EventOrigin processCase = entry.processCase(peers.mySeed().hash);
if (this.log.isFine()) log.logFine("processResourceStack processCase=" + processCase +
", depth=" + entry.depth() +
@@ -1635,7 +1635,7 @@ public final class Switchboard extends serverSwitch {
// put anchors on crawl stack
final long stackStartTime = System.currentTimeMillis();
if (
((processCase == SwitchboardConstants.PROCESSCASE_4_PROXY_LOAD) || (processCase == SwitchboardConstants.PROCESSCASE_5_LOCAL_CRAWLING)) &&
((processCase == EventOrigin.PROXY_LOAD) || (processCase == EventOrigin.LOCAL_CRAWLING)) &&
((entry.profile() == null) || (entry.depth() < entry.profile().depth()))
) {
final Map<DigestURI, String> hl = document.getHyperlinks();
@@ -1715,7 +1715,7 @@ public final class Switchboard extends serverSwitch {
// CREATE INDEX
final String dc_title = document.dc_title();
final DigestURI referrerURL = queueEntry.referrerURL();
final int processCase = queueEntry.processCase(peers.mySeed().hash);
final EventOrigin processCase = queueEntry.processCase(peers.mySeed().hash);
// remove stopwords
log.logInfo("Excluded " + condenser.excludeWords(stopwords) + " words in URL " + queueEntry.url());
@@ -1765,7 +1765,7 @@ public final class Switchboard extends serverSwitch {
MemoryTracker.update("indexed", queueEntry.url().toNormalform(true, false), false);
// if this was performed for a remote crawl request, notify requester
if ((processCase == SwitchboardConstants.PROCESSCASE_6_GLOBAL_CRAWLING) && (queueEntry.initiator() != null)) {
if ((processCase == EventOrigin.GLOBAL_CRAWLING) && (queueEntry.initiator() != null)) {
final yacySeed initiatorPeer = peers.get(queueEntry.initiator());
if (initiatorPeer != null) {
log.logInfo("Sending crawl receipt for '" + queueEntry.url().toNormalform(false, true) + "' to " + initiatorPeer.getName());
@@ -1841,7 +1841,7 @@ public final class Switchboard extends serverSwitch {
final Long resourceContentLength = (Long) resource[1];
// parse the resource
final Document document = LoaderDispatcher.parseDocument(metadata.url(), resourceContentLength.longValue(), resourceContent);
final Document document = LoaderDispatcher.parseDocument(metadata.url(), resourceContentLength.longValue(), resourceContent, null);
// get the word set
Set<String> words = null;

@@ -385,17 +385,6 @@ public final class SwitchboardConstants {
public static final String WORK_PATH = "workPath";
public static final String WORK_PATH_DEFAULT = "DATA/WORK";
// we must distinguish the following cases: resource-load was initiated by
// 1) global crawling: the index is extern, not here (not possible here)
// 2) result of search queries, some indexes are here (not possible here)
// 3) result of index transfer, some of them are here (not possible here)
// 4) proxy-load (initiator is "------------")
// 5) local prefetch/crawling (initiator is own seedHash)
// 6) local fetching for global crawling (other known or unknown initiator)
public static final int PROCESSCASE_0_UNKNOWN = 0;
public static final int PROCESSCASE_4_PROXY_LOAD = 4;
public static final int PROCESSCASE_5_LOCAL_CRAWLING = 5;
public static final int PROCESSCASE_6_GLOBAL_CRAWLING = 6;
/*
* Some constants
*/
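
For migration reference, the constants removed above map one-to-one onto the new enum, with unchanged numeric values; a hypothetical shim for code still holding the old ints:

// PROCESSCASE_0_UNKNOWN          -> EventOrigin.UNKNOWN          (0)
// PROCESSCASE_4_PROXY_LOAD       -> EventOrigin.PROXY_LOAD       (4)
// PROCESSCASE_5_LOCAL_CRAWLING   -> EventOrigin.LOCAL_CRAWLING   (5)
// PROCESSCASE_6_GLOBAL_CRAWLING  -> EventOrigin.GLOBAL_CRAWLING  (6)
static EventOrigin fromLegacyCode(final int processCase) {
    return EventOrigin.getEvent(processCase); // codes were kept identical
}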

@@ -68,13 +68,14 @@ import net.yacy.kelondro.rwi.ReferenceContainer;
import net.yacy.kelondro.rwi.ReferenceContainerCache;
import net.yacy.kelondro.util.ByteBuffer;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.repository.Blacklist;
import org.apache.commons.httpclient.methods.multipart.ByteArrayPartSource;
import org.apache.commons.httpclient.methods.multipart.Part;
import de.anomic.crawler.ResultURLs;
import de.anomic.crawler.retrieval.EventOrigin;
import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.data.Blacklist;
import de.anomic.http.client.DefaultCharsetFilePart;
import de.anomic.http.client.DefaultCharsetStringPart;
import de.anomic.http.client.Client;
@@ -576,7 +577,7 @@ public final class yacyClient {
// passed all checks, store url
try {
indexSegment.urlMetadata().store(urlEntry);
crawlResults.stack(urlEntry, mySeed.hash, target.hash, 2);
crawlResults.stack(urlEntry, mySeed.hash, target.hash, EventOrigin.QUERIES);
} catch (final IOException e) {
yacyCore.log.logSevere("could not store search result", e);
continue; // db-error

@@ -51,8 +51,8 @@ import java.util.Iterator;
import java.util.Map;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.repository.Blacklist;
import de.anomic.data.Blacklist;
import de.anomic.search.Switchboard;
public class yacyNewsPool {

@@ -34,9 +34,9 @@ import java.util.TreeSet;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Bitfield;
import net.yacy.kelondro.util.ScoreCluster;
import net.yacy.repository.Blacklist;
import de.anomic.crawler.ResultURLs;
import de.anomic.data.Blacklist;
import de.anomic.search.QueryParams;
import de.anomic.search.RankingProfile;
import de.anomic.search.RankingProcess;

@@ -1,316 +1,462 @@
// AbstractBlacklist.java
// first published on http://www.yacy.net
// (C) 2007 by Bjoern Krombholz
// last major change: 12. August 2006 (theli) ?
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.data;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.util.SetTools;
import de.anomic.search.SearchEventCache;
public abstract class AbstractBlacklist implements Blacklist {
public static final int ERR_TWO_WILDCARDS_IN_HOST = 1;
public static final int ERR_SUBDOMAIN_XOR_WILDCARD = 2;
public static final int ERR_PATH_REGEX = 3;
public static final int ERR_WILDCARD_BEGIN_OR_END = 4;
public static final int ERR_HOST_WRONG_CHARS = 5;
public static final int ERR_DOUBLE_OCCURANCE = 6;
public static final int ERR_HOST_REGEX = 7;
protected static final HashSet<String> BLACKLIST_TYPES = new HashSet<String>(Arrays.asList(new String[]{
Blacklist.BLACKLIST_CRAWLER,
Blacklist.BLACKLIST_PROXY,
Blacklist.BLACKLIST_DHT,
Blacklist.BLACKLIST_SEARCH,
Blacklist.BLACKLIST_SURFTIPS,
Blacklist.BLACKLIST_NEWS
}));
public static final String BLACKLIST_TYPES_STRING="proxy,crawler,dht,search,surftips,news";
protected File blacklistRootPath = null;
protected HashMap<String, Set<String>> cachedUrlHashs = null;
//protected HashMap<String, HashMap<String, ArrayList<String>>> hostpaths = null; // key=host, value=path; mapped url is http://host/path; path does not start with '/' here
protected HashMap<String, HashMap<String, ArrayList<String>>> hostpaths_matchable = null; // key=host, value=path; mapped url is http://host/path; path does not start with '/' here
protected HashMap<String, HashMap<String, ArrayList<String>>> hostpaths_notmatchable = null; // key=host, value=path; mapped url is http://host/path; path does not start with '/' here
public AbstractBlacklist(final File rootPath) {
this.setRootPath(rootPath);
this.blacklistRootPath = rootPath;
// prepare the data structure
//this.hostpaths = new HashMap<String, HashMap<String, ArrayList<String>>>();
this.hostpaths_matchable = new HashMap<String, HashMap<String, ArrayList<String>>>();
this.hostpaths_notmatchable = new HashMap<String, HashMap<String, ArrayList<String>>>();
this.cachedUrlHashs = new HashMap<String, Set<String>>();
final Iterator<String> iter = BLACKLIST_TYPES.iterator();
while (iter.hasNext()) {
final String blacklistType = iter.next();
//this.hostpaths.put(blacklistType, new HashMap<String, ArrayList<String>>());
this.hostpaths_matchable.put(blacklistType, new HashMap<String, ArrayList<String>>());
this.hostpaths_notmatchable.put(blacklistType, new HashMap<String, ArrayList<String>>());
this.cachedUrlHashs.put(blacklistType, Collections.synchronizedSet(new HashSet<String>()));
}
}
public void setRootPath(final File rootPath) {
if (rootPath == null)
throw new NullPointerException("The blacklist root path must not be null.");
if (!rootPath.isDirectory())
throw new IllegalArgumentException("The blacklist root path is not a directory.");
if (!rootPath.canRead())
throw new IllegalArgumentException("The blacklist root path is not readable.");
this.blacklistRootPath = rootPath;
}
protected HashMap<String, ArrayList<String>> getBlacklistMap(final String blacklistType,final boolean matchable) {
if (blacklistType == null) throw new IllegalArgumentException();
if (!BLACKLIST_TYPES.contains(blacklistType)) throw new IllegalArgumentException("Unknown blacklist type: "+blacklistType+".");
return (matchable)? this.hostpaths_matchable.get(blacklistType) : this.hostpaths_notmatchable.get(blacklistType);
}
protected Set<String> getCacheUrlHashsSet(final String blacklistType) {
if (blacklistType == null) throw new IllegalArgumentException();
if (!BLACKLIST_TYPES.contains(blacklistType)) throw new IllegalArgumentException("Unknown blacklist type.");
return this.cachedUrlHashs.get(blacklistType);
}
public void clear() {
for(final HashMap<String, ArrayList<String>> entry: this.hostpaths_matchable.values()) {
entry.clear();
}
for(final HashMap<String, ArrayList<String>> entry: this.hostpaths_notmatchable.values()) {
entry.clear();
}
for(final Set<String> entry: this.cachedUrlHashs.values()) {
entry.clear();
}
// clean up all search events in case that an old blacklist entry denied previously returned results, but does not anymore
SearchEventCache.cleanupEvents(true);
}
public int size() {
int size = 0;
for(final String entry: this.hostpaths_matchable.keySet()) {
for(final ArrayList<String> ientry: this.hostpaths_matchable.get(entry).values()) {
size += ientry.size();
}
}
for(final String entry: this.hostpaths_notmatchable.keySet()) {
for(final ArrayList<String> ientry: this.hostpaths_notmatchable.get(entry).values()) {
size += ientry.size();
}
}
return size;
}
public void loadList(final blacklistFile[] blFiles, final String sep) {
for (int j = 0; j < blFiles.length; j++) {
final blacklistFile blf = blFiles[j];
loadList(blf.getType(), blf.getFileName(), sep);
}
}
public void loadList(final blacklistFile blFile, final String sep) {
final HashMap<String, ArrayList<String>> blacklistMapMatch = getBlacklistMap(blFile.getType(),true);
final HashMap<String, ArrayList<String>> blacklistMapNotMatch = getBlacklistMap(blFile.getType(),false);
Set<Map.Entry<String, ArrayList<String>>> loadedBlacklist;
Map.Entry<String, ArrayList<String>> loadedEntry;
ArrayList<String> paths;
ArrayList<String> loadedPaths;
final String[] fileNames = blFile.getFileNamesUnified();
if (fileNames.length > 0) {
for (int i = 0; i < fileNames.length; i++) {
// make sure all requested blacklist files exist
final File file = new File(this.blacklistRootPath, fileNames[i]);
try {
file.createNewFile();
} catch (final IOException e) { /* */ }
// join all blacklists from files into one internal blacklist map
loadedBlacklist = SetTools.loadMapMultiValsPerKey(file.toString(), sep).entrySet();
for (final Iterator<Map.Entry<String, ArrayList<String>>> mi = loadedBlacklist.iterator(); mi.hasNext(); ) {
loadedEntry = mi.next();
loadedPaths = loadedEntry.getValue();
// create new entry if host mask unknown, otherwise merge
// existing one with path patterns from blacklist file
paths = (isMatchable(loadedEntry.getKey())) ? blacklistMapMatch.get(loadedEntry.getKey()) : blacklistMapNotMatch.get(loadedEntry.getKey());
if (paths == null) {
if(isMatchable(loadedEntry.getKey()))
blacklistMapMatch.put(loadedEntry.getKey(), loadedPaths);
else
blacklistMapNotMatch.put(loadedEntry.getKey(), loadedPaths);
} else {
// TODO check for duplicates? (refactor List -> Set)
paths.addAll(loadedPaths);
}
}
}
// clean up all search events in case that a (new) blacklist entry denies previously returned results
SearchEventCache.cleanupEvents(true);
}
}
public void loadList(final String blacklistType, final String fileNames, final String sep) {
// convenience method that keeps the older plasmaURLPattern interface working
final blacklistFile blFile = new blacklistFile(fileNames, blacklistType);
loadList(blFile, sep);
}
public void removeAll(final String blacklistType, final String host) {
getBlacklistMap(blacklistType,true).remove(host);
getBlacklistMap(blacklistType,false).remove(host);
// clean up all search events in case an old blacklist entry denied previously returned results but no longer does
SearchEventCache.cleanupEvents(true);
}
public void remove(final String blacklistType, final String host, final String path) {
final HashMap<String, ArrayList<String>> blacklistMap = getBlacklistMap(blacklistType,true);
ArrayList<String> hostList = blacklistMap.get(host);
if(hostList != null) {
hostList.remove(path);
if (hostList.size() == 0)
blacklistMap.remove(host);
}
final HashMap<String, ArrayList<String>> blacklistMapNotMatch = getBlacklistMap(blacklistType,false);
hostList = blacklistMapNotMatch.get(host);
if(hostList != null) {
hostList.remove(path);
if (hostList.size() == 0)
blacklistMapNotMatch.remove(host);
}
// clean up all search events in case an old blacklist entry denied previously returned results but no longer does
SearchEventCache.cleanupEvents(true);
}
public void add(final String blacklistType, String host, String path) {
if (host == null) throw new NullPointerException();
if (path == null) throw new NullPointerException();
if (path.length() > 0 && path.charAt(0) == '/') path = path.substring(1);
HashMap<String, ArrayList<String>> blacklistMap;
blacklistMap = (isMatchable(host)) ? getBlacklistMap(blacklistType,true) : getBlacklistMap(blacklistType,false);
// avoid a PatternSyntaxException
if(!isMatchable(host) && host.startsWith("*"))
host = "." + host;
ArrayList<String> hostList = blacklistMap.get(host.toLowerCase());
if (hostList == null) blacklistMap.put(host.toLowerCase(), (hostList = new ArrayList<String>()));
hostList.add(path);
// clean up all search events in case a (new) blacklist entry denies previously returned results
SearchEventCache.cleanupEvents(true);
}
public int blacklistCacheSize() {
int size = 0;
final Iterator<String> iter = this.cachedUrlHashs.keySet().iterator();
while (iter.hasNext()) {
final Set<String> blacklistMap = this.cachedUrlHashs.get(iter.next());
size += blacklistMap.size();
}
return size;
}
public boolean hashInBlacklistedCache(final String blacklistType, final String urlHash) {
final Set<String> urlHashCache = getCacheUrlHashsSet(blacklistType);
return urlHashCache.contains(urlHash);
}
public boolean contains(final String blacklistType, String host, String path) {
boolean ret = false;
if (blacklistType != null && host != null && path != null) {
HashMap<String, ArrayList<String>> blacklistMap;
blacklistMap = (isMatchable(host)) ? getBlacklistMap(blacklistType,true) : getBlacklistMap(blacklistType,false);
// avoid a PatternSyntaxException
if(!isMatchable(host) && host.startsWith("*"))
host = "." + host;
ArrayList<String> hostList = blacklistMap.get(host.toLowerCase());
if (hostList != null) ret = hostList.contains(path);
}
return ret;
}
public boolean isListed(final String blacklistType, final DigestURI url) {
final Set<String> urlHashCache = getCacheUrlHashsSet(blacklistType);
if (!urlHashCache.contains(url.hash())) {
final boolean temp = isListed(blacklistType, url.getHost().toLowerCase(), url.getFile());
if (temp) {
urlHashCache.add(url.hash());
}
return temp;
}
return true;
}
public static boolean isMatchable (final String host) {
try {
if(Pattern.matches("^[a-z0-9.-]*$", host)) // simple Domain (yacy.net or www.yacy.net)
return true;
if(Pattern.matches("^\\*\\.[a-z0-9-.]*$", host)) // start with *. (not .* and * must follow a dot)
return true;
if(Pattern.matches("^[a-z0-9-.]*\\.\\*$", host)) // ends with .* (not *. and befor * must be a dot)
return true;
} catch (final PatternSyntaxException e) {
//System.out.println(e.toString());
return false;
}
return false;
}
}
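Note that the mutators of the old class above (clear, loadList, removeAll, remove, add) triggered the search-event cleanup themselves, while the refactored Blacklist class below does not: the invalidation moves to the call sites. A minimal caller-side sketch of that pattern; the helper class and method names are hypothetical, and the de.anomic.search.SearchEventCache import is assumed to be available as elsewhere in the tree:

import de.anomic.search.SearchEventCache;
import net.yacy.repository.Blacklist;

public class BlacklistMutationExample {
    // hypothetical helper: mutate the blacklist, then invalidate cached search events
    public static void addAndInvalidate(final Blacklist blacklist,
                                        final String blacklistType,
                                        final String host, final String path) {
        blacklist.add(blacklistType, host, path);
        // drop cached search events so results denied by the new entry disappear
        SearchEventCache.cleanupEvents(true);
    }
}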
// Blacklist.java
// (C) 2005 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 11.07.2005 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package net.yacy.repository;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.Map.Entry;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.util.SetTools;
public class Blacklist {
public static final String BLACKLIST_DHT = "dht";
public static final String BLACKLIST_CRAWLER = "crawler";
public static final String BLACKLIST_PROXY = "proxy";
public static final String BLACKLIST_SEARCH = "search";
public static final String BLACKLIST_SURFTIPS = "surftips";
public static final String BLACKLIST_NEWS = "news";
public static final int ERR_TWO_WILDCARDS_IN_HOST = 1;
public static final int ERR_SUBDOMAIN_XOR_WILDCARD = 2;
public static final int ERR_PATH_REGEX = 3;
public static final int ERR_WILDCARD_BEGIN_OR_END = 4;
public static final int ERR_HOST_WRONG_CHARS = 5;
public static final int ERR_DOUBLE_OCCURANCE = 6;
public static final int ERR_HOST_REGEX = 7;
protected static final HashSet<String> BLACKLIST_TYPES = new HashSet<String>(Arrays.asList(new String[]{
Blacklist.BLACKLIST_CRAWLER,
Blacklist.BLACKLIST_PROXY,
Blacklist.BLACKLIST_DHT,
Blacklist.BLACKLIST_SEARCH,
Blacklist.BLACKLIST_SURFTIPS,
Blacklist.BLACKLIST_NEWS
}));
public static final String BLACKLIST_TYPES_STRING="proxy,crawler,dht,search,surftips,news";
protected File blacklistRootPath = null;
protected HashMap<String, Set<String>> cachedUrlHashs = null;
//protected HashMap<String, HashMap<String, ArrayList<String>>> hostpaths = null; // key=host, value=path; mapped url is http://host/path; path does not start with '/' here
protected HashMap<String, HashMap<String, ArrayList<String>>> hostpaths_matchable = null; // key=host, value=path; mapped url is http://host/path; path does not start with '/' here
protected HashMap<String, HashMap<String, ArrayList<String>>> hostpaths_notmatchable = null; // key=host, value=path; mapped url is http://host/path; path does not start with '/' here
public Blacklist(final File rootPath) {
this.setRootPath(rootPath);
this.blacklistRootPath = rootPath;
// prepare the data structure
//this.hostpaths = new HashMap<String, HashMap<String, ArrayList<String>>>();
this.hostpaths_matchable = new HashMap<String, HashMap<String, ArrayList<String>>>();
this.hostpaths_notmatchable = new HashMap<String, HashMap<String, ArrayList<String>>>();
this.cachedUrlHashs = new HashMap<String, Set<String>>();
final Iterator<String> iter = BLACKLIST_TYPES.iterator();
while (iter.hasNext()) {
final String blacklistType = iter.next();
//this.hostpaths.put(blacklistType, new HashMap<String, ArrayList<String>>());
this.hostpaths_matchable.put(blacklistType, new HashMap<String, ArrayList<String>>());
this.hostpaths_notmatchable.put(blacklistType, new HashMap<String, ArrayList<String>>());
this.cachedUrlHashs.put(blacklistType, Collections.synchronizedSet(new HashSet<String>()));
}
}
public void setRootPath(final File rootPath) {
if (rootPath == null)
throw new NullPointerException("The blacklist root path must not be null.");
if (!rootPath.isDirectory())
throw new IllegalArgumentException("The blacklist root path is not a directory.");
if (!rootPath.canRead())
throw new IllegalArgumentException("The blacklist root path is not readable.");
this.blacklistRootPath = rootPath;
}
protected HashMap<String, ArrayList<String>> getBlacklistMap(final String blacklistType,final boolean matchable) {
if (blacklistType == null) throw new IllegalArgumentException();
if (!BLACKLIST_TYPES.contains(blacklistType)) throw new IllegalArgumentException("Unknown blacklist type: "+blacklistType+".");
return (matchable)? this.hostpaths_matchable.get(blacklistType) : this.hostpaths_notmatchable.get(blacklistType);
}
protected Set<String> getCacheUrlHashsSet(final String blacklistType) {
if (blacklistType == null) throw new IllegalArgumentException();
if (!BLACKLIST_TYPES.contains(blacklistType)) throw new IllegalArgumentException("Unknown backlist type.");
return this.cachedUrlHashs.get(blacklistType);
}
public void clear() {
for(final HashMap<String, ArrayList<String>> entry: this.hostpaths_matchable.values()) {
entry.clear();
}
for(final HashMap<String, ArrayList<String>> entry: this.hostpaths_notmatchable.values()) {
entry.clear();
}
for(final Set<String> entry: this.cachedUrlHashs.values()) {
entry.clear();
}
}
public int size() {
int size = 0;
for(final String entry: this.hostpaths_matchable.keySet()) {
for(final ArrayList<String> ientry: this.hostpaths_matchable.get(entry).values()) {
size += ientry.size();
}
}
for(final String entry: this.hostpaths_notmatchable.keySet()) {
for(final ArrayList<String> ientry: this.hostpaths_notmatchable.get(entry).values()) {
size += ientry.size();
}
}
return size;
}
public void loadList(final BlacklistFile[] blFiles, final String sep) {
for (int j = 0; j < blFiles.length; j++) {
final BlacklistFile blf = blFiles[j];
loadList(blf.getType(), blf.getFileName(), sep);
}
}
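/**
 * Load the entries of one BlacklistFile into the internal maps. Missing
 * files are created as empty files; entries from several physical files
 * are merged per host mask, with matchable host masks and regex host
 * patterns kept in separate maps.
 */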
public void loadList(final BlacklistFile blFile, final String sep) {
final HashMap<String, ArrayList<String>> blacklistMapMatch = getBlacklistMap(blFile.getType(),true);
final HashMap<String, ArrayList<String>> blacklistMapNotMatch = getBlacklistMap(blFile.getType(),false);
Set<Map.Entry<String, ArrayList<String>>> loadedBlacklist;
Map.Entry<String, ArrayList<String>> loadedEntry;
ArrayList<String> paths;
ArrayList<String> loadedPaths;
final String[] fileNames = blFile.getFileNamesUnified();
if (fileNames.length > 0) {
for (int i = 0; i < fileNames.length; i++) {
// make sure all requested blacklist files exist
final File file = new File(this.blacklistRootPath, fileNames[i]);
try {
file.createNewFile();
} catch (final IOException e) { /* ignore: the blacklist file could not be created */ }
// join all blacklists from files into one internal blacklist map
loadedBlacklist = SetTools.loadMapMultiValsPerKey(file.toString(), sep).entrySet();
for (final Iterator<Map.Entry<String, ArrayList<String>>> mi = loadedBlacklist.iterator(); mi.hasNext(); ) {
loadedEntry = mi.next();
loadedPaths = loadedEntry.getValue();
// create new entry if host mask unknown, otherwise merge
// existing one with path patterns from blacklist file
paths = (isMatchable(loadedEntry.getKey())) ? blacklistMapMatch.get(loadedEntry.getKey()) : blacklistMapNotMatch.get(loadedEntry.getKey());
if (paths == null) {
if(isMatchable(loadedEntry.getKey()))
blacklistMapMatch.put(loadedEntry.getKey(), loadedPaths);
else
blacklistMapNotMatch.put(loadedEntry.getKey(), loadedPaths);
} else {
// TODO check for duplicates? (refactor List -> Set)
paths.addAll(loadedPaths);
}
}
}
}
}
public void loadList(final String blacklistType, final String fileNames, final String sep) {
// convenience method that keeps the older plasmaURLPattern interface working
final BlacklistFile blFile = new BlacklistFile(fileNames, blacklistType);
loadList(blFile, sep);
}
public void removeAll(final String blacklistType, final String host) {
getBlacklistMap(blacklistType,true).remove(host);
getBlacklistMap(blacklistType,false).remove(host);
}
public void remove(final String blacklistType, final String host, final String path) {
final HashMap<String, ArrayList<String>> blacklistMap = getBlacklistMap(blacklistType,true);
ArrayList<String> hostList = blacklistMap.get(host);
if(hostList != null) {
hostList.remove(path);
if (hostList.size() == 0)
blacklistMap.remove(host);
}
final HashMap<String, ArrayList<String>> blacklistMapNotMatch = getBlacklistMap(blacklistType,false);
hostList = blacklistMapNotMatch.get(host);
if (hostList != null) {
hostList.remove(path);
if (hostList.size() == 0)
blacklistMapNotMatch.remove(host);
}
}
public void add(final String blacklistType, String host, String path) {
if (host == null) throw new NullPointerException();
if (path == null) throw new NullPointerException();
if (path.length() > 0 && path.charAt(0) == '/') path = path.substring(1);
HashMap<String, ArrayList<String>> blacklistMap;
blacklistMap = (isMatchable(host)) ? getBlacklistMap(blacklistType,true) : getBlacklistMap(blacklistType,false);
// avoid a PatternSyntaxException
if(!isMatchable(host) && host.startsWith("*"))
host = "." + host;
ArrayList<String> hostList = blacklistMap.get(host.toLowerCase());
if (hostList == null) blacklistMap.put(host.toLowerCase(), (hostList = new ArrayList<String>()));
hostList.add(path);
}
public int blacklistCacheSize() {
int size = 0;
final Iterator<String> iter = this.cachedUrlHashs.keySet().iterator();
while (iter.hasNext()) {
final Set<String> blacklistMap = this.cachedUrlHashs.get(iter.next());
size += blacklistMap.size();
}
return size;
}
public boolean hashInBlacklistedCache(final String blacklistType, final String urlHash) {
final Set<String> urlHashCache = getCacheUrlHashsSet(blacklistType);
return urlHashCache.contains(urlHash);
}
public boolean contains(final String blacklistType, String host, String path) {
boolean ret = false;
if (blacklistType != null && host != null && path != null) {
HashMap<String, ArrayList<String>> blacklistMap;
blacklistMap = (isMatchable(host)) ? getBlacklistMap(blacklistType,true) : getBlacklistMap(blacklistType,false);
// avoid a PatternSyntaxException
if(!isMatchable(host) && host.startsWith("*"))
host = "." + host;
ArrayList<String> hostList = blacklistMap.get(host.toLowerCase());
if (hostList != null) ret = hostList.contains(path);
}
return ret;
}
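/**
 * Check a URL against this blacklist, using the per-type cache of URL
 * hashes: a cache hit answers immediately, a miss falls back to the
 * host/path matching below and caches a positive result.
 */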
public boolean isListed(final String blacklistType, final DigestURI url) {
final Set<String> urlHashCache = getCacheUrlHashsSet(blacklistType);
if (!urlHashCache.contains(url.hash())) {
final boolean temp = isListed(blacklistType, url.getHost().toLowerCase(), url.getFile());
if (temp) {
urlHashCache.add(url.hash());
}
return temp;
}
return true;
}
public static boolean isMatchable (final String host) {
try {
if(Pattern.matches("^[a-z0-9.-]*$", host)) // simple Domain (yacy.net or www.yacy.net)
return true;
if(Pattern.matches("^\\*\\.[a-z0-9-.]*$", host)) // start with *. (not .* and * must follow a dot)
return true;
if(Pattern.matches("^[a-z0-9-.]*\\.\\*$", host)) // ends with .* (not *. and befor * must be a dot)
return true;
} catch (final PatternSyntaxException e) {
//System.out.println(e.toString());
return false;
}
return false;
}
public String getEngineInfo() {
return "Default YaCy Blacklist Engine";
}
public boolean isListed(final String blacklistType, final String hostlow, String path) {
if (hostlow == null) throw new NullPointerException();
if (path == null) throw new NullPointerException();
// getting the proper blacklist
final HashMap<String, ArrayList<String>> blacklistMapMatched = getBlacklistMap(blacklistType,true);
if (path.length() > 0 && path.charAt(0) == '/') path = path.substring(1);
ArrayList<String> app;
boolean matched = false;
String pp = ""; // path-pattern
// try to match complete domain
if (!matched && (app = blacklistMapMatched.get(hostlow)) != null) {
for (int i=app.size()-1; !matched && i>-1; i--) {
pp = app.get(i);
matched |= ((pp.equals("*")) || (path.matches(pp)));
}
}
// next, try to match the domain against wildcard entries ('*' standing for full sub-domains)
// walk forward while further '.' separators are found within the host string
int index = 0;
while (!matched && (index = hostlow.indexOf('.', index + 1)) != -1) {
if ((app = blacklistMapMatched.get(hostlow.substring(0, index + 1) + "*")) != null) {
for (int i=app.size()-1; !matched && i>-1; i--) {
pp = app.get(i);
matched |= ((pp.equals("*")) || (path.matches(pp)));
}
}
if ((app = blacklistMapMatched.get(hostlow.substring(0, index))) != null) {
for (int i=app.size()-1; !matched && i>-1; i--) {
pp = app.get(i);
matched |= ((pp.equals("*")) || (path.matches(pp)));
}
}
}
index = hostlow.length();
while (!matched && (index = hostlow.lastIndexOf('.', index - 1)) != -1) {
if ((app = blacklistMapMatched.get("*" + hostlow.substring(index, hostlow.length()))) != null) {
for (int i=app.size()-1; !matched && i>-1; i--) {
pp = app.get(i);
matched |= ((pp.equals("*")) || (path.matches(pp)));
}
}
if ((app = blacklistMapMatched.get(hostlow.substring(index +1, hostlow.length()))) != null) {
for (int i=app.size()-1; !matched && i>-1; i--) {
pp = app.get(i);
matched |= ((pp.equals("*")) || (path.matches(pp)));
}
}
}
// loop over all regex entries (host patterns that are not plain-matchable)
if(!matched) {
final HashMap<String, ArrayList<String>> blacklistMapNotMatched = getBlacklistMap(blacklistType,false);
String key;
for(final Entry<String, ArrayList<String>> entry: blacklistMapNotMatched.entrySet()) {
key = entry.getKey();
try {
if(Pattern.matches(key, hostlow)) {
app = entry.getValue();
for (int i=0; i<app.size(); i++) {
if(Pattern.matches(app.get(i), path))
return true;
}
}
} catch (final PatternSyntaxException e) {
//System.out.println(e.toString());
}
}
}
return matched;
}
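/**
 * Check a single blacklist line ("host/path" or just "host") for syntax
 * errors. Returns 0 for a well-formed entry or one of the ERR_* codes
 * above; the optional properties map may carry an "allowRegex" flag that
 * permits the host part to be a regular expression.
 */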
public int checkError(String element, Map<String, String> properties) {
boolean allowRegex = true;
int slashPos;
String host, path;
if (properties != null) {
allowRegex = properties.get("allowRegex").equalsIgnoreCase("true") ? true : false;
}
if ((slashPos = element.indexOf("/")) == -1) {
host = element;
path = ".*";
} else {
host = element.substring(0, slashPos);
path = element.substring(slashPos + 1);
}
if (!allowRegex || !isValidRegex(host)) {
final int i = host.indexOf("*");
// check whether host begins illegally
if (!host.matches("([A-Za-z0-9_-]+|\\*)(\\.([A-Za-z0-9_-]+|\\*))*")) {
if (i == 0 && host.length() > 1 && host.charAt(1) != '.') {
return ERR_SUBDOMAIN_XOR_WILDCARD;
}
return ERR_HOST_WRONG_CHARS;
}
// in host-part only full sub-domains may be wildcards
if (host.length() > 0 && i > -1) {
if (!(i == 0 || i == host.length() - 1)) {
return ERR_WILDCARD_BEGIN_OR_END;
}
if (i == host.length() - 1 && host.length() > 1 && host.charAt(i - 1) != '.') {
return ERR_SUBDOMAIN_XOR_WILDCARD;
}
}
// check for double occurrences of "*" in host
if (host.indexOf("*", i + 1) > -1) {
return ERR_TWO_WILDCARDS_IN_HOST;
}
} else if (allowRegex && !isValidRegex(host)) {
return ERR_HOST_REGEX;
}
// check for errors on regex-compiling path
if (!isValidRegex(path) && !path.equals("*")) {
return ERR_PATH_REGEX;
}
return 0;
}
/**
* Checks if a given expression is a valid regular expression.
* @param expression The expression to be checked.
* @return True if the expression is a valid regular expression, else false.
*/
private static boolean isValidRegex(String expression) {
boolean ret = true;
try {
Pattern.compile(expression);
} catch (final PatternSyntaxException e) {
ret = false;
}
return ret;
}
}
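A minimal usage sketch for the merged engine above; the list directory and the entries are hypothetical, and the constructor requires an existing, readable directory:

import java.io.File;
import net.yacy.repository.Blacklist;

public class BlacklistUsageExample {
    public static void main(final String[] args) {
        // "DATA/LISTS" is a hypothetical, existing and readable directory
        final Blacklist blacklist = new Blacklist(new File("DATA/LISTS"));
        // host part: one full sub-domain may be a wildcard; path part: a regex
        blacklist.add(Blacklist.BLACKLIST_CRAWLER, "*.example.com", "ads/.*");
        // resolved via the backward wildcard walk in isListed(type, hostlow, path)
        System.out.println(blacklist.isListed(Blacklist.BLACKLIST_CRAWLER,
                "www.example.com", "ads/banner.gif")); // true
        // checkError returns 0 for a well-formed entry, else one of the ERR_* codes
        System.out.println(blacklist.checkError("*.example.com/ads/.*", null)); // 0
    }
}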

@ -0,0 +1,58 @@
// BlacklistFile.java
// (C) 2005 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 11.07.2005 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2009-09-29 23:28:49 +0200 (Di, 29 Sep 2009) $
// $LastChangedRevision: 6359 $
// $LastChangedBy: low012 $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package net.yacy.repository;
import java.util.Arrays;
import java.util.HashSet;
public class BlacklistFile {
private final String filename;
private final String type;
public BlacklistFile(final String filename, final String type) {
this.filename = filename;
this.type = type;
}
public String getFileName() { return this.filename; }
/**
* Construct a unified array of file names from a comma-separated file name
* list.
*
* @return unified String array of file names
*/
public String[] getFileNamesUnified() {
final HashSet<String> hs = new HashSet<String>(Arrays.asList(this.filename.split(",")));
return hs.toArray(new String[hs.size()]);
}
public String getType() { return this.type; }
}
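A short sketch of the helper above (file names are hypothetical): a comma-separated file list is de-duplicated by getFileNamesUnified(), so a repeated name appears only once.

import net.yacy.repository.Blacklist;
import net.yacy.repository.BlacklistFile;

public class BlacklistFileExample {
    public static void main(final String[] args) {
        // the duplicate "url.black" is collapsed into a single entry
        final BlacklistFile blf =
                new BlacklistFile("url.black,dht.black,url.black", Blacklist.BLACKLIST_DHT);
        for (final String name : blf.getFileNamesUnified()) {
            System.out.println(name); // order is unspecified (HashSet-backed)
        }
        System.out.println(blf.getType()); // "dht"
    }
}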

@ -421,11 +421,6 @@ public final class LoaderDispatcher {
return null;
}
}
public static Document parseDocument(final DigestURI url, final long contentLength, final InputStream resourceStream) throws ParserException {
return parseDocument(url, contentLength, resourceStream, null);
}
public static ContentScraper parseResource(final LoaderDispatcher loader, final DigestURI location, int cachePolicy) throws IOException {
// load page
