Team added support for URLs with Unicode characters in the host part to the
blacklist. Punycode is used to handle the Unicode characters.
pull/1/head
Marc Nause 11 years ago
parent fbf1656a67
commit 809b4e1fd9

@ -41,6 +41,7 @@ import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import net.yacy.cora.document.id.Punycode.PunycodeException;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.data.ListManager;
@ -55,84 +56,122 @@ import net.yacy.server.serverSwitch;
public class BlacklistCleaner_p {
/** Used for logging. */
private static final String APP_NAME = "BLACKLIST-CLEANER";
private static final String RESULTS = "results_";
private static final String DISABLED = "disabled_";
private static final String BLACKLISTS = "blacklists_";
private static final String ENTRIES = "entries_";
public static final Class<?>[] supportedBLEngines = {
Blacklist.class
};
public static final Class<?>[] supportedBLEngines = { Blacklist.class };
public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, final serverObjects post, @SuppressWarnings("unused") final serverSwitch env) {
public static serverObjects respond(
@SuppressWarnings("unused") final RequestHeader header,
final serverObjects post,
@SuppressWarnings("unused") final serverSwitch env) {
final serverObjects prop = new serverObjects();
String blacklistToUse = null;
prop.put(DISABLED+"checked", "1");
prop.put(DISABLED + "checked", "1");
if (post != null) {
final boolean allowRegex = post.get("allowRegex", "off").equalsIgnoreCase("on") ? true: false;
prop.put(DISABLED+"checked", (allowRegex) ? "1" : "0");
final boolean allowRegex = post.get("allowRegex", "off")
.equalsIgnoreCase("on") ? true : false;
prop.put(DISABLED + "checked", (allowRegex) ? "1" : "0");
if (post.containsKey("listNames")) {
blacklistToUse = post.get("listNames");
if (blacklistToUse.isEmpty() || !ListManager.listSetContains("listManager.listsPath", blacklistToUse)) {
if (blacklistToUse.isEmpty()
|| !ListManager.listSetContains(
"listManager.listsPath",
blacklistToUse)) {
prop.put("results", "2");
}
}
putBlacklists(prop, FileUtils.getDirListing(ListManager.listsPath, Blacklist.BLACKLIST_FILENAME_FILTER), blacklistToUse);
putBlacklists(prop, FileUtils.getDirListing(ListManager.listsPath,
Blacklist.BLACKLIST_FILENAME_FILTER),
blacklistToUse);
if (blacklistToUse != null) {
prop.put("results", "1");
if (post.containsKey("delete")) {
prop.put(RESULTS + "modified", "1");
prop.put(RESULTS + "modified_delCount", removeEntries(blacklistToUse, BlacklistType.values(), getKeysByPrefix(post, "select", true)));
prop.put(RESULTS + "modified_delCount",
removeEntries(blacklistToUse,
BlacklistType.values(),
getKeysByPrefix(post,
"select",
true)));
} else if (post.containsKey("alter")) {
prop.put(RESULTS + "modified", "2");
prop.put(RESULTS + "modified_alterCount", alterEntries(blacklistToUse, BlacklistType.values(), getKeysByPrefix(post, "select", false), getValuesByPrefix(post, "entry", false)));
prop.put(RESULTS + "modified_alterCount",
alterEntries(blacklistToUse,
BlacklistType.values(),
getKeysByPrefix(post,
"select",
false),
getValuesByPrefix(post,
"entry",
false)));
}
// list illegal entries
final Map<String, BlacklistError> illegalEntries = getIllegalEntries(blacklistToUse, Switchboard.urlBlacklist, allowRegex);
final Map<String, BlacklistError> illegalEntries = getIllegalEntries(
blacklistToUse, Switchboard.urlBlacklist,
allowRegex);
prop.put(RESULTS + "blList", blacklistToUse);
prop.put(RESULTS + "entries", illegalEntries.size());
prop.putHTML(RESULTS + "blEngine", Blacklist.getEngineInfo());
prop.put(RESULTS + "disabled", (illegalEntries.isEmpty()) ? "1" : "0");
prop.put(RESULTS + "disabled", (illegalEntries.isEmpty()) ? "1"
: "0");
if (!illegalEntries.isEmpty()) {
prop.put(RESULTS + DISABLED + "entries", illegalEntries.size());
prop.put(RESULTS + DISABLED + "entries",
illegalEntries.size());
int i = 0;
String key;
for (final Entry<String, BlacklistError> entry : illegalEntries.entrySet()) {
for (final Entry<String, BlacklistError> entry : illegalEntries
.entrySet()) {
key = entry.getKey();
prop.put(RESULTS + DISABLED + ENTRIES + i + "_error", entry.getValue().getLong());
prop.putHTML(RESULTS + DISABLED + ENTRIES + i + "_entry", key);
prop.put(RESULTS + DISABLED + ENTRIES + i + "_error",
entry.getValue().getLong());
prop.putHTML(RESULTS + DISABLED + ENTRIES + i
+ "_entry", key);
i++;
}
}
}
} else {
prop.put("results", "0");
putBlacklists(prop, FileUtils.getDirListing(ListManager.listsPath, Blacklist.BLACKLIST_FILENAME_FILTER), blacklistToUse);
putBlacklists(prop, FileUtils.getDirListing(ListManager.listsPath,
Blacklist.BLACKLIST_FILENAME_FILTER),
blacklistToUse);
}
return prop;
}
/**
* Adds a list of blacklist to the server objects properties which are used to
* display the blacklist in the HTML page belonging to this servlet.
* @param prop Server objects properties object.
* @param lists List of blacklists.
* @param selected Element in list of blacklists which will be preselected in HTML.
* Adds a list of blacklist to the server objects properties which are used
* to display the blacklist in the HTML page belonging to this servlet.
*
* @param prop
* Server objects properties object.
* @param lists
* List of blacklists.
* @param selected
* Element in list of blacklists which will be preselected in
* HTML.
*/
private static void putBlacklists(final serverObjects prop, final List<String> lists, final String selected) {
private static void putBlacklists(final serverObjects prop,
final List<String> lists, final String selected) {
boolean supported = false;
for (int i=0; i < supportedBLEngines.length && !supported; i++) {
for (int i = 0; i < supportedBLEngines.length && !supported; i++) {
supported |= (Switchboard.urlBlacklist.getClass() == supportedBLEngines[i]);
}
@ -143,7 +182,8 @@ public class BlacklistCleaner_p {
int count = 0;
for (final String list : lists) {
prop.putHTML(DISABLED + BLACKLISTS + count + "_name", list);
prop.put(DISABLED + BLACKLISTS + count + "_selected", (list.equals(selected)) ? "1" : "0");
prop.put(DISABLED + BLACKLISTS + count + "_selected",
(list.equals(selected)) ? "1" : "0");
count++;
}
} else {
@ -152,48 +192,68 @@ public class BlacklistCleaner_p {
} else {
prop.put("disabled", "1");
for (int i = 0; i < supportedBLEngines.length; i++) {
prop.putHTML(DISABLED + "engines_" + i + "_name", supportedBLEngines[i].getName());
prop.putHTML(DISABLED + "engines_" + i + "_name",
supportedBLEngines[i].getName());
}
prop.put(DISABLED + "engines", supportedBLEngines.length);
}
}
/**
* Retrieves all keys with a certain prefix from the data which has been sent and returns them as an array. This
* method is only a wrapper for {@link getByPrefix(de.anomic.server.serverObjects, java.lang.String, boolean, boolean)}
* which has been created to make it easier to understand the code.
* @param post All POST values.
* @param prefix Prefix by which the input is filtered.
* @param filterDoubles Set true if only unique results shall be returned, else false.
* Retrieves all keys with a certain prefix from the data which has been
* sent and returns them as an array. This method is only a wrapper for
* {@link getByPrefix(de.anomic.server.serverObjects, java.lang.String,
* boolean, boolean)} which has been created to make it easier to understand
* the code.
*
* @param post
* All POST values.
* @param prefix
* Prefix by which the input is filtered.
* @param filterDoubles
* Set true if only unique results shall be returned, else false.
* @return Keys which have been posted.
*/
private static String[] getKeysByPrefix(final serverObjects post, final String prefix, final boolean filterDoubles) {
private static String[] getKeysByPrefix(final serverObjects post,
final String prefix, final boolean filterDoubles) {
return getByPrefix(post, prefix, true, filterDoubles);
}
/**
* Retrieves all values with a certain prefix from the data which has been sent and returns them as an array. This
* method is only a wrapper for {@link getByPrefix(de.anomic.server.serverObjects, java.lang.String, boolean, boolean)}.
* @param post All POST values.
* @param prefix Prefix by which the input is filtered.
* @param filterDoubles Set true if only unique results shall be returned, else false.
* Retrieves all values with a certain prefix from the data which has been
* sent and returns them as an array. This method is only a wrapper for
* {@link getByPrefix(de.anomic.server.serverObjects, java.lang.String,
* boolean, boolean)}.
*
* @param post
* All POST values.
* @param prefix
* Prefix by which the input is filtered.
* @param filterDoubles
* Set true if only unique results shall be returned, else false.
* @return Values which have been posted.
*/
private static String[] getValuesByPrefix(final serverObjects post, final String prefix, final boolean filterDoubles) {
private static String[] getValuesByPrefix(final serverObjects post,
final String prefix, final boolean filterDoubles) {
return getByPrefix(post, prefix, false, filterDoubles);
}
/**
* Method which does all the work for {@link getKeysByPrefix(de.anomic.server.serverObjects, java.lang.String prefix, boolean)}
* and {@link getValuesByPrefix(de.anomic.server.serverObjects, java.lang.String prefix, boolean)} which
* have been crested to make it easier to understand the code.
* Method which does all the work for {@link
* getKeysByPrefix(de.anomic.server.serverObjects, java.lang.String prefix,
* boolean)} and {@link getValuesByPrefix(de.anomic.server.serverObjects,
* java.lang.String prefix, boolean)} which have been created to make it
* easier to understand the code.
*
* @param post
* @param prefix
* @param useKeys
* @param useHashSet
* @return
*/
private static String[] getByPrefix(final serverObjects post, final String prefix, final boolean useKeys, final boolean useHashSet) {
private static String[] getByPrefix(final serverObjects post,
final String prefix, final boolean useKeys,
final boolean useHashSet) {
Collection<String> r;
if (useHashSet) {
r = new HashSet<String>();
@ -220,19 +280,27 @@ public class BlacklistCleaner_p {
/**
* Finds illegal entries in black list.
* @param blacklistToUse The blacklist to be checked.
* @param blEngine The blacklist engine which is used to check
* @param allowRegex Set to true to allow regular expressions in host part of blacklist entry.
* @return A map which contains all entries whoch have been identified as being
* illegal by the blacklistEngine with the entry as key and an error code as
* value.
*
* @param blacklistToUse
* The blacklist to be checked.
* @param blEngine
* The blacklist engine which is used to check
* @param allowRegex
* Set to true to allow regular expressions in host part of
* blacklist entry.
* @return A map which contains all entries which have been identified as
* being illegal by the blacklistEngine with the entry as key and an
* error code as value.
*/
private static Map<String, BlacklistError> getIllegalEntries(final String blacklistToUse, final Blacklist blEngine, final boolean allowRegex) {
private static Map<String, BlacklistError> getIllegalEntries(
final String blacklistToUse, final Blacklist blEngine,
final boolean allowRegex) {
final Map<String, BlacklistError> illegalEntries = new HashMap<String, BlacklistError>();
final Set<String> legalEntries = new HashSet<String>();
final List<String> list = FileUtils.getListArray(new File(ListManager.listsPath, blacklistToUse));
final Map<String, String> properties= new HashMap<String, String>();
final List<String> list = FileUtils.getListArray(new File(
ListManager.listsPath, blacklistToUse));
final Map<String, String> properties = new HashMap<String, String>();
properties.put("allowRegex", String.valueOf(allowRegex));
BlacklistError err = BlacklistError.NO_ERROR;
@ -259,30 +327,42 @@ public class BlacklistCleaner_p {
/**
* Removes existing entries from a blacklist.
* @param blacklistToUse The blacklist which contains the
* @param supportedBlacklistTypes Types of blacklists which the entry is to changed in.
* @param entries Array of entries to be deleted.
*
* @param blacklistToUse
* The blacklist which contains the entries.
* @param supportedBlacklistTypes
* Types of blacklists which the entry is to be changed in.
* @param entries
* Array of entries to be deleted.
* @return Length of the list of entries to be removed.
*/
private static int removeEntries(final String blacklistToUse, final BlacklistType[] supportedBlacklistTypes, final String[] entries) {
private static int removeEntries(final String blacklistToUse,
final BlacklistType[] supportedBlacklistTypes,
final String[] entries) {
for (final String entry : entries) {
String s = entry;
// get rid of escape characters which make it impossible to
// properly use contains()
if (s.contains("\\\\")) {
s = s.replaceAll(Pattern.quote("\\\\"), Matcher.quoteReplacement("\\"));
s = s.replaceAll(Pattern.quote("\\\\"),
Matcher.quoteReplacement("\\"));
}
// remove the entry from the running blacklist engine
for (final BlacklistType supportedBlacklistType : supportedBlacklistTypes) {
if (ListManager.listSetContains(supportedBlacklistType + ".BlackLists", blacklistToUse)) {
final String host = (s.indexOf('/',0) == -1) ? s : s.substring(0, s.indexOf('/',0));
final String path = (s.indexOf('/',0) == -1) ? ".*" : s.substring(s.indexOf('/',0) + 1);
if (ListManager.listSetContains(supportedBlacklistType
+ ".BlackLists", blacklistToUse)) {
final String host = (s.indexOf('/', 0) == -1) ? s : s
.substring(0, s.indexOf('/', 0));
final String path = (s.indexOf('/', 0) == -1) ? ".*" : s
.substring(s.indexOf('/', 0) + 1);
try {
Switchboard.urlBlacklist.remove(supportedBlacklistType, blacklistToUse, host, path);
Switchboard.urlBlacklist.remove(supportedBlacklistType,
blacklistToUse, host, path);
} catch (final RuntimeException e) {
ConcurrentLog.severe("BLACKLIST-CLEANER", e.getMessage() + ": " + host + "/" + path);
ConcurrentLog.severe(APP_NAME, e.getMessage() + ": "
+ host + "/" + path);
}
}
}
@ -293,35 +373,46 @@ public class BlacklistCleaner_p {
/**
* Changes existing entry in a blacklist.
* @param blacklistToUse The blacklist which contains the entry.
* @param supportedBlacklistTypes Types of blacklists which the entry is to changed in.
* @param oldEntry Entry to be changed.
* @param newEntry Changed entry.
*
* @param blacklistToUse
* The blacklist which contains the entry.
* @param supportedBlacklistTypes
* Types of blacklists which the entry is to be changed in.
* @param oldEntry
* Entry to be changed.
* @param newEntry
* Changed entry.
* @return The number of new entries (the length of the newEntry array).
*/
private static int alterEntries(
final String blacklistToUse,
final BlacklistType[] supportedBlacklistTypes,
final String[] oldEntry,
final String[] newEntry) {
private static int alterEntries(final String blacklistToUse,
final BlacklistType[] supportedBlacklistTypes,
final String[] oldEntry, final String[] newEntry) {
removeEntries(blacklistToUse, supportedBlacklistTypes, oldEntry);
String host, path;
for (final String n : newEntry) {
final int pos = n.indexOf('/',0);
if (pos < 0) {
host = n;
path = ".*";
} else {
host = n.substring(0, pos);
path = n.substring(pos + 1);
}
for (final BlacklistType s : supportedBlacklistTypes) {
if (ListManager.listSetContains(s + ".BlackLists",blacklistToUse)) {
Switchboard.urlBlacklist.add(s, blacklistToUse, host, path);
}
}
SearchEventCache.cleanupEvents(true);
}
for (final String n : newEntry) {
final int pos = n.indexOf('/', 0);
if (pos < 0) {
host = n;
path = ".*";
} else {
host = n.substring(0, pos);
path = n.substring(pos + 1);
}
for (final BlacklistType s : supportedBlacklistTypes) {
if (ListManager.listSetContains(s + ".BlackLists",
blacklistToUse)) {
try {
Switchboard.urlBlacklist.add(s, blacklistToUse, host,
path);
} catch (PunycodeException e) {
ConcurrentLog.warn(APP_NAME,
"Unable to add blacklist entry to blacklist "
+ s, e);
}
}
}
SearchEventCache.cleanupEvents(true);
}
return newEntry.length;
}
}

@ -34,6 +34,9 @@ import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import org.eclipse.jetty.util.log.Log;
import net.yacy.cora.document.id.Punycode.PunycodeException;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.ConcurrentLog;
@ -48,13 +51,16 @@ import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch;
public class Blacklist_p {
/** Used for logging. */
private static final String APP_NAME = "Blacklist";
private final static String EDIT = "edit_";
private final static String DISABLED = "disabled_";
private final static String BLACKLIST = "blackLists_";
private final static String BLACKLIST_MOVE = "blackListsMove_";
private final static String BLACKLIST_SHARED = "BlackLists.Shared";
public static serverObjects respond(final RequestHeader header, final serverObjects post, @SuppressWarnings("unused") final serverSwitch env) {
// load all blacklist files located in the directory
@ -134,7 +140,7 @@ public class Blacklist_p {
final File blackListFile = new File(ListManager.listsPath, blacklistToUse);
if(!blackListFile.delete()) {
ConcurrentLog.warn("Blacklist", "file "+ blackListFile +" could not be deleted!");
ConcurrentLog.warn(APP_NAME, "file "+ blackListFile +" could not be deleted!");
}
for (final BlacklistType supportedBlacklistType : BlacklistType.values()) {
@ -551,7 +557,7 @@ public class Blacklist_p {
// ignore empty entries
if(newEntry == null || newEntry.isEmpty()) {
ConcurrentLog.warn("Blacklist", "skipped adding an empty entry");
ConcurrentLog.warn(APP_NAME, "skipped adding an empty entry");
return "";
}
@ -582,7 +588,11 @@ public class Blacklist_p {
for (final BlacklistType supportedBlacklistType : BlacklistType.values()) {
if (ListManager.listSetContains(supportedBlacklistType + ".BlackLists",blacklistToUse)) {
Switchboard.urlBlacklist.add(supportedBlacklistType, blacklistToUse, host, path);
try {
Switchboard.urlBlacklist.add(supportedBlacklistType, blacklistToUse, host, path);
} catch (PunycodeException e) {
ConcurrentLog.warn(APP_NAME, "Unable to add blacklist entry to blacklist " + supportedBlacklistType, e);
}
}
}

@ -36,6 +36,7 @@ import java.util.Set;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.Punycode.PunycodeException;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.data.ResultURLs;
@ -54,6 +55,9 @@ import net.yacy.utils.nxTools;
public class CrawlResults {
/** Used for logging. */
private static final String APP_NAME = "PLASMA";
public static serverObjects respond(final RequestHeader header, serverObjects post, final serverSwitch env) {
// return variable that accumulates replacements
final Switchboard sb = (Switchboard) env;
@ -139,7 +143,12 @@ public class CrawlResults {
// handle addtoblacklist
if (post.containsKey("delandaddtoblacklist")) {
Switchboard.urlBlacklist.add(selectedblacklist, domain, ".*");
try {
Switchboard.urlBlacklist.add(selectedblacklist, domain, ".*");
} catch (PunycodeException e) {
ConcurrentLog.warn(APP_NAME, "Unable to add blacklist entry to blacklist " + selectedblacklist, e);
}
}
}
}
@ -204,7 +213,7 @@ public class CrawlResults {
urle = sb.index.fulltext().getMetadata(urlhash);
}
if (urle == null) {
ConcurrentLog.warn("PLASMA", "CrawlResults: URL not in index with url hash " + entry.getKey());
ConcurrentLog.warn(APP_NAME, "CrawlResults: URL not in index with url hash " + entry.getKey());
urlstr = null;
urltxt = null;
continue;
@ -291,7 +300,7 @@ public class CrawlResults {
dark = !dark;
cnt++;
} catch (final Exception e) {
ConcurrentLog.severe("PLASMA", "genTableProps", e);
ConcurrentLog.severe(APP_NAME, "genTableProps", e);
}
}
prop.put("table_indexed", cnt);
@ -331,9 +340,6 @@ public class CrawlResults {
prop.put("table_blacklists", blacklistCount);
}
}
prop.put("process", tabletype.getCode());
// return rewrite properties

@ -35,6 +35,7 @@ import net.yacy.cora.document.analysis.Classification.ContentDomain;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.document.id.Punycode.PunycodeException;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.RequestHeader;
@ -70,6 +71,8 @@ import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch;
public class IndexControlRWIs_p {
private static final String APP_NAME = "IndexControlRWIs_p";
private final static String errmsg = "not possible to compute word from hash";
@ -381,11 +384,17 @@ public class IndexControlRWIs_p {
if ( ListManager.listSetContains(
supportedBlacklistType + ".BlackLists",
blacklist) ) {
Switchboard.urlBlacklist.add(
BlacklistType.valueOf(supportedBlacklistType),
blacklist,
url.getHost(),
url.getFile());
try {
Switchboard.urlBlacklist.add(
BlacklistType.valueOf(supportedBlacklistType),
blacklist,
url.getHost(),
url.getFile());
} catch (PunycodeException e) {
ConcurrentLog.warn(APP_NAME,
"Unable to add blacklist entry to blacklist "
+ supportedBlacklistType, e);
}
}
}
SearchEventCache.cleanupEvents(true);
@ -408,11 +417,17 @@ public class IndexControlRWIs_p {
if ( ListManager.listSetContains(
supportedBlacklistType + ".BlackLists",
blacklist) ) {
Switchboard.urlBlacklist.add(
supportedBlacklistType,
blacklist,
url.getHost(),
".*");
try {
Switchboard.urlBlacklist.add(
supportedBlacklistType,
blacklist,
url.getHost(),
".*");
} catch (PunycodeException e) {
ConcurrentLog.warn(APP_NAME,
"Unable to add blacklist entry to blacklist "
+ supportedBlacklistType, e);
}
}
}
}

@ -240,24 +240,28 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
// handle international domains
if (!Punycode.isBasic(this.host)) try {
final String[] domainParts = CommonPattern.DOT.split(this.host, 0);
final StringBuilder buffer = new StringBuilder(80);
// encode each domain-part separately
for(int i = 0; i < domainParts.length; i++) {
final String part = domainParts[i];
if (!Punycode.isBasic(part)) {
buffer.append("xn--").append(Punycode.encode(part));
} else {
buffer.append(part);
}
if (i != domainParts.length-1) {
buffer.append('.');
}
}
this.host = buffer.toString();
this.host = toPunycode(this.host);
} catch (final PunycodeException e) {}
}
/**
 * Converts a host name into its Punycode form, one dot-separated part at a
 * time.
 *
 * Parts for which {@code Punycode.isBasic} returns true are copied
 * unchanged. Every other part is passed through {@code Punycode.encode} and
 * prefixed with {@code xn--}. The parts are joined again with a single dot.
 *
 * NOTE(review): the host is split with limit 0; if CommonPattern.DOT is a
 * java.util.regex.Pattern, trailing empty parts (e.g. caused by a trailing
 * dot) are dropped from the result - confirm.
 *
 * @param host host name to convert
 * @return the host name with every non-basic part Punycode-encoded
 * @throws PunycodeException propagated from {@code Punycode.encode}
 */
public static String toPunycode(final String host) throws PunycodeException {
    final String[] labels = CommonPattern.DOT.split(host, 0);
    final StringBuilder encoded = new StringBuilder(80);
    boolean first = true;
    for (final String label : labels) {
        // the separator goes in front of every label except the first one
        if (first) {
            first = false;
        } else {
            encoded.append('.');
        }
        if (Punycode.isBasic(label)) {
            encoded.append(label);
        } else {
            // each label is encoded on its own and marked with the ACE prefix
            encoded.append("xn--").append(Punycode.encode(label));
        }
    }
    return encoded.toString();
}
/**
 * Tells whether the given string begins with the literal prefix
 * {@code http://}. The comparison is case-sensitive.
 *
 * @param s string to inspect
 * @return true if {@code s} starts with {@code http://}, else false
 */
public static final boolean isHTTP(final String s) {
    final String prefix = "http://";
    return s.regionMatches(0, prefix, 0, prefix.length());
}
/**
 * Tells whether the given string begins with the literal prefix
 * {@code https://}. The comparison is case-sensitive.
 *
 * @param s string to inspect
 * @return true if {@code s} starts with {@code https://}, else false
 */
public static final boolean isHTTPS(final String s) {
    final String prefix = "https://";
    return s.regionMatches(0, prefix, 0, prefix.length());
}
/**
 * Tells whether the given string begins with the literal prefix
 * {@code ftp://}. The comparison is case-sensitive.
 *
 * @param s string to inspect
 * @return true if {@code s} starts with {@code ftp://}, else false
 */
public static final boolean isFTP(final String s) {
    final String prefix = "ftp://";
    return s.regionMatches(0, prefix, 0, prefix.length());
}

@ -46,6 +46,9 @@ import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.document.id.Punycode;
import net.yacy.cora.document.id.Punycode.PunycodeException;
import net.yacy.cora.storage.HandleSet;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.SpaceExceededException;
@ -301,11 +304,22 @@ public class Blacklist {
}
}
public final void add(final BlacklistType blacklistType, final String blacklistToUse, final String host, final String path) {
if (contains(blacklistType, host, path)) {
/**
*
* @param blacklistType
* @param blacklistToUse
* @param host
* @param path
* @throws PunycodeException
*/
public final void add(final BlacklistType blacklistType, final String blacklistToUse, final String host, final String path) throws PunycodeException {
final String safeHost = Punycode.isBasic(host) ? host : MultiProtocolURL.toPunycode(host);
if (contains(blacklistType, safeHost, path)) {
return;
}
if (host == null) {
if (safeHost == null) {
throw new IllegalArgumentException("host may not be null");
}
if (path == null) {
@ -316,7 +330,7 @@ public class Blacklist {
final Map<String, Set<Pattern>> blacklistMap = getBlacklistMap(blacklistType, isMatchable(host));
// avoid PatternSyntaxException e
final String h = ((!isMatchable(host) && !host.isEmpty() && host.charAt(0) == '*') ? "." + host : host).toLowerCase();
final String h = ((!isMatchable(safeHost) && !safeHost.isEmpty() && safeHost.charAt(0) == '*') ? "." + safeHost : safeHost).toLowerCase();
if (!p.isEmpty() && p.charAt(0) == '*') {
p = "." + p;
}
@ -356,13 +370,14 @@ public class Blacklist {
}
/**
* appends a entry to the backlist source file
* Appends an entry to the blacklist source file.
*
* @param blacklistSourcefile name of the blacklist file (LISTS/*.black)
* @param host host or host pattern
* @param path path or path pattern
* @throws PunycodeException
*/
public final void add (final String blacklistSourcefile, final String host, final String path) {
public final void add (final String blacklistSourcefile, final String host, final String path) throws PunycodeException {
// TODO: check sourcefile synced with cache.ser files ?
if (host == null) {
throw new IllegalArgumentException("host may not be null");
@ -374,7 +389,10 @@ public class Blacklist {
String p = (!path.isEmpty() && path.charAt(0) == '/') ? path.substring(1) : path;
// avoid PatternSyntaxException e
final String h = ((!isMatchable(host) && !host.isEmpty() && host.charAt(0) == '*') ? "." + host : host).toLowerCase();
String h = ((!isMatchable(host) && !host.isEmpty() && host.charAt(0) == '*') ? "." + host : host).toLowerCase();
h = Punycode.isBasic(h) ? h : MultiProtocolURL.toPunycode(h);
if (!p.isEmpty() && p.charAt(0) == '*') {
p = "." + p;
}

@ -27,7 +27,6 @@ package net.yacy.search.snippet;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
@ -58,7 +57,6 @@ import net.yacy.repository.Blacklist.BlacklistType;
import net.yacy.search.Switchboard;
@SuppressWarnings("unused")
public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaSnippet> {
public ContentDomain type;
public DigestURL href, source;
@ -249,24 +247,24 @@ public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaS
}
/**
* Checks wether given URL is in blacklist for given blacklist type
*
* @param url The URL to check
* @param blacklistType Type of blacklist (see class Blacklist, BLACKLIST_FOO)
* @return isBlacklisted Wether the given URL is blacklisted
* Checks whether given URL is in blacklist for given blacklist type
*
* @param url
* The URL to check
* @param blacklistType
* Type of blacklist (see class Blacklist, BLACKLIST_FOO)
* @return isBlacklisted Whether the given URL is blacklisted
*/
private static boolean isUrlBlacklisted (final BlacklistType blacklistType, final DigestURL url) {
// Default is not blacklisted
boolean isBlacklisted = false;
// check if url is in blacklist
if (Switchboard.urlBlacklist.isListed(blacklistType, url.getHost().toLowerCase(), url.getFile())) {
final boolean isBlacklisted = Switchboard.urlBlacklist.isListed(blacklistType, url.getHost().toLowerCase(), url.getFile());
if (isBlacklisted) {
Switchboard.getSwitchboard().crawlQueues.errorURL.push(url, null, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
ConcurrentLog.fine("snippet fetch", "MEDIA-SNIPPET Rejecting URL '" + url.toString() + "'. URL is in blacklist.");
isBlacklisted = true;
}
// Return result
return isBlacklisted;
}

Loading…
Cancel
Save