Team added support for URLs with Unicode characters in the host part to the
blacklist. Punycode is used to handle the Unicode characters.
pull/1/head
Marc Nause 11 years ago
parent fbf1656a67
commit 809b4e1fd9

@ -41,6 +41,7 @@ import java.util.Set;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import net.yacy.cora.document.id.Punycode.PunycodeException;
import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.ConcurrentLog;
import net.yacy.data.ListManager; import net.yacy.data.ListManager;
@ -55,84 +56,122 @@ import net.yacy.server.serverSwitch;
public class BlacklistCleaner_p { public class BlacklistCleaner_p {
/** Used for logging. */
private static final String APP_NAME = "BLACKLIST-CLEANER";
private static final String RESULTS = "results_"; private static final String RESULTS = "results_";
private static final String DISABLED = "disabled_"; private static final String DISABLED = "disabled_";
private static final String BLACKLISTS = "blacklists_"; private static final String BLACKLISTS = "blacklists_";
private static final String ENTRIES = "entries_"; private static final String ENTRIES = "entries_";
public static final Class<?>[] supportedBLEngines = { public static final Class<?>[] supportedBLEngines = { Blacklist.class };
Blacklist.class
};
public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, final serverObjects post, @SuppressWarnings("unused") final serverSwitch env) { public static serverObjects respond(
@SuppressWarnings("unused") final RequestHeader header,
final serverObjects post,
@SuppressWarnings("unused") final serverSwitch env) {
final serverObjects prop = new serverObjects(); final serverObjects prop = new serverObjects();
String blacklistToUse = null; String blacklistToUse = null;
prop.put(DISABLED+"checked", "1"); prop.put(DISABLED + "checked", "1");
if (post != null) { if (post != null) {
final boolean allowRegex = post.get("allowRegex", "off").equalsIgnoreCase("on") ? true: false; final boolean allowRegex = post.get("allowRegex", "off")
prop.put(DISABLED+"checked", (allowRegex) ? "1" : "0"); .equalsIgnoreCase("on") ? true : false;
prop.put(DISABLED + "checked", (allowRegex) ? "1" : "0");
if (post.containsKey("listNames")) { if (post.containsKey("listNames")) {
blacklistToUse = post.get("listNames"); blacklistToUse = post.get("listNames");
if (blacklistToUse.isEmpty() || !ListManager.listSetContains("listManager.listsPath", blacklistToUse)) { if (blacklistToUse.isEmpty()
|| !ListManager.listSetContains(
"listManager.listsPath",
blacklistToUse)) {
prop.put("results", "2"); prop.put("results", "2");
} }
} }
putBlacklists(prop, FileUtils.getDirListing(ListManager.listsPath, Blacklist.BLACKLIST_FILENAME_FILTER), blacklistToUse); putBlacklists(prop, FileUtils.getDirListing(ListManager.listsPath,
Blacklist.BLACKLIST_FILENAME_FILTER),
blacklistToUse);
if (blacklistToUse != null) { if (blacklistToUse != null) {
prop.put("results", "1"); prop.put("results", "1");
if (post.containsKey("delete")) { if (post.containsKey("delete")) {
prop.put(RESULTS + "modified", "1"); prop.put(RESULTS + "modified", "1");
prop.put(RESULTS + "modified_delCount", removeEntries(blacklistToUse, BlacklistType.values(), getKeysByPrefix(post, "select", true))); prop.put(RESULTS + "modified_delCount",
removeEntries(blacklistToUse,
BlacklistType.values(),
getKeysByPrefix(post,
"select",
true)));
} else if (post.containsKey("alter")) { } else if (post.containsKey("alter")) {
prop.put(RESULTS + "modified", "2"); prop.put(RESULTS + "modified", "2");
prop.put(RESULTS + "modified_alterCount", alterEntries(blacklistToUse, BlacklistType.values(), getKeysByPrefix(post, "select", false), getValuesByPrefix(post, "entry", false))); prop.put(RESULTS + "modified_alterCount",
alterEntries(blacklistToUse,
BlacklistType.values(),
getKeysByPrefix(post,
"select",
false),
getValuesByPrefix(post,
"entry",
false)));
} }
// list illegal entries // list illegal entries
final Map<String, BlacklistError> illegalEntries = getIllegalEntries(blacklistToUse, Switchboard.urlBlacklist, allowRegex); final Map<String, BlacklistError> illegalEntries = getIllegalEntries(
blacklistToUse, Switchboard.urlBlacklist,
allowRegex);
prop.put(RESULTS + "blList", blacklistToUse); prop.put(RESULTS + "blList", blacklistToUse);
prop.put(RESULTS + "entries", illegalEntries.size()); prop.put(RESULTS + "entries", illegalEntries.size());
prop.putHTML(RESULTS + "blEngine", Blacklist.getEngineInfo()); prop.putHTML(RESULTS + "blEngine", Blacklist.getEngineInfo());
prop.put(RESULTS + "disabled", (illegalEntries.isEmpty()) ? "1" : "0"); prop.put(RESULTS + "disabled", (illegalEntries.isEmpty()) ? "1"
: "0");
if (!illegalEntries.isEmpty()) { if (!illegalEntries.isEmpty()) {
prop.put(RESULTS + DISABLED + "entries", illegalEntries.size()); prop.put(RESULTS + DISABLED + "entries",
illegalEntries.size());
int i = 0; int i = 0;
String key; String key;
for (final Entry<String, BlacklistError> entry : illegalEntries.entrySet()) { for (final Entry<String, BlacklistError> entry : illegalEntries
.entrySet()) {
key = entry.getKey(); key = entry.getKey();
prop.put(RESULTS + DISABLED + ENTRIES + i + "_error", entry.getValue().getLong()); prop.put(RESULTS + DISABLED + ENTRIES + i + "_error",
prop.putHTML(RESULTS + DISABLED + ENTRIES + i + "_entry", key); entry.getValue().getLong());
prop.putHTML(RESULTS + DISABLED + ENTRIES + i
+ "_entry", key);
i++; i++;
} }
} }
} }
} else { } else {
prop.put("results", "0"); prop.put("results", "0");
putBlacklists(prop, FileUtils.getDirListing(ListManager.listsPath, Blacklist.BLACKLIST_FILENAME_FILTER), blacklistToUse); putBlacklists(prop, FileUtils.getDirListing(ListManager.listsPath,
Blacklist.BLACKLIST_FILENAME_FILTER),
blacklistToUse);
} }
return prop; return prop;
} }
/** /**
* Adds a list of blacklist to the server objects properties which are used to * Adds a list of blacklist to the server objects properties which are used
* display the blacklist in the HTML page belonging to this servlet. * to display the blacklist in the HTML page belonging to this servlet.
* @param prop Server objects properties object. *
* @param lists List of blacklists. * @param prop
* @param selected Element in list of blacklists which will be preselected in HTML. * Server objects properties object.
* @param lists
* List of blacklists.
* @param selected
* Element in list of blacklists which will be preselected in
* HTML.
*/ */
private static void putBlacklists(final serverObjects prop, final List<String> lists, final String selected) { private static void putBlacklists(final serverObjects prop,
final List<String> lists, final String selected) {
boolean supported = false; boolean supported = false;
for (int i=0; i < supportedBLEngines.length && !supported; i++) { for (int i = 0; i < supportedBLEngines.length && !supported; i++) {
supported |= (Switchboard.urlBlacklist.getClass() == supportedBLEngines[i]); supported |= (Switchboard.urlBlacklist.getClass() == supportedBLEngines[i]);
} }
@ -143,7 +182,8 @@ public class BlacklistCleaner_p {
int count = 0; int count = 0;
for (final String list : lists) { for (final String list : lists) {
prop.putHTML(DISABLED + BLACKLISTS + count + "_name", list); prop.putHTML(DISABLED + BLACKLISTS + count + "_name", list);
prop.put(DISABLED + BLACKLISTS + count + "_selected", (list.equals(selected)) ? "1" : "0"); prop.put(DISABLED + BLACKLISTS + count + "_selected",
(list.equals(selected)) ? "1" : "0");
count++; count++;
} }
} else { } else {
@ -152,48 +192,68 @@ public class BlacklistCleaner_p {
} else { } else {
prop.put("disabled", "1"); prop.put("disabled", "1");
for (int i = 0; i < supportedBLEngines.length; i++) { for (int i = 0; i < supportedBLEngines.length; i++) {
prop.putHTML(DISABLED + "engines_" + i + "_name", supportedBLEngines[i].getName()); prop.putHTML(DISABLED + "engines_" + i + "_name",
supportedBLEngines[i].getName());
} }
prop.put(DISABLED + "engines", supportedBLEngines.length); prop.put(DISABLED + "engines", supportedBLEngines.length);
} }
} }
/** /**
* Retrieves all keys with a certain prefix from the data which has been sent and returns them as an array. This * Retrieves all keys with a certain prefix from the data which has been
* method is only a wrapper for {@link getByPrefix(de.anomic.server.serverObjects, java.lang.String, boolean, boolean)} * sent and returns them as an array. This method is only a wrapper for
* which has been created to make it easier to understand the code. * {@link getByPrefix(de.anomic.server.serverObjects, java.lang.String,
* @param post All POST values. * boolean, boolean)} which has been created to make it easier to understand
* @param prefix Prefix by which the input is filtered. * the code.
* @param filterDoubles Set true if only unique results shall be returned, else false. *
* @param post
* All POST values.
* @param prefix
* Prefix by which the input is filtered.
* @param filterDoubles
* Set true if only unique results shall be returned, else false.
* @return Keys which have been posted. * @return Keys which have been posted.
*/ */
private static String[] getKeysByPrefix(final serverObjects post, final String prefix, final boolean filterDoubles) { private static String[] getKeysByPrefix(final serverObjects post,
final String prefix, final boolean filterDoubles) {
return getByPrefix(post, prefix, true, filterDoubles); return getByPrefix(post, prefix, true, filterDoubles);
} }
/** /**
* Retrieves all values with a certain prefix from the data which has been sent and returns them as an array. This * Retrieves all values with a certain prefix from the data which has been
* method is only a wrapper for {@link getByPrefix(de.anomic.server.serverObjects, java.lang.String, boolean, boolean)}. * sent and returns them as an array. This method is only a wrapper for
* @param post All POST values. * {@link getByPrefix(de.anomic.server.serverObjects, java.lang.String,
* @param prefix Prefix by which the input is filtered. * boolean, boolean)}.
* @param filterDoubles Set true if only unique results shall be returned, else false. *
* @param post
* All POST values.
* @param prefix
* Prefix by which the input is filtered.
* @param filterDoubles
* Set true if only unique results shall be returned, else false.
* @return Values which have been posted. * @return Values which have been posted.
*/ */
private static String[] getValuesByPrefix(final serverObjects post, final String prefix, final boolean filterDoubles) { private static String[] getValuesByPrefix(final serverObjects post,
final String prefix, final boolean filterDoubles) {
return getByPrefix(post, prefix, false, filterDoubles); return getByPrefix(post, prefix, false, filterDoubles);
} }
/** /**
* Method which does all the work for {@link getKeysByPrefix(de.anomic.server.serverObjects, java.lang.String prefix, boolean)} * Method which does all the work for {@link
* and {@link getValuesByPrefix(de.anomic.server.serverObjects, java.lang.String prefix, boolean)} which * getKeysByPrefix(de.anomic.server.serverObjects, java.lang.String prefix,
* have been crested to make it easier to understand the code. * boolean)} and {@link getValuesByPrefix(de.anomic.server.serverObjects,
* java.lang.String prefix, boolean)} which have been crested to make it
* easier to understand the code.
*
* @param post * @param post
* @param prefix * @param prefix
* @param useKeys * @param useKeys
* @param useHashSet * @param useHashSet
* @return * @return
*/ */
private static String[] getByPrefix(final serverObjects post, final String prefix, final boolean useKeys, final boolean useHashSet) { private static String[] getByPrefix(final serverObjects post,
final String prefix, final boolean useKeys,
final boolean useHashSet) {
Collection<String> r; Collection<String> r;
if (useHashSet) { if (useHashSet) {
r = new HashSet<String>(); r = new HashSet<String>();
@ -220,19 +280,27 @@ public class BlacklistCleaner_p {
/** /**
* Finds illegal entries in black list. * Finds illegal entries in black list.
* @param blacklistToUse The blacklist to be checked. *
* @param blEngine The blacklist engine which is used to check * @param blacklistToUse
* @param allowRegex Set to true to allow regular expressions in host part of blacklist entry. * The blacklist to be checked.
* @return A map which contains all entries whoch have been identified as being * @param blEngine
* illegal by the blacklistEngine with the entry as key and an error code as * The blacklist engine which is used to check
* value. * @param allowRegex
* Set to true to allow regular expressions in host part of
* blacklist entry.
* @return A map which contains all entries whoch have been identified as
* being illegal by the blacklistEngine with the entry as key and an
* error code as value.
*/ */
private static Map<String, BlacklistError> getIllegalEntries(final String blacklistToUse, final Blacklist blEngine, final boolean allowRegex) { private static Map<String, BlacklistError> getIllegalEntries(
final String blacklistToUse, final Blacklist blEngine,
final boolean allowRegex) {
final Map<String, BlacklistError> illegalEntries = new HashMap<String, BlacklistError>(); final Map<String, BlacklistError> illegalEntries = new HashMap<String, BlacklistError>();
final Set<String> legalEntries = new HashSet<String>(); final Set<String> legalEntries = new HashSet<String>();
final List<String> list = FileUtils.getListArray(new File(ListManager.listsPath, blacklistToUse)); final List<String> list = FileUtils.getListArray(new File(
final Map<String, String> properties= new HashMap<String, String>(); ListManager.listsPath, blacklistToUse));
final Map<String, String> properties = new HashMap<String, String>();
properties.put("allowRegex", String.valueOf(allowRegex)); properties.put("allowRegex", String.valueOf(allowRegex));
BlacklistError err = BlacklistError.NO_ERROR; BlacklistError err = BlacklistError.NO_ERROR;
@ -259,30 +327,42 @@ public class BlacklistCleaner_p {
/** /**
* Removes existing entries from a blacklist. * Removes existing entries from a blacklist.
* @param blacklistToUse The blacklist which contains the *
* @param supportedBlacklistTypes Types of blacklists which the entry is to changed in. * @param blacklistToUse
* @param entries Array of entries to be deleted. * The blacklist which contains the
* @param supportedBlacklistTypes
* Types of blacklists which the entry is to changed in.
* @param entries
* Array of entries to be deleted.
* @return Length of the list of entries to be removed. * @return Length of the list of entries to be removed.
*/ */
private static int removeEntries(final String blacklistToUse, final BlacklistType[] supportedBlacklistTypes, final String[] entries) { private static int removeEntries(final String blacklistToUse,
final BlacklistType[] supportedBlacklistTypes,
final String[] entries) {
for (final String entry : entries) { for (final String entry : entries) {
String s = entry; String s = entry;
// get rid of escape characters which make it impossible to // get rid of escape characters which make it impossible to
// properly use contains() // properly use contains()
if (s.contains("\\\\")) { if (s.contains("\\\\")) {
s = s.replaceAll(Pattern.quote("\\\\"), Matcher.quoteReplacement("\\")); s = s.replaceAll(Pattern.quote("\\\\"),
Matcher.quoteReplacement("\\"));
} }
// remove the entry from the running blacklist engine // remove the entry from the running blacklist engine
for (final BlacklistType supportedBlacklistType : supportedBlacklistTypes) { for (final BlacklistType supportedBlacklistType : supportedBlacklistTypes) {
if (ListManager.listSetContains(supportedBlacklistType + ".BlackLists", blacklistToUse)) { if (ListManager.listSetContains(supportedBlacklistType
final String host = (s.indexOf('/',0) == -1) ? s : s.substring(0, s.indexOf('/',0)); + ".BlackLists", blacklistToUse)) {
final String path = (s.indexOf('/',0) == -1) ? ".*" : s.substring(s.indexOf('/',0) + 1); final String host = (s.indexOf('/', 0) == -1) ? s : s
.substring(0, s.indexOf('/', 0));
final String path = (s.indexOf('/', 0) == -1) ? ".*" : s
.substring(s.indexOf('/', 0) + 1);
try { try {
Switchboard.urlBlacklist.remove(supportedBlacklistType, blacklistToUse, host, path); Switchboard.urlBlacklist.remove(supportedBlacklistType,
blacklistToUse, host, path);
} catch (final RuntimeException e) { } catch (final RuntimeException e) {
ConcurrentLog.severe("BLACKLIST-CLEANER", e.getMessage() + ": " + host + "/" + path); ConcurrentLog.severe(APP_NAME, e.getMessage() + ": "
+ host + "/" + path);
} }
} }
} }
@ -293,35 +373,46 @@ public class BlacklistCleaner_p {
/** /**
* Changes existing entry in a blacklist. * Changes existing entry in a blacklist.
* @param blacklistToUse The blacklist which contains the entry. *
* @param supportedBlacklistTypes Types of blacklists which the entry is to changed in. * @param blacklistToUse
* @param oldEntry Entry to be changed. * The blacklist which contains the entry.
* @param newEntry Changed entry. * @param supportedBlacklistTypes
* Types of blacklists which the entry is to changed in.
* @param oldEntry
* Entry to be changed.
* @param newEntry
* Changed entry.
* @return The length of the new entry. * @return The length of the new entry.
*/ */
private static int alterEntries( private static int alterEntries(final String blacklistToUse,
final String blacklistToUse, final BlacklistType[] supportedBlacklistTypes,
final BlacklistType[] supportedBlacklistTypes, final String[] oldEntry, final String[] newEntry) {
final String[] oldEntry,
final String[] newEntry) {
removeEntries(blacklistToUse, supportedBlacklistTypes, oldEntry); removeEntries(blacklistToUse, supportedBlacklistTypes, oldEntry);
String host, path; String host, path;
for (final String n : newEntry) { for (final String n : newEntry) {
final int pos = n.indexOf('/',0); final int pos = n.indexOf('/', 0);
if (pos < 0) { if (pos < 0) {
host = n; host = n;
path = ".*"; path = ".*";
} else { } else {
host = n.substring(0, pos); host = n.substring(0, pos);
path = n.substring(pos + 1); path = n.substring(pos + 1);
} }
for (final BlacklistType s : supportedBlacklistTypes) { for (final BlacklistType s : supportedBlacklistTypes) {
if (ListManager.listSetContains(s + ".BlackLists",blacklistToUse)) { if (ListManager.listSetContains(s + ".BlackLists",
Switchboard.urlBlacklist.add(s, blacklistToUse, host, path); blacklistToUse)) {
} try {
} Switchboard.urlBlacklist.add(s, blacklistToUse, host,
SearchEventCache.cleanupEvents(true); path);
} } catch (PunycodeException e) {
ConcurrentLog.warn(APP_NAME,
"Unable to add blacklist entry to blacklist "
+ s, e);
}
}
}
SearchEventCache.cleanupEvents(true);
}
return newEntry.length; return newEntry.length;
} }
} }

@ -34,6 +34,9 @@ import java.io.IOException;
import java.util.Arrays; import java.util.Arrays;
import java.util.List; import java.util.List;
import org.eclipse.jetty.util.log.Log;
import net.yacy.cora.document.id.Punycode.PunycodeException;
import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.ConcurrentLog;
@ -48,13 +51,16 @@ import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch; import net.yacy.server.serverSwitch;
public class Blacklist_p { public class Blacklist_p {
/** Used for logging. */
private static final String APP_NAME = "Blacklist";
private final static String EDIT = "edit_"; private final static String EDIT = "edit_";
private final static String DISABLED = "disabled_"; private final static String DISABLED = "disabled_";
private final static String BLACKLIST = "blackLists_"; private final static String BLACKLIST = "blackLists_";
private final static String BLACKLIST_MOVE = "blackListsMove_"; private final static String BLACKLIST_MOVE = "blackListsMove_";
private final static String BLACKLIST_SHARED = "BlackLists.Shared"; private final static String BLACKLIST_SHARED = "BlackLists.Shared";
public static serverObjects respond(final RequestHeader header, final serverObjects post, @SuppressWarnings("unused") final serverSwitch env) { public static serverObjects respond(final RequestHeader header, final serverObjects post, @SuppressWarnings("unused") final serverSwitch env) {
// load all blacklist files located in the directory // load all blacklist files located in the directory
@ -134,7 +140,7 @@ public class Blacklist_p {
final File blackListFile = new File(ListManager.listsPath, blacklistToUse); final File blackListFile = new File(ListManager.listsPath, blacklistToUse);
if(!blackListFile.delete()) { if(!blackListFile.delete()) {
ConcurrentLog.warn("Blacklist", "file "+ blackListFile +" could not be deleted!"); ConcurrentLog.warn(APP_NAME, "file "+ blackListFile +" could not be deleted!");
} }
for (final BlacklistType supportedBlacklistType : BlacklistType.values()) { for (final BlacklistType supportedBlacklistType : BlacklistType.values()) {
@ -551,7 +557,7 @@ public class Blacklist_p {
// ignore empty entries // ignore empty entries
if(newEntry == null || newEntry.isEmpty()) { if(newEntry == null || newEntry.isEmpty()) {
ConcurrentLog.warn("Blacklist", "skipped adding an empty entry"); ConcurrentLog.warn(APP_NAME, "skipped adding an empty entry");
return ""; return "";
} }
@ -582,7 +588,11 @@ public class Blacklist_p {
for (final BlacklistType supportedBlacklistType : BlacklistType.values()) { for (final BlacklistType supportedBlacklistType : BlacklistType.values()) {
if (ListManager.listSetContains(supportedBlacklistType + ".BlackLists",blacklistToUse)) { if (ListManager.listSetContains(supportedBlacklistType + ".BlackLists",blacklistToUse)) {
Switchboard.urlBlacklist.add(supportedBlacklistType, blacklistToUse, host, path); try {
Switchboard.urlBlacklist.add(supportedBlacklistType, blacklistToUse, host, path);
} catch (PunycodeException e) {
ConcurrentLog.warn(APP_NAME, "Unable to add blacklist entry to blacklist " + supportedBlacklistType, e);
}
} }
} }

@ -36,6 +36,7 @@ import java.util.Set;
import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.Punycode.PunycodeException;
import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.data.ResultURLs; import net.yacy.crawler.data.ResultURLs;
@ -54,6 +55,9 @@ import net.yacy.utils.nxTools;
public class CrawlResults { public class CrawlResults {
/** Used for logging. */
private static final String APP_NAME = "PLASMA";
public static serverObjects respond(final RequestHeader header, serverObjects post, final serverSwitch env) { public static serverObjects respond(final RequestHeader header, serverObjects post, final serverSwitch env) {
// return variable that accumulates replacements // return variable that accumulates replacements
final Switchboard sb = (Switchboard) env; final Switchboard sb = (Switchboard) env;
@ -139,7 +143,12 @@ public class CrawlResults {
// handle addtoblacklist // handle addtoblacklist
if (post.containsKey("delandaddtoblacklist")) { if (post.containsKey("delandaddtoblacklist")) {
Switchboard.urlBlacklist.add(selectedblacklist, domain, ".*"); try {
Switchboard.urlBlacklist.add(selectedblacklist, domain, ".*");
} catch (PunycodeException e) {
ConcurrentLog.warn(APP_NAME, "Unable to add blacklist entry to blacklist " + selectedblacklist, e);
}
} }
} }
} }
@ -204,7 +213,7 @@ public class CrawlResults {
urle = sb.index.fulltext().getMetadata(urlhash); urle = sb.index.fulltext().getMetadata(urlhash);
} }
if (urle == null) { if (urle == null) {
ConcurrentLog.warn("PLASMA", "CrawlResults: URL not in index with url hash " + entry.getKey()); ConcurrentLog.warn(APP_NAME, "CrawlResults: URL not in index with url hash " + entry.getKey());
urlstr = null; urlstr = null;
urltxt = null; urltxt = null;
continue; continue;
@ -291,7 +300,7 @@ public class CrawlResults {
dark = !dark; dark = !dark;
cnt++; cnt++;
} catch (final Exception e) { } catch (final Exception e) {
ConcurrentLog.severe("PLASMA", "genTableProps", e); ConcurrentLog.severe(APP_NAME, "genTableProps", e);
} }
} }
prop.put("table_indexed", cnt); prop.put("table_indexed", cnt);
@ -331,9 +340,6 @@ public class CrawlResults {
prop.put("table_blacklists", blacklistCount); prop.put("table_blacklists", blacklistCount);
} }
} }
prop.put("process", tabletype.getCode()); prop.put("process", tabletype.getCode());
// return rewrite properties // return rewrite properties

@ -35,6 +35,7 @@ import net.yacy.cora.document.analysis.Classification.ContentDomain;
import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.document.id.Punycode.PunycodeException;
import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.RequestHeader;
@ -70,6 +71,8 @@ import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch; import net.yacy.server.serverSwitch;
public class IndexControlRWIs_p { public class IndexControlRWIs_p {
private static final String APP_NAME = "IndexControlRWIs_p";
private final static String errmsg = "not possible to compute word from hash"; private final static String errmsg = "not possible to compute word from hash";
@ -381,11 +384,17 @@ public class IndexControlRWIs_p {
if ( ListManager.listSetContains( if ( ListManager.listSetContains(
supportedBlacklistType + ".BlackLists", supportedBlacklistType + ".BlackLists",
blacklist) ) { blacklist) ) {
Switchboard.urlBlacklist.add( try {
BlacklistType.valueOf(supportedBlacklistType), Switchboard.urlBlacklist.add(
blacklist, BlacklistType.valueOf(supportedBlacklistType),
url.getHost(), blacklist,
url.getFile()); url.getHost(),
url.getFile());
} catch (PunycodeException e) {
ConcurrentLog.warn(APP_NAME,
"Unable to add blacklist entry to blacklist "
+ supportedBlacklistType, e);
}
} }
} }
SearchEventCache.cleanupEvents(true); SearchEventCache.cleanupEvents(true);
@ -408,11 +417,17 @@ public class IndexControlRWIs_p {
if ( ListManager.listSetContains( if ( ListManager.listSetContains(
supportedBlacklistType + ".BlackLists", supportedBlacklistType + ".BlackLists",
blacklist) ) { blacklist) ) {
Switchboard.urlBlacklist.add( try {
supportedBlacklistType, Switchboard.urlBlacklist.add(
blacklist, supportedBlacklistType,
url.getHost(), blacklist,
".*"); url.getHost(),
".*");
} catch (PunycodeException e) {
ConcurrentLog.warn(APP_NAME,
"Unable to add blacklist entry to blacklist "
+ supportedBlacklistType, e);
}
} }
} }
} }

@ -240,24 +240,28 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
// handle international domains // handle international domains
if (!Punycode.isBasic(this.host)) try { if (!Punycode.isBasic(this.host)) try {
final String[] domainParts = CommonPattern.DOT.split(this.host, 0); this.host = toPunycode(this.host);
final StringBuilder buffer = new StringBuilder(80);
// encode each domain-part separately
for(int i = 0; i < domainParts.length; i++) {
final String part = domainParts[i];
if (!Punycode.isBasic(part)) {
buffer.append("xn--").append(Punycode.encode(part));
} else {
buffer.append(part);
}
if (i != domainParts.length-1) {
buffer.append('.');
}
}
this.host = buffer.toString();
} catch (final PunycodeException e) {} } catch (final PunycodeException e) {}
} }
/**
 * Builds the Punycode form of a host name, one dot-separated part at a time.
 *
 * <p>The host is split with {@code CommonPattern.DOT}. Every part for which
 * {@code Punycode.isBasic} returns {@code false} is replaced by {@code "xn--"}
 * followed by {@code Punycode.encode(part)}; all other parts are copied
 * unchanged. The parts are then joined again with {@code '.'}.
 *
 * @param host host name to convert; NOTE(review): {@code null} is not checked
 *             here and presumably fails inside the split call - confirm
 * @return the host name with its non-basic parts encoded
 * @throws PunycodeException presumably raised by {@code Punycode.encode} when
 *             a part cannot be encoded - verify against the Punycode class
 */
public static String toPunycode(final String host) throws PunycodeException {
    final String[] labels = CommonPattern.DOT.split(host, 0);
    final StringBuilder encoded = new StringBuilder(80);
    boolean first = true;
    for (final String label : labels) {
        // the separator goes between parts only: never before the first one
        // and never after the last one
        if (!first) {
            encoded.append('.');
        }
        first = false;
        if (Punycode.isBasic(label)) {
            encoded.append(label);
        } else {
            encoded.append("xn--").append(Punycode.encode(label));
        }
    }
    return encoded.toString();
}
public static final boolean isHTTP(final String s) { return s.startsWith("http://"); } public static final boolean isHTTP(final String s) { return s.startsWith("http://"); }
public static final boolean isHTTPS(final String s) { return s.startsWith("https://"); } public static final boolean isHTTPS(final String s) { return s.startsWith("https://"); }
public static final boolean isFTP(final String s) { return s.startsWith("ftp://"); } public static final boolean isFTP(final String s) { return s.startsWith("ftp://"); }

@ -46,6 +46,9 @@ import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException; import java.util.regex.PatternSyntaxException;
import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.document.id.Punycode;
import net.yacy.cora.document.id.Punycode.PunycodeException;
import net.yacy.cora.storage.HandleSet; import net.yacy.cora.storage.HandleSet;
import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.SpaceExceededException; import net.yacy.cora.util.SpaceExceededException;
@ -301,11 +304,22 @@ public class Blacklist {
} }
} }
public final void add(final BlacklistType blacklistType, final String blacklistToUse, final String host, final String path) { /**
if (contains(blacklistType, host, path)) { *
* @param blacklistType
* @param blacklistToUse
* @param host
* @param path
* @throws PunycodeException
*/
public final void add(final BlacklistType blacklistType, final String blacklistToUse, final String host, final String path) throws PunycodeException {
final String safeHost = Punycode.isBasic(host) ? host : MultiProtocolURL.toPunycode(host);
if (contains(blacklistType, safeHost, path)) {
return; return;
} }
if (host == null) { if (safeHost == null) {
throw new IllegalArgumentException("host may not be null"); throw new IllegalArgumentException("host may not be null");
} }
if (path == null) { if (path == null) {
@ -316,7 +330,7 @@ public class Blacklist {
final Map<String, Set<Pattern>> blacklistMap = getBlacklistMap(blacklistType, isMatchable(host)); final Map<String, Set<Pattern>> blacklistMap = getBlacklistMap(blacklistType, isMatchable(host));
// avoid PatternSyntaxException e // avoid PatternSyntaxException e
final String h = ((!isMatchable(host) && !host.isEmpty() && host.charAt(0) == '*') ? "." + host : host).toLowerCase(); final String h = ((!isMatchable(safeHost) && !safeHost.isEmpty() && safeHost.charAt(0) == '*') ? "." + safeHost : safeHost).toLowerCase();
if (!p.isEmpty() && p.charAt(0) == '*') { if (!p.isEmpty() && p.charAt(0) == '*') {
p = "." + p; p = "." + p;
} }
@ -356,13 +370,14 @@ public class Blacklist {
} }
/** /**
* appends a entry to the backlist source file * appends aN entry to the backlist source file.
* *
* @param blacklistSourcefile name of the blacklist file (LISTS/*.black) * @param blacklistSourcefile name of the blacklist file (LISTS/*.black)
* @param host host or host pattern * @param host host or host pattern
* @param path path or path pattern * @param path path or path pattern
* @throws PunycodeException
*/ */
public final void add (final String blacklistSourcefile, final String host, final String path) { public final void add (final String blacklistSourcefile, final String host, final String path) throws PunycodeException {
// TODO: check sourcefile synced with cache.ser files ? // TODO: check sourcefile synced with cache.ser files ?
if (host == null) { if (host == null) {
throw new IllegalArgumentException("host may not be null"); throw new IllegalArgumentException("host may not be null");
@ -374,7 +389,10 @@ public class Blacklist {
String p = (!path.isEmpty() && path.charAt(0) == '/') ? path.substring(1) : path; String p = (!path.isEmpty() && path.charAt(0) == '/') ? path.substring(1) : path;
// avoid PatternSyntaxException e // avoid PatternSyntaxException e
final String h = ((!isMatchable(host) && !host.isEmpty() && host.charAt(0) == '*') ? "." + host : host).toLowerCase(); String h = ((!isMatchable(host) && !host.isEmpty() && host.charAt(0) == '*') ? "." + host : host).toLowerCase();
h = Punycode.isBasic(h) ? h : MultiProtocolURL.toPunycode(h);
if (!p.isEmpty() && p.charAt(0) == '*') { if (!p.isEmpty() && p.charAt(0) == '*') {
p = "." + p; p = "." + p;
} }

@ -27,7 +27,6 @@ package net.yacy.search.snippet;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Comparator; import java.util.Comparator;
import java.util.Date;
import java.util.Iterator; import java.util.Iterator;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
@ -58,7 +57,6 @@ import net.yacy.repository.Blacklist.BlacklistType;
import net.yacy.search.Switchboard; import net.yacy.search.Switchboard;
@SuppressWarnings("unused")
public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaSnippet> { public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaSnippet> {
public ContentDomain type; public ContentDomain type;
public DigestURL href, source; public DigestURL href, source;
@ -249,24 +247,24 @@ public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaS
} }
/** /**
* Checks wether given URL is in blacklist for given blacklist type * Checks whether given URL is in blacklist for given blacklist type
* *
* @param url The URL to check * @param url
* @param blacklistType Type of blacklist (see class Blacklist, BLACKLIST_FOO) * The URL to check
* @return isBlacklisted Wether the given URL is blacklisted * @param blacklistType
* Type of blacklist (see class Blacklist, BLACKLIST_FOO)
* @return isBlacklisted Whether the given URL is blacklisted
*/ */
private static boolean isUrlBlacklisted (final BlacklistType blacklistType, final DigestURL url) { private static boolean isUrlBlacklisted (final BlacklistType blacklistType, final DigestURL url) {
// Default is not blacklisted
boolean isBlacklisted = false;
// check if url is in blacklist final boolean isBlacklisted = Switchboard.urlBlacklist.isListed(blacklistType, url.getHost().toLowerCase(), url.getFile());
if (Switchboard.urlBlacklist.isListed(blacklistType, url.getHost().toLowerCase(), url.getFile())) {
if (isBlacklisted) {
Switchboard.getSwitchboard().crawlQueues.errorURL.push(url, null, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1); Switchboard.getSwitchboard().crawlQueues.errorURL.push(url, null, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
ConcurrentLog.fine("snippet fetch", "MEDIA-SNIPPET Rejecting URL '" + url.toString() + "'. URL is in blacklist."); ConcurrentLog.fine("snippet fetch", "MEDIA-SNIPPET Rejecting URL '" + url.toString() + "'. URL is in blacklist.");
isBlacklisted = true;
} }
// Return result
return isBlacklisted; return isBlacklisted;
} }

Loading…
Cancel
Save