From 522a268305208d71a2f5b9d3652694fe75b4c2d5 Mon Sep 17 00:00:00 2001 From: luccioman Date: Thu, 4 May 2017 16:36:45 +0200 Subject: [PATCH] Improved new blacklist entries URL scheme detection. --- .../net/yacy/repository/BlacklistHelper.java | 61 ++++++++++++------- .../yacy/repository/BlacklistHelperTest.java | 52 ++++++++++++++++ 2 files changed, 91 insertions(+), 22 deletions(-) create mode 100644 test/java/net/yacy/repository/BlacklistHelperTest.java diff --git a/source/net/yacy/repository/BlacklistHelper.java b/source/net/yacy/repository/BlacklistHelper.java index dd57a1ee8..9dfea13f7 100644 --- a/source/net/yacy/repository/BlacklistHelper.java +++ b/source/net/yacy/repository/BlacklistHelper.java @@ -1,5 +1,8 @@ package net.yacy.repository; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + import net.yacy.cora.document.id.Punycode.PunycodeException; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.util.ConcurrentLog; @@ -12,10 +15,43 @@ public final class BlacklistHelper { /** Used for logging. */ public static final String APP_NAME = "Blacklist"; - + + /** Pattern to identify the optional URL scheme (protocol) part at the start of an entry. + * Examples that will be recognized: "http://", "https://", "ftp://", "^https?://", "anyprotocol://" */ + private static final Pattern URL_SCHEME_PATTERN = Pattern.compile("(^\\^?[a-z\\?]+://).+"); + /** Private constructor to avoid instantiation of static helper class. */ private BlacklistHelper() { } + + /** Normalizes a new blacklist entry: strips a leading '^' and URL scheme when present, then either wraps a wildcard-free word in wildcard patterns or appends the default path pattern "/.*" when the entry has no '/'. + * @param entry a blacklist entry. Must not be null. 
+ * @return the entry, modified where necessary so that it is ready to be used by the Blacklist engine + */ + protected static String prepareEntry(final String entry) { + String newEntry = entry; + /* Remove the unnecessary leading regex line-start anchor '^' and URL scheme (protocol) part, when present */ + Matcher schemeMatcher = URL_SCHEME_PATTERN.matcher(newEntry); + if(schemeMatcher.matches()) { + newEntry = newEntry.substring(schemeMatcher.end(1)); + } + + if (newEntry.indexOf("*") < 0 && newEntry.indexOf("?") < 0 && newEntry.indexOf("+") < 0) { + // user did not use any wild cards and just submitted a word + + newEntry = ".*" + newEntry + ".*/.*"; + newEntry = ".*.*/.*" + newEntry + ".*"; + + } else { + + int pos = newEntry.indexOf('/',0); + if (pos < 0) { + // add default empty path pattern + newEntry = newEntry + "/.*"; + } + } + return newEntry; + } /** * Adds a new entry to the chosen blacklist. @@ -45,27 +81,8 @@ public final class BlacklistHelper { } return location; } - - if (newEntry.startsWith("http://") ){ - newEntry = newEntry.substring(7); - } else if (newEntry.startsWith("https://")) { - newEntry = newEntry.substring(8); - } - - if (newEntry.indexOf("*") < 0 && newEntry.indexOf("?") < 0 && newEntry.indexOf("+") < 0) { - // user did not use any wild cards and just submitted a word - - newEntry = ".*" + newEntry + ".*/.*"; - newEntry = ".*.*/.*" + newEntry + ".*"; - - } else { - - int pos = newEntry.indexOf('/',0); - if (pos < 0) { - // add default empty path pattern - newEntry = newEntry + "/.*"; - } - } + + newEntry = prepareEntry(newEntry); int pos = newEntry.indexOf('/',0); String host = newEntry.substring(0, pos); diff --git a/test/java/net/yacy/repository/BlacklistHelperTest.java b/test/java/net/yacy/repository/BlacklistHelperTest.java new file mode 100644 index 000000000..bd0f17daa --- /dev/null +++ b/test/java/net/yacy/repository/BlacklistHelperTest.java @@ -0,0 +1,52 @@ +// BlacklistHelperTest.java +// Copyright 2017 by luccioman; https://github.com/luccioman +// 
This is a part of YaCy, a peer-to-peer based web search engine +// +// LICENSE +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +package net.yacy.repository; + +import static org.junit.Assert.*; + +import org.junit.Test; + +/** + * Unit tests for the {@link BlacklistHelper} class. + * + */ +public class BlacklistHelperTest { + + /** + * Unit testing of the function {@link BlacklistHelper#prepareEntry(String)} + */ + @Test + public void testPrepareEntry() { + assertEquals("http protocol with path wildcard", "domain.com/path/*", BlacklistHelper.prepareEntry("http://domain.com/path/*")); + assertEquals("http protocol with path regex wildcard", "domain.com/path/.*", BlacklistHelper.prepareEntry("http://domain.com/path/.*")); + assertEquals("https protocol with path wildcard", "domain.com/path/*", BlacklistHelper.prepareEntry("https://domain.com/path/*")); + assertEquals("ftp protocol with path wildcard", "domain.com/path/*", BlacklistHelper.prepareEntry("ftp://domain.com/path/*")); + assertEquals("wildcard in protocol", "domain.com/path/*", BlacklistHelper.prepareEntry("https?://domain.com/path/*")); + assertEquals("regex with line beginning mark", "domain.com/path/.*", BlacklistHelper.prepareEntry("^https://domain.com/path/.*")); + assertEquals("host with regex", "[a-z\\.]*domain.com/path/*", 
BlacklistHelper.prepareEntry("http://[a-z\\.]*domain.com/path/*")); + assertEquals("path with regex", "domain.com/path/([^/1-9)+[^/]*/.*", BlacklistHelper.prepareEntry("domain.com/path/([^/1-9)+[^/]*/.*")); + assertEquals("ip v4 address", "192.168.1.1/*", BlacklistHelper.prepareEntry("192.168.1.1/*")); + assertEquals("domain only", "domain.com.*/.*", BlacklistHelper.prepareEntry("domain.com.*")); + assertEquals("word only", ".*.*/.*.*word.*/.*.*", BlacklistHelper.prepareEntry("word")); + } + +}