Improved URL scheme detection for new blacklist entries.

pull/122/head
luccioman 8 years ago
parent 532981b363
commit 522a268305

@ -1,5 +1,8 @@
package net.yacy.repository;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import net.yacy.cora.document.id.Punycode.PunycodeException;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.ConcurrentLog;
@ -12,10 +15,43 @@ public final class BlacklistHelper {
/** Used for logging. */
public static final String APP_NAME = "Blacklist";
/**
 * Pattern used to detect the optional URL scheme (protocol) part at the beginning of a blacklist entry.
 * Capturing group 1 holds an optional regex line beginning char '^', followed by one or more
 * lowercase letters or '?' chars, followed by the "://" separator.
 * At least one more character must follow that prefix for the whole pattern to match.
 * Examples of prefixes that will be recognized : "http://", "https://", "ftp://", "^https?://", "anyprotocol://"
 * Note : no case-insensitive flag is set, so an uppercase scheme such as "HTTP://" is not recognized.
 */
private static final Pattern URL_SCHEME_PATTERN = Pattern.compile("(^\\^?[a-z\\?]+://).+");
/** Private constructor : this class only exposes static helpers, so it is never meant to be instantiated. */
private BlacklistHelper() {
}
/**
 * Normalizes a raw blacklist entry so that it can be used by the Blacklist engine.
 * <ul>
 * <li>a leading URL scheme part (optionally preceded by the regex line beginning char '^') is removed</li>
 * <li>an entry without any '*', '?' or '+' character is treated as a plain word and wrapped with
 * match-all patterns on both the host side and the path side</li>
 * <li>an entry with wildcards but without any '/' gets a default match-all path appended</li>
 * </ul>
 *
 * @param entry a blacklist entry. Must not be null.
 * @return the entry, possibly modified, ready to be used by the Blacklist engine
 */
protected static String prepareEntry(final String entry) {
    // Drop the scheme prefix captured by group 1 when the entry starts with one
    final Matcher matcher = URL_SCHEME_PATTERN.matcher(entry);
    final String stripped = matcher.matches() ? entry.substring(matcher.end(1)) : entry;

    final boolean hasWildcard = stripped.contains("*") || stripped.contains("?") || stripped.contains("+");
    if (!hasWildcard) {
        // The user did not use any wild cards and just submitted a word :
        // surround it with match-all patterns (inner wrapping first, then outer wrapping)
        return ".*.*/.*" + ".*" + stripped + ".*/.*" + ".*";
    }

    if (stripped.indexOf('/') >= 0) {
        // A path part is already present : nothing more to do
        return stripped;
    }

    // No path part : add the default empty path pattern
    return stripped + "/.*";
}
/**
* Adds a new entry to the chosen blacklist.
@ -45,27 +81,8 @@ public final class BlacklistHelper {
}
return location;
}
if (newEntry.startsWith("http://") ){
newEntry = newEntry.substring(7);
} else if (newEntry.startsWith("https://")) {
newEntry = newEntry.substring(8);
}
if (newEntry.indexOf("*") < 0 && newEntry.indexOf("?") < 0 && newEntry.indexOf("+") < 0) {
// user did not use any wild cards and just submitted a word
newEntry = ".*" + newEntry + ".*/.*";
newEntry = ".*.*/.*" + newEntry + ".*";
} else {
int pos = newEntry.indexOf('/',0);
if (pos < 0) {
// add default empty path pattern
newEntry = newEntry + "/.*";
}
}
newEntry = prepareEntry(newEntry);
int pos = newEntry.indexOf('/',0);
String host = newEntry.substring(0, pos);

@ -0,0 +1,52 @@
// BlacklistHelperTest.java
// Copyright 2017 by luccioman; https://github.com/luccioman
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package net.yacy.repository;
import static org.junit.Assert.*;
import org.junit.Test;
/**
 * Unit tests for the {@link BlacklistHelper} class.
 */
public class BlacklistHelperTest {

    /**
     * Checks the output of {@link BlacklistHelper#prepareEntry(String)} on a set of
     * representative raw blacklist entries : with or without URL scheme, with or
     * without wildcards, and with or without a path part.
     */
    @Test
    public void testPrepareEntry() {
        // Each row holds : the assertion message, the expected prepared entry, the raw entry
        final String[][] samples = {
            { "http protocol with path wildcard", "domain.com/path/*", "http://domain.com/path/*" },
            { "http protocol with path regex wildcard", "domain.com/path/.*", "http://domain.com/path/.*" },
            { "https protocol with path wildcard", "domain.com/path/*", "https://domain.com/path/*" },
            { "ftp protocol with path wildcard", "domain.com/path/*", "ftp://domain.com/path/*" },
            { "wildcard in protocol", "domain.com/path/*", "https?://domain.com/path/*" },
            { "regex with line beginning mark", "domain.com/path/.*", "^https://domain.com/path/.*" },
            { "host with regex", "[a-z\\.]*domain.com/path/*", "http://[a-z\\.]*domain.com/path/*" },
            { "path with regex", "domain.com/path/([^/1-9)+[^/]*/.*", "domain.com/path/([^/1-9)+[^/]*/.*" },
            { "ip v4 address", "192.168.1.1/*", "192.168.1.1/*" },
            { "domain only", "domain.com.*/.*", "domain.com.*" },
            { "word only", ".*.*/.*.*word.*/.*.*", "word" }
        };

        for (final String[] sample : samples) {
            final String message = sample[0];
            final String expected = sample[1];
            final String rawEntry = sample[2];
            assertEquals(message, expected, BlacklistHelper.prepareEntry(rawEntry));
        }
    }
}
Loading…
Cancel
Save