From 34831d2d9f534e9fc044066fcd5c2860ac28b4c5 Mon Sep 17 00:00:00 2001
From: theli
Date: Mon, 14 Aug 2006 16:11:22 +0000
Subject: [PATCH] *) Check validity of the crawl filter regular expression
 before adding it to the crawler queue

See: http://www.yacy-forum.de/viewtopic.php?p=24671
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2410 6c8d7289-2bf4-0310-a012-ef5d649a1542
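
The fix compiles the user-supplied filter with java.util.regex.Pattern before
the start URL is stacked: Pattern.compile() throws a PatternSyntaxException
for a malformed expression, so a broken filter is rejected up front instead of
failing later inside the crawler. A minimal, self-contained sketch of that
idiom (the class and method names are illustrative, not YaCy code):

    import java.util.regex.Pattern;
    import java.util.regex.PatternSyntaxException;

    // Illustrative helper, not part of YaCy: returns null when the filter
    // is a valid regular expression, otherwise the parser's diagnostic.
    public final class CrawlFilterCheck {
        static String validate(String filter) {
            try {
                Pattern.compile(filter); // throws on malformed syntax
                return null;
            } catch (PatternSyntaxException e) {
                return e.getMessage();
            }
        }

        public static void main(String[] args) {
            System.out.println(validate(".*"));        // null: valid
            System.out.println(validate("(unclosed")); // "Unclosed group ..."
        }
    }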
---
htroot/IndexCreate_p.html | 10 ++++++----
htroot/IndexCreate_p.java | 23 +++++++++++++++++++++--
2 files changed, 27 insertions(+), 6 deletions(-)
diff --git a/htroot/IndexCreate_p.html b/htroot/IndexCreate_p.html
index e855a132c..11f87c5e1 100644
--- a/htroot/IndexCreate_p.html
+++ b/htroot/IndexCreate_p.html
@@ -232,13 +232,15 @@ Error: #[errmsg]#
::
Application not yet initialized. Sorry. Please wait some seconds and repeat the request.
::
-ERROR: Crawl filter "#[newcrawlingfilter]#" does not match with crawl root "#[crawlingStart]#". Please try again with different filter.
+ERROR: Crawl filter "#[newcrawlingfilter]#" does not match the crawl root "#[crawlingStart]#". Please try again with a different filter.
::
-Crawling of "#[crawlingURL]#" failed. Reason: #[reasonString]#
+Crawling of "#[crawlingURL]#" failed. Reason: #[reasonString]#
::
-Error with URL input "#[crawlingStart]#": #[error]#
+Error with URL input "#[crawlingStart]#": #[error]#
::
-Error with file input "#[crawlingStart]#": #[error]#
+Error with file input "#[crawlingStart]#": #[error]#
+::
+Error with crawl filter "#[newcrawlingfilter]#": #[error]#
#(/error)#

#(info)#
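
Note on the template side: in YaCy's servlet templates a #(key)# ... #(/key)#
block holds ::-separated alternatives, and the servlet selects one by index
via prop.put(key, n); the message added above is the alternative that
prop.put("error", 8) selects in the Java changes below. A toy simplification
of that selection, assuming the block body is available as a plain string
(the real template engine also handles nesting and #[...]# substitution):

    // Hypothetical simplification of the alternative selection, not YaCy's parser.
    public class TemplateAltDemo {
        public static void main(String[] args) {
            String errorBlock = "ok::bad URL::bad filter"; // stand-in for the #(error)# body
            int errorIndex = 2;                            // what prop.put("error", n) sets
            System.out.println(errorBlock.split("::")[errorIndex]); // -> bad filter
        }
    }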
diff --git a/htroot/IndexCreate_p.java b/htroot/IndexCreate_p.java
index d26024631..486b43b74 100644
--- a/htroot/IndexCreate_p.java
+++ b/htroot/IndexCreate_p.java
@@ -53,6 +53,8 @@ import java.util.Enumeration;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
+import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;

import de.anomic.data.wikiCode;
import de.anomic.htmlFilter.htmlFilterContentScraper;
@@ -158,6 +160,10 @@ public class IndexCreate_p {
prop.put("error_newcrawlingfilter", newcrawlingfilter);
prop.put("error_crawlingStart", crawlingStart);
} else try {
+
+ // validate the crawl filter: Pattern.compile throws PatternSyntaxException on a malformed regular expression
+ Pattern.compile(newcrawlingfilter);
+
// stack request
// first delete old entry, if exists
String urlhash = indexURL.urlHash(crawlingStart);
@@ -201,6 +207,10 @@ public class IndexCreate_p {
ee.store();
switchboard.urlPool.errorURL.stackPushEntry(ee);
}
+ } catch (PatternSyntaxException e) {
+ prop.put("error", 8); //crawlfilter does not match url
+ prop.put("error_newcrawlingfilter", newcrawlingfilter);
+ prop.put("error_error", e.getMessage());
} catch (Exception e) {
// mist
prop.put("error", 6);//Error with url
@@ -213,7 +223,11 @@ public class IndexCreate_p {
if (post.containsKey("crawlingFile")) {
// getting the name of the uploaded file
String fileName = (String) post.get("crawlingFile");
- try {
+ try {
+ // validate the crawl filter: Pattern.compile throws PatternSyntaxException on a malformed regular expression
+ Pattern.compile(newcrawlingfilter);
+
+ // loading the file content
File file = new File(fileName);
// getting the content of the bookmark file
@@ -268,7 +282,12 @@ public class IndexCreate_p {
switchboard.urlPool.errorURL.stackPushEntry(ee);
}
}
-
+
+ } catch (PatternSyntaxException e) {
+ // report the invalid crawl filter back to the template
+ prop.put("error", 8); // crawl filter is not a valid regular expression
+ prop.put("error_newcrawlingfilter", newcrawlingfilter);
+ prop.put("error_error", e.getMessage());
} catch (Exception e) {
// mist
prop.put("error", 7);//Error with file