From 34831d2d9f534e9fc044066fcd5c2860ac28b4c5 Mon Sep 17 00:00:00 2001 From: theli Date: Mon, 14 Aug 2006 16:11:22 +0000 Subject: [PATCH] *) Check validity of crawl filter reg.exp. before adding it into the crawler queue See: http://www.yacy-forum.de/viewtopic.php?p=24671 git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2410 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/IndexCreate_p.html | 10 ++++++---- htroot/IndexCreate_p.java | 23 +++++++++++++++++++++-- 2 files changed, 27 insertions(+), 6 deletions(-) diff --git a/htroot/IndexCreate_p.html b/htroot/IndexCreate_p.html index e855a132c..11f87c5e1 100644 --- a/htroot/IndexCreate_p.html +++ b/htroot/IndexCreate_p.html @@ -232,13 +232,15 @@ Error: #[errmsg]# :: Application not yet initialized. Sorry. Please wait some seconds and repeat the request. :: -ERROR: Crawl filter "#[newcrawlingfilter]#" does not match with crawl root "#[crawlingStart]#". Please try again with different filter.


+ERROR: Crawl filter "#[newcrawlingfilter]#" does not match with crawl root "#[crawlingStart]#". Please try again with different filter.


:: -Crawling of "#[crawlingURL]#" failed. Reason: #[reasonString]#
+Crawling of "#[crawlingURL]#" failed. Reason: #[reasonString]#
:: -Error with URL input "#[crawlingStart]#": #[error]# +Error with URL input "#[crawlingStart]#": #[error]# :: -Error with file input "#[crawlingStart]#": #[error]# +Error with file input "#[crawlingStart]#": #[error]# +:: +Error with Crawling Filter "#[newcrawlingfilter]#": #[error]# #(/error)#
#(info)# diff --git a/htroot/IndexCreate_p.java b/htroot/IndexCreate_p.java index d26024631..486b43b74 100644 --- a/htroot/IndexCreate_p.java +++ b/htroot/IndexCreate_p.java @@ -53,6 +53,8 @@ import java.util.Enumeration; import java.util.HashMap; import java.util.Iterator; import java.util.Map; +import java.util.regex.Pattern; +import java.util.regex.PatternSyntaxException; import de.anomic.data.wikiCode; import de.anomic.htmlFilter.htmlFilterContentScraper; @@ -158,6 +160,10 @@ public class IndexCreate_p { prop.put("error_newcrawlingfilter", newcrawlingfilter); prop.put("error_crawlingStart", crawlingStart); } else try { + + // check if the crawl filter works correctly + Pattern.compile(newcrawlingfilter); + // stack request // first delete old entry, if exists String urlhash = indexURL.urlHash(crawlingStart); @@ -201,6 +207,10 @@ public class IndexCreate_p { ee.store(); switchboard.urlPool.errorURL.stackPushEntry(ee); } + } catch (PatternSyntaxException e) { + prop.put("error", 8); // crawl filter is not a valid regular expression + prop.put("error_newcrawlingfilter", newcrawlingfilter); + prop.put("error_error", e.getMessage()); } catch (Exception e) { // mist prop.put("error", 6);//Error with url @@ -213,7 +223,11 @@ public class IndexCreate_p { if (post.containsKey("crawlingFile")) { // getting the name of the uploaded file String fileName = (String) post.get("crawlingFile"); - try { + try { + // check if the crawl filter works correctly + Pattern.compile(newcrawlingfilter); + + // loading the file content File file = new File(fileName); // getting the content of the bookmark file @@ -268,7 +282,12 @@ public class IndexCreate_p { switchboard.urlPool.errorURL.stackPushEntry(ee); } } - + + } catch (PatternSyntaxException e) { + // print error message + prop.put("error", 8); // crawl filter is not a valid regular expression + prop.put("error_newcrawlingfilter", newcrawlingfilter); + prop.put("error_error", e.getMessage()); } catch (Exception e) { // mist prop.put("error", 7);//Error with file