From 0716a247372bac1559a3e4ccf4834c0757da7ac1 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Wed, 31 Oct 2012 15:13:05 +0100 Subject: [PATCH] added more / all new crawl profile fields into crawl profile editor --- htroot/CrawlProfileEditor_p.html | 4 +- htroot/CrawlProfileEditor_p.java | 17 ++++++-- .../net/yacy/crawler/data/CrawlProfile.java | 40 +++++++++---------- 3 files changed, 35 insertions(+), 26 deletions(-) diff --git a/htroot/CrawlProfileEditor_p.html b/htroot/CrawlProfileEditor_p.html index 2ec64eb4a..8a184b0f3 100644 --- a/htroot/CrawlProfileEditor_p.html +++ b/htroot/CrawlProfileEditor_p.html @@ -110,8 +110,8 @@
#(readonly)# :: + type="text" value="#[value]#" size="120" maxlength="10000":: + type="text" value="#[value]#" size="120" maxlength="10000"#(/type)# />:: #(type)##(checked)#false::true#(/checked)#::#[value]#::#[value]##(/type)##(/readonly)#
#{/entries}#
 
diff --git a/htroot/CrawlProfileEditor_p.java b/htroot/CrawlProfileEditor_p.java index f041ecd80..3272863b3 100644 --- a/htroot/CrawlProfileEditor_p.java +++ b/htroot/CrawlProfileEditor_p.java @@ -64,9 +64,17 @@ public class CrawlProfileEditor_p { private static final List labels = new ArrayList(); static { - labels.add(new eentry(CrawlProfile.NAME, "Name", true, eentry.STRING)); - labels.add(new eentry(CrawlProfile.CRAWLER_URL_MUSTMATCH, "Must-Match Filter", false, eentry.STRING)); - labels.add(new eentry(CrawlProfile.CRAWLER_URL_MUSTNOTMATCH, "Must-Not-Match Filter", false, eentry.STRING)); + labels.add(new eentry(CrawlProfile.NAME, "Name", true, eentry.STRING)); + labels.add(new eentry(CrawlProfile.COLLECTIONS, "Collections (comma-separated list)", false, eentry.STRING)); + labels.add(new eentry(CrawlProfile.CRAWLER_URL_MUSTMATCH, "URL Must-Match Filter", false, eentry.STRING)); + labels.add(new eentry(CrawlProfile.CRAWLER_URL_MUSTNOTMATCH, "URL Must-Not-Match Filter", false, eentry.STRING)); + labels.add(new eentry(CrawlProfile.CRAWLER_IP_MUSTMATCH, "IP Must-Match Filter", false, eentry.STRING)); + labels.add(new eentry(CrawlProfile.CRAWLER_IP_MUSTNOTMATCH, "IP Must-Not-Match Filter", false, eentry.STRING)); + labels.add(new eentry(CrawlProfile.CRAWLER_COUNTRY_MUSTMATCH, "Country Must-Match Filter", false, eentry.STRING)); + labels.add(new eentry(CrawlProfile.CRAWLER_URL_NODEPTHLIMITMATCH, "URL No-Depth-Limit Must-Match Filter", false, eentry.STRING)); + labels.add(new eentry(CrawlProfile.INDEXING_URL_MUSTMATCH, "Indexing Must-Match Filter", false, eentry.STRING)); + labels.add(new eentry(CrawlProfile.INDEXING_URL_MUSTNOTMATCH, "Indexing Must-Not-Match Filter", false, eentry.STRING)); + labels.add(new eentry(CrawlProfile.CACHE_STRAGEGY, "Cache Strategy (NOCACHE,IFFRESH,IFEXIST,CACHEONLY)", false, eentry.STRING)); labels.add(new eentry(CrawlProfile.DEPTH, "Crawl Depth", false, eentry.INTEGER)); labels.add(new eentry(CrawlProfile.RECRAWL_IF_OLDER, "Recrawl If Older", false, eentry.INTEGER)); labels.add(new eentry(CrawlProfile.DOM_MAX_PAGES, "Domain Max. Pages", false, eentry.INTEGER)); @@ -75,8 +83,9 @@ public class CrawlProfileEditor_p { labels.add(new eentry(CrawlProfile.INDEX_MEDIA, "Index Media", false, eentry.BOOLEAN)); labels.add(new eentry(CrawlProfile.STORE_HTCACHE, "Store in HTCache", false, eentry.BOOLEAN)); labels.add(new eentry(CrawlProfile.REMOTE_INDEXING, "Remote Indexing", false, eentry.BOOLEAN)); + labels.add(new eentry(CrawlProfile.DIRECT_DOC_BY_URL, "Put all linked urls into index without parsing", false, eentry.BOOLEAN)); } - + public static serverObjects respond( @SuppressWarnings("unused") final RequestHeader header, final serverObjects post, diff --git a/source/net/yacy/crawler/data/CrawlProfile.java b/source/net/yacy/crawler/data/CrawlProfile.java index 17a9486d7..54699513a 100644 --- a/source/net/yacy/crawler/data/CrawlProfile.java +++ b/source/net/yacy/crawler/data/CrawlProfile.java @@ -55,26 +55,26 @@ public class CrawlProfile extends ConcurrentHashMap implements M // this is a simple record structure that hold all properties of a single crawl start private static final String HANDLE = "handle"; - public static final String NAME = "name"; - public static final String DEPTH = "generalDepth"; - private static final String DIRECT_DOC_BY_URL= "directDocByURL"; - public static final String RECRAWL_IF_OLDER = "recrawlIfOlder"; - public static final String DOM_MAX_PAGES = "domMaxPages"; - public static final String CRAWLING_Q = "crawlingQ"; - public static final String INDEX_TEXT = "indexText"; - public static final String INDEX_MEDIA = "indexMedia"; - public static final String STORE_HTCACHE = "storeHTCache"; - public static final String REMOTE_INDEXING = "remoteIndexing"; - private static final String CACHE_STRAGEGY = "cacheStrategy"; - public static final String CRAWLER_URL_MUSTMATCH = "crawlerURLMustMatch"; - public static final String CRAWLER_URL_MUSTNOTMATCH = "crawlerURLMustNotMatch"; - private static final String CRAWLER_IP_MUSTMATCH = "crawlerIPMustMatch"; - private static final String CRAWLER_IP_MUSTNOTMATCH = "crawlerIPMustNotMatch"; - private static final String CRAWLER_COUNTRY_MUSTMATCH = "crawlerCountryMustMatch"; - private static final String CRAWLER_URL_NODEPTHLIMITMATCH = "crawlerNoLimitURLMustMatch"; - private static final String INDEXING_URL_MUSTMATCH = "indexURLMustMatch"; - private static final String INDEXING_URL_MUSTNOTMATCH = "indexURLMustNotMatch"; - private static final String COLLECTIONS = "collections"; + public static final String NAME = "name"; + public static final String DEPTH = "generalDepth"; + public static final String DIRECT_DOC_BY_URL= "directDocByURL"; + public static final String RECRAWL_IF_OLDER = "recrawlIfOlder"; + public static final String DOM_MAX_PAGES = "domMaxPages"; + public static final String CRAWLING_Q = "crawlingQ"; + public static final String INDEX_TEXT = "indexText"; + public static final String INDEX_MEDIA = "indexMedia"; + public static final String STORE_HTCACHE = "storeHTCache"; + public static final String REMOTE_INDEXING = "remoteIndexing"; + public static final String CACHE_STRAGEGY = "cacheStrategy"; + public static final String COLLECTIONS = "collections"; + public static final String CRAWLER_URL_MUSTMATCH = "crawlerURLMustMatch"; + public static final String CRAWLER_URL_MUSTNOTMATCH = "crawlerURLMustNotMatch"; + public static final String CRAWLER_IP_MUSTMATCH = "crawlerIPMustMatch"; + public static final String CRAWLER_IP_MUSTNOTMATCH = "crawlerIPMustNotMatch"; + public static final String CRAWLER_COUNTRY_MUSTMATCH = "crawlerCountryMustMatch"; + public static final String CRAWLER_URL_NODEPTHLIMITMATCH = "crawlerNoLimitURLMustMatch"; + public static final String INDEXING_URL_MUSTMATCH = "indexURLMustMatch"; + public static final String INDEXING_URL_MUSTNOTMATCH = "indexURLMustNotMatch"; private Pattern crawlerurlmustmatch = null, crawlerurlmustnotmatch = null; private Pattern crawleripmustmatch = null, crawleripmustnotmatch = null;