From d8beafba3ae1044c87b66b4d339afdfc7323c506 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Thu, 9 Oct 2014 13:27:20 +0200 Subject: [PATCH] fix for values in CrawlProfileEditor table and xml; now the full profile is available in the xml. --- htroot/CrawlProfileEditor_p.html | 14 ++-- htroot/CrawlProfileEditor_p.xml | 37 +++++++--- .../net/yacy/crawler/data/CrawlProfile.java | 68 ++++++++++++------- source/net/yacy/server/serverObjects.java | 12 ++-- 4 files changed, 84 insertions(+), 47 deletions(-) diff --git a/htroot/CrawlProfileEditor_p.html b/htroot/CrawlProfileEditor_p.html index e52bf2b8e..42a331f28 100644 --- a/htroot/CrawlProfileEditor_p.html +++ b/htroot/CrawlProfileEditor_p.html @@ -32,11 +32,12 @@ Crawl Thread + Collections Status Depth Must Match Must Not Match - MaxAge + Recrawl if older than Domain Counter Content Max Page Per Domain Accept '?' URLs @@ -48,6 +49,7 @@ #{crawlProfiles}# #[name]# + #[collections]# #(terminateButton)#::
Running
@@ -64,13 +66,13 @@ #(/deleteButton)# #[depth]# - #[mustmatch]# - #[mustnotmatch]# - #[crawlingIfOlder]# + #[crawlerURLMustMatch]# + #[crawlerURLMustNotMatch]# + #[recrawlIfOlder]# #{crawlingDomFilterContent}##[item]#
#{/crawlingDomFilterContent}# - #[crawlingDomMaxPages]# + #[domMaxPages]# #(withQuery)#no::yes#(/withQuery)# - #(storeCache)#no::yes#(/storeCache)# + #(storeHTCache)#no::yes#(/storeHTCache)# #(indexText)#no::yes#(/indexText)# #(indexMedia)#no::yes#(/indexMedia)# #(remoteIndexing)#no::yes#(/remoteIndexing)# diff --git a/htroot/CrawlProfileEditor_p.xml b/htroot/CrawlProfileEditor_p.xml index 8d23bec3f..9ee5cf28c 100644 --- a/htroot/CrawlProfileEditor_p.xml +++ b/htroot/CrawlProfileEditor_p.xml @@ -2,23 +2,40 @@ #{crawlProfiles}# + #[handle]# #[name]# - #(status)#terminated::active::system#(/status)# + #[collections]# + #[agentName]# + #[userAgent]# #[depth]# - #[mustmatch]# - #[mustnotmatch]# - #[crawlingIfOlder]# + #(directDocByURL)#false::true#(/directDocByURL)# + #[recrawlIfOlder]# + #[domMaxPages]# + #(crawlingQ)#false::true#(/crawlingQ)# + #(followFrames)#false::true#(/followFrames)# + #(obeyHtmlRobotsNoindex)#false::true#(/obeyHtmlRobotsNoindex)# + #(obeyHtmlRobotsNofollow)#false::true#(/obeyHtmlRobotsNofollow)# + #(indexText)#false::true#(/indexText)# + #(indexMedia)#false::true#(/indexMedia)# + #(storeHTCache)#false::true#(/storeHTCache)# + #(remoteIndexing)#false::true#(/remoteIndexing)# + #[cacheStrategy]# + #[crawlerURLMustMatch]# + #[crawlerURLMustNotMatch]# + #[crawlerIPMustMatch]# + #[crawlerIPMustNotMatch]# + #[crawlerCountryMustMatch]# + #[crawlerNoLimitURLMustMatch]# + #[indexURLMustMatch]# + #[indexURLMustNotMatch]# + #[indexContentMustMatch]# + #[indexContentMustNotMatch]# + #(status)#terminated::active::system#(/status)# #{crawlingDomFilterContent}# #[item]# #{/crawlingDomFilterContent}# - #[crawlingDomMaxPages]# - #(withQuery)#no::yes#(/withQuery)# - #(storeCache)#no::yes#(/storeCache)# - #(indexText)#no::yes#(/indexText)# - #(indexMedia)#no::yes#(/indexMedia)# - #(remoteIndexing)#no::yes#(/remoteIndexing)# #{/crawlProfiles}# diff --git a/source/net/yacy/crawler/data/CrawlProfile.java b/source/net/yacy/crawler/data/CrawlProfile.java index 4fb782f60..909427d57 100644 --- a/source/net/yacy/crawler/data/CrawlProfile.java +++ b/source/net/yacy/crawler/data/CrawlProfile.java @@ -25,7 +25,6 @@ package net.yacy.crawler.data; -import java.text.DateFormat; import java.util.Collection; import java.util.Date; import java.util.HashMap; @@ -625,39 +624,58 @@ public class CrawlProfile extends ConcurrentHashMap implements M boolean terminateButton = active && !CrawlSwitchboard.DEFAULT_PROFILES.contains(this.name()); boolean deleteButton = !active; prop.put(CRAWL_PROFILE_PREFIX + count + "_dark", dark ? "1" : "0"); - prop.put(CRAWL_PROFILE_PREFIX + count + "_name", this.collectionName()); + prop.putXML(CRAWL_PROFILE_PREFIX + count + "_handle", this.handle()); + prop.putXML(CRAWL_PROFILE_PREFIX + count + "_name", this.name()); + //prop.putXML(CRAWL_PROFILE_PREFIX + count + "_collection", this.get(COLLECTIONS)); // TODO: remove, replace with 'collections' + prop.putXML(CRAWL_PROFILE_PREFIX + count + "_collections", this.get(COLLECTIONS)); + prop.putXML(CRAWL_PROFILE_PREFIX + count + "_agentName", this.get(AGENT_NAME)); + prop.putXML(CRAWL_PROFILE_PREFIX + count + "_userAgent", this.getAgent().userAgent); + prop.put(CRAWL_PROFILE_PREFIX + count + "_depth", this.depth()); + prop.put(CRAWL_PROFILE_PREFIX + count + "_directDocByURL", this.directDocByURL() ? 1 : 0); + prop.putXML(CRAWL_PROFILE_PREFIX + count + "_recrawlIfOlder", this.recrawlIfOlder() == Long.MAX_VALUE ? "eternity" : (new Date(this.recrawlIfOlder()).toString())); + prop.put(CRAWL_PROFILE_PREFIX + count + "_domMaxPages", this.domMaxPages()); + //prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingDomMaxPages", (this.domMaxPages() == Integer.MAX_VALUE) ? "unlimited" : Integer.toString(this.domMaxPages())); // TODO: remove, replace with 'domMaxPages' + prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingQ", this.crawlingQ() ? 1 : 0); + //prop.put(CRAWL_PROFILE_PREFIX + count + "_withQuery", (this.crawlingQ()) ? "1" : "0"); // TODO: remove, replace with crawlingQ + prop.put(CRAWL_PROFILE_PREFIX + count + "_followFrames", this.followFrames() ? 1 : 0); + prop.put(CRAWL_PROFILE_PREFIX + count + "_obeyHtmlRobotsNoindex", this.obeyHtmlRobotsNoindex() ? 1 : 0); + prop.put(CRAWL_PROFILE_PREFIX + count + "_obeyHtmlRobotsNofollow", this.obeyHtmlRobotsNofollow() ? 1 : 0); + prop.put(CRAWL_PROFILE_PREFIX + count + "_indexText", this.indexText() ? 1 : 0); + prop.put(CRAWL_PROFILE_PREFIX + count + "_indexMedia", this.indexMedia() ? 1 : 0); + //prop.put(CRAWL_PROFILE_PREFIX + count + "_storeCache", this.storeHTCache() ? 1 : 0); // TODO: remove, replace with 'storeHTCache' + prop.put(CRAWL_PROFILE_PREFIX + count + "_storeHTCache", this.storeHTCache() ? 1 : 0); + prop.put(CRAWL_PROFILE_PREFIX + count + "_remoteIndexing", this.remoteIndexing() ? 1 : 0); + prop.putXML(CRAWL_PROFILE_PREFIX + count + "_cacheStrategy", this.get(CACHE_STRAGEGY)); + prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerURLMustMatch", this.get(CRAWLER_URL_MUSTMATCH)); + prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerURLMustNotMatch", this.get(CRAWLER_URL_MUSTNOTMATCH)); + prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerIPMustMatch", this.get(CRAWLER_IP_MUSTMATCH)); + prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerIPMustNotMatch", this.get(CRAWLER_IP_MUSTNOTMATCH)); + prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerCountryMustMatch", this.get(CRAWLER_COUNTRY_MUSTMATCH)); + prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerNoLimitURLMustMatch", this.get(CRAWLER_URL_NODEPTHLIMITMATCH)); + prop.putXML(CRAWL_PROFILE_PREFIX + count + "_indexURLMustMatch", this.get(INDEXING_URL_MUSTMATCH)); + prop.putXML(CRAWL_PROFILE_PREFIX + count + "_indexURLMustNotMatch", this.get(INDEXING_URL_MUSTNOTMATCH)); + prop.putXML(CRAWL_PROFILE_PREFIX + count + "_indexContentMustMatch", this.get(INDEXING_CONTENT_MUSTMATCH)); + prop.putXML(CRAWL_PROFILE_PREFIX + count + "_indexContentMustNotMatch", this.get(INDEXING_CONTENT_MUSTNOTMATCH)); + //prop.putXML(CRAWL_PROFILE_PREFIX + count + "_mustmatch", this.urlMustMatchPattern().toString()); // TODO: remove, replace with crawlerURLMustMatch + //prop.putXML(CRAWL_PROFILE_PREFIX + count + "_mustnotmatch", this.urlMustNotMatchPattern().toString()); // TODO: remove, replace with crawlerURLMustNotMatch + //prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingIfOlder", (this.recrawlIfOlder() == 0L) ? "no re-crawl" : DateFormat.getDateTimeInstance().format(this.recrawlIfOlder())); // TODO: remove, replace with recrawlIfOlder + prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingDomFilterDepth", "inactive"); prop.put(CRAWL_PROFILE_PREFIX + count + "_status", terminateButton ? 1 : deleteButton ? 0 : 2); prop.put(CRAWL_PROFILE_PREFIX + count + "_terminateButton", terminateButton); prop.put(CRAWL_PROFILE_PREFIX + count + "_terminateButton_handle", this.handle()); prop.put(CRAWL_PROFILE_PREFIX + count + "_deleteButton", deleteButton); prop.put(CRAWL_PROFILE_PREFIX + count + "_deleteButton_handle", this.handle()); - prop.put(CRAWL_PROFILE_PREFIX + count + "_handle", this.handle()); - prop.put(CRAWL_PROFILE_PREFIX + count + "_depth", this.depth()); - prop.put(CRAWL_PROFILE_PREFIX + count + "_mustmatch", this.urlMustMatchPattern().toString()); - prop.put(CRAWL_PROFILE_PREFIX + count + "_mustnotmatch", this.urlMustNotMatchPattern().toString()); - prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingIfOlder", (this.recrawlIfOlder() == 0L) ? "no re-crawl" : DateFormat.getDateTimeInstance().format(this.recrawlIfOlder())); - prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingDomFilterDepth", "inactive"); - + int i = 0; - if (active && this.domMaxPages() > 0 - && this.domMaxPages() != Integer.MAX_VALUE) { - String item; - while (i <= domlistlength && !(item = this.domName(true, i)).isEmpty()){ - if (i == domlistlength) { - item += " ..."; + if (active && this.domMaxPages() > 0 && this.domMaxPages() != Integer.MAX_VALUE) { + String item; + while (i <= domlistlength && !(item = this.domName(true, i)).isEmpty()) { + if (i == domlistlength) item += " ..."; + prop.putHTML(CRAWL_PROFILE_PREFIX + count + "_crawlingDomFilterContent_" + i + "_item", item); + i++; } - prop.putHTML(CRAWL_PROFILE_PREFIX + count + "_crawlingDomFilterContent_" + i + "_item", item); - i++; } - } - prop.put(CRAWL_PROFILE_PREFIX+count+"_crawlingDomFilterContent", i); - prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingDomMaxPages", (this.domMaxPages() == Integer.MAX_VALUE) ? "unlimited" : Integer.toString(this.domMaxPages())); - prop.put(CRAWL_PROFILE_PREFIX + count + "_withQuery", (this.crawlingQ()) ? "1" : "0"); - prop.put(CRAWL_PROFILE_PREFIX + count + "_storeCache", (this.storeHTCache()) ? "1" : "0"); - prop.put(CRAWL_PROFILE_PREFIX + count + "_indexText", (this.indexText()) ? "1" : "0"); - prop.put(CRAWL_PROFILE_PREFIX + count + "_indexMedia", (this.indexMedia()) ? "1" : "0"); - prop.put(CRAWL_PROFILE_PREFIX + count + "_remoteIndexing", (this.remoteIndexing()) ? "1" : "0"); } } diff --git a/source/net/yacy/server/serverObjects.java b/source/net/yacy/server/serverObjects.java index c7836e045..28f14f4a9 100644 --- a/source/net/yacy/server/serverObjects.java +++ b/source/net/yacy/server/serverObjects.java @@ -307,11 +307,11 @@ public class serverObjects implements Serializable, Cloneable { * @see CharacterCoding#encodeUnicode2html(String, boolean) */ public void putHTML(final String key, final String value) { - put(key, CharacterCoding.unicode2html(value, true)); + put(key, value == null ? "" : CharacterCoding.unicode2html(value, true)); } public void putHTML(final String key, final byte[] value) { - putHTML(key, UTF8.String(value)); + putHTML(key, value == null ? "" : UTF8.String(value)); } /** @@ -321,7 +321,7 @@ public class serverObjects implements Serializable, Cloneable { * replaced in the returned String. */ public void putXML(final String key, final String value) { - put(key, CharacterCoding.unicode2xml(value, true)); + put(key, value == null ? "" : CharacterCoding.unicode2xml(value, true)); } /** @@ -332,9 +332,9 @@ public class serverObjects implements Serializable, Cloneable { * @return */ public void put(final RequestHeader.FileType fileType, final String key, final String value) { - if (fileType == FileType.JSON) putJSON(key, value); - else if (fileType == FileType.XML) putXML(key, value); - else putHTML(key, value); + if (fileType == FileType.JSON) putJSON(key, value == null ? "" : value); + else if (fileType == FileType.XML) putXML(key, value == null ? "" : value); + else putHTML(key, value == null ? "" : value); } /**