From 7a7a1277bd9394e9267e90a342ba7598396def7a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ryszard=20Go=C5=84?= Date: Thu, 14 Jan 2016 02:40:46 +0100 Subject: [PATCH] Add autocrawl settings page --- htroot/Autocrawl_p.html | 45 +++++++++ htroot/Autocrawl_p.java | 94 +++++++++++++++++++ .../env/templates/submenuIndexCreate.template | 1 + 3 files changed, 140 insertions(+) create mode 100644 htroot/Autocrawl_p.html create mode 100644 htroot/Autocrawl_p.java diff --git a/htroot/Autocrawl_p.html b/htroot/Autocrawl_p.html new file mode 100644 index 000000000..03d6634d0 --- /dev/null +++ b/htroot/Autocrawl_p.html @@ -0,0 +1,45 @@ + + + + YaCy '#[clientname]#': Crawl Start + #%env/templates/metas.template%# + + + + #%env/templates/header.template%# + #%env/templates/submenuIndexCreate.template%# + +

Autocrawler

+ Autocrawler automatically selects and adds tasks to the local crawl queue. This will work best when there are already quite a few domains in the index. + +
+ Autocrawler Configuration +
+
+ #(changed)#::
You need to restart for some settings to be applied
#(/changed)# +
Enable Autocrawler:
+
+
Deep crawl every:
+
+ + Warning: if this is bigger than "Rows to fetch at once", only shallow crawls will run.
+
Rows to fetch at once:
+
+
Recrawl only older than # days:
+
+
Get hosts by query:
+
+ + Can be any valid Solr query. +
+
Shallow crawl depth (0 to 2):
+
+
Deep crawl depth (1 to 5):
+
+
+
+
+
+ \ No newline at end of file diff --git a/htroot/Autocrawl_p.java b/htroot/Autocrawl_p.java new file mode 100644 index 000000000..375bb2a3a --- /dev/null +++ b/htroot/Autocrawl_p.java @@ -0,0 +1,94 @@ +import net.yacy.cora.protocol.RequestHeader; +import net.yacy.crawler.data.CrawlProfile.CrawlAttribute; +import net.yacy.search.Switchboard; +import net.yacy.search.SwitchboardConstants; +import net.yacy.server.serverObjects; +import net.yacy.server.serverSwitch; + +public class Autocrawl_p { + + public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, final serverObjects post, final serverSwitch env) { + final serverObjects prop = new serverObjects(); + final Switchboard sb = (Switchboard) env; + + boolean autocrawlEnable = sb.getConfigBool(SwitchboardConstants.AUTOCRAWL, false); + int autocrawlRatio = Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_RATIO, "50")); + int autocrawlRows = Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_ROWS, "100")); + int autocrawlDays = Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_DAYS, "1")); + String autocrawlQuery = sb.getConfig(SwitchboardConstants.AUTOCRAWL_QUERY, "*:*"); + int autocrawlShallow = Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_SHALLOW_DEPTH, "1")); + int autocrawlDeep = Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_DEEP_DEPTH, "3")); + + if (post != null) { + autocrawlEnable = post.getBoolean("autocrawlEnable"); + if (post.containsKey("autocrawlRatio")) { + autocrawlRatio = post.getInt("autocrawlRatio", 50); + } + if (post.containsKey("autocrawlRows")) { + autocrawlRows = post.getInt("autocralwRows", 100); + } + if (post.containsKey("autocrawlDays")) { + autocrawlDays = post.getInt("autocrawlDays", 1); + } + if (post.containsKey("autocrawlQuery")) { + autocrawlQuery = post.get("autocrawlQuery", "*:*"); + } + if (post.containsKey("autocrawlShallow")){ + autocrawlShallow = post.getInt("autocrawlShallow", 1); + 
} + if (post.containsKey("autocrawlDeep")) { + autocrawlDeep = post.getInt("autocrawlDeep", 3); + } + } + + if (autocrawlRatio > 500) { + autocrawlRatio = 500; + } else if (autocrawlRatio < 1) { + autocrawlRatio = 1; + } + if (autocrawlRows > 500) { + autocrawlRows = 500; + } else if (autocrawlRows < 1) { + autocrawlRows = 1; + } + if (autocrawlDays > 60) { + autocrawlDays = 60; + } else if (autocrawlDays < 1) { + autocrawlDays = 1; + } + if (autocrawlShallow > 1) { + autocrawlShallow = 2; + } else if (autocrawlShallow < 0) { + autocrawlShallow = 0; + } + if (autocrawlDeep > 5) { + autocrawlDeep = 5; + } else if (autocrawlDeep < 1) { + autocrawlDeep = 1; + } + + if (post != null) { + sb.setConfig(SwitchboardConstants.AUTOCRAWL, autocrawlEnable); + sb.setConfig(SwitchboardConstants.AUTOCRAWL_RATIO, autocrawlRatio); + sb.setConfig(SwitchboardConstants.AUTOCRAWL_ROWS, autocrawlRows); + sb.setConfig(SwitchboardConstants.AUTOCRAWL_DAYS, autocrawlDays); + sb.setConfig(SwitchboardConstants.AUTOCRAWL_QUERY, autocrawlQuery); + sb.setConfig(SwitchboardConstants.AUTOCRAWL_SHALLOW_DEPTH, autocrawlShallow); + sb.setConfig(SwitchboardConstants.AUTOCRAWL_DEEP_DEPTH, autocrawlDeep); + + sb.initAutocrawl(autocrawlEnable); + + prop.put("changed", true); + } + + prop.put("autocrawlEnable", autocrawlEnable); + prop.put("autocrawlRatio", autocrawlRatio); + prop.put("autocrawlRows", autocrawlRows); + prop.put("autocrawlDays", autocrawlDays); + prop.put("autocrawlQuery", autocrawlQuery); + prop.put("autocrawlShallow", autocrawlShallow); + prop.put("autocrawlDeep", autocrawlDeep); + + return prop; + } +} diff --git a/htroot/env/templates/submenuIndexCreate.template b/htroot/env/templates/submenuIndexCreate.template index 8f4f7664d..8715a22d9 100644 --- a/htroot/env/templates/submenuIndexCreate.template +++ b/htroot/env/templates/submenuIndexCreate.template @@ -17,6 +17,7 @@
  • Network Scanner
  • Remote Crawling
  • Scraping Proxy
  • +
  • Autocrawl