run this crawl once and never load any page that is already known, only the start-url may be loaded again.
-
re-load
-
run this crawl once, but treat urls that are known since
-
A web crawl performs a double-check on all links found in the internet against the internal database. If the same url is found again,
then the url is treated as double when you check the 'no doubles' option. A url may be loaded again when it has reached a specific age,
- to use that check the 're-load' option. When you want that this web crawl is repeated automatically, then check the 'scheduled' option.
- In this case the crawl is repeated after the given time and no url from the previous crawl is omitted as double.
+ to use that, check the 're-load' option.
- run this crawl once
- scheduled, look every
-
-
-
-
-
-
-
-
-
-
-
-
- for new documents automatically.
@@ -97,6 +79,7 @@
allow query-strings (urls with a '?' in the path)
+
diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java
index 47212540e..a612e9b42 100644
--- a/htroot/Crawler_p.java
+++ b/htroot/Crawler_p.java
@@ -214,35 +214,14 @@ public class Crawler_p {
// recrawl
final String recrawl = post.get("recrawl", "nodoubles"); // nodoubles, reload, scheduler
- boolean crawlingIfOlderCheck = "on".equals(post.get("crawlingIfOlderCheck", "off"));
- int crawlingIfOlderNumber = post.getInt("crawlingIfOlderNumber", -1);
- String crawlingIfOlderUnit = post.get("crawlingIfOlderUnit","year"); // year, month, day, hour
- int repeat_time = post.getInt("repeat_time", -1);
- final String repeat_unit = post.get("repeat_unit", "seldays"); // selminutes, selhours, seldays
-
- if ("scheduler".equals(recrawl) && repeat_time > 0) {
- // set crawlingIfOlder attributes that are appropriate for scheduled crawling
- crawlingIfOlderCheck = true;
- crawlingIfOlderNumber = "selminutes".equals(repeat_unit) ? 1 : "selhours".equals(repeat_unit) ? repeat_time / 2 : repeat_time * 12;
- crawlingIfOlderUnit = "hour";
- } else if ("reload".equals(recrawl)) {
- repeat_time = -1;
- crawlingIfOlderCheck = true;
- } else if ("nodoubles".equals(recrawl)) {
- repeat_time = -1;
- crawlingIfOlderCheck = false;
+ long crawlingIfOlder = 0;
+ if ("reload".equals(recrawl)) {
+ crawlingIfOlder = timeParser(true, post.getInt("reloadIfOlderNumber", -1), post.get("reloadIfOlderUnit","year")); // year, month, day, hour
}
- final long crawlingIfOlder = recrawlIfOlderC(crawlingIfOlderCheck, crawlingIfOlderNumber, crawlingIfOlderUnit);
env.setConfig("crawlingIfOlder", crawlingIfOlder);
// store this call as api call
- if (repeat_time > 0) {
- // store as scheduled api call
- sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + ((rootURLs.size() == 0) ? post.get("crawlingFile", "") : rootURLs.iterator().next().toNormalform(true)), repeat_time, repeat_unit.substring(3));
- } else {
- // store just a protocol
- sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + ((rootURLs.size() == 0) ? post.get("crawlingFile", "") : rootURLs.iterator().next().toNormalform(true)));
- }
+ sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + ((rootURLs.size() == 0) ? post.get("crawlingFile", "") : rootURLs.iterator().next().toNormalform(true)));
final boolean crawlingDomMaxCheck = "on".equals(post.get("crawlingDomMaxCheck", "off"));
final int crawlingDomMaxPages = (crawlingDomMaxCheck) ? post.getInt("crawlingDomMaxPages", -1) : -1;
@@ -564,13 +543,14 @@ public class Crawler_p {
return prop;
}
- private static long recrawlIfOlderC(final boolean recrawlIfOlderCheck, final int recrawlIfOlderNumber, final String crawlingIfOlderUnit) {
+ private static long timeParser(final boolean recrawlIfOlderCheck, final int number, final String unit) {
if (!recrawlIfOlderCheck) return 0L;
- if ("year".equals(crawlingIfOlderUnit)) return System.currentTimeMillis() - recrawlIfOlderNumber * 1000L * 60L * 60L * 24L * 365L;
- if ("month".equals(crawlingIfOlderUnit)) return System.currentTimeMillis() - recrawlIfOlderNumber * 1000L * 60L * 60L * 24L * 30L;
- if ("day".equals(crawlingIfOlderUnit)) return System.currentTimeMillis() - recrawlIfOlderNumber * 1000L * 60L * 60L * 24L;
- if ("hour".equals(crawlingIfOlderUnit)) return System.currentTimeMillis() - recrawlIfOlderNumber * 1000L * 60L * 60L;
- return System.currentTimeMillis() - recrawlIfOlderNumber;
+ if ("year".equals(unit)) return System.currentTimeMillis() - number * 1000L * 60L * 60L * 24L * 365L;
+ if ("month".equals(unit)) return System.currentTimeMillis() - number * 1000L * 60L * 60L * 24L * 30L;
+ if ("day".equals(unit)) return System.currentTimeMillis() - number * 1000L * 60L * 60L * 24L;
+ if ("hour".equals(unit)) return System.currentTimeMillis() - number * 1000L * 60L * 60L;
+ if ("minute".equals(unit)) return System.currentTimeMillis() - number * 1000L * 60L;
+ return 0L;
}
private static void setPerformance(final Switchboard sb, final serverObjects post) {
diff --git a/htroot/IndexCreateDomainCrawl_p.html b/htroot/IndexCreateDomainCrawl_p.html
deleted file mode 100644
index fb7ccfaa1..000000000
--- a/htroot/IndexCreateDomainCrawl_p.html
+++ /dev/null
@@ -1,71 +0,0 @@
-
-
-
- YaCy '#[clientname]#': Index Creation with a Web Crawl for a Single Domain
- #%env/templates/metas.template%#
-
-
-
-
- #%env/templates/header.template%#
- #%env/templates/submenuIndexCreate.template%#
-
Easy Crawl Start
-
-
- Start Crawling Job:
- You can define URLs as start points for Web page crawling and start crawling here.
- "Crawling" means that YaCy will download the given web-site, extract all links in it
- and then download the content behind these links.
- This is repeated as long as specified under "Crawling Depth".
-
-
-
-
- #%env/templates/footer.template%#
-
-
\ No newline at end of file
diff --git a/htroot/IndexCreateDomainCrawl_p.java b/htroot/IndexCreateDomainCrawl_p.java
deleted file mode 100644
index b411fc542..000000000
--- a/htroot/IndexCreateDomainCrawl_p.java
+++ /dev/null
@@ -1,96 +0,0 @@
-// IndexCreateDomainCrawl_p
-// (C) 2004 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
-// first published 02.12.2004 as IndexCreate_p.java on http://yacy.net
-//
-// This is a part of YaCy, a peer-to-peer based web search engine
-//
-// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
-// $LastChangedRevision: 1986 $
-// $LastChangedBy: orbiter $
-//
-// LICENSE
-//
-// This program is free software; you can redistribute it and/or modify
-// it under the terms of the GNU General Public License as published by
-// the Free Software Foundation; either version 2 of the License, or
-// (at your option) any later version.
-//
-// This program is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU General Public License
-// along with this program; if not, write to the Free Software
-// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-
-import net.yacy.cora.protocol.RequestHeader;
-import net.yacy.search.SwitchboardConstants;
-import net.yacy.server.serverObjects;
-import net.yacy.server.serverSwitch;
-
-public class IndexCreateDomainCrawl_p {
-
- public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, @SuppressWarnings("unused") final serverObjects post, final serverSwitch env) {
- // return variable that accumulates replacements
- //Switchboard sb = (Switchboard) env;
- serverObjects prop = new serverObjects();
-
- // define visible variables
- prop.put("proxyPrefetchDepth", env.getConfig("proxyPrefetchDepth", "0"));
- prop.put("crawlingDepth", Math.min(3, env.getConfigLong("crawlingDepth", 0)));
- prop.put("crawlingFilter", env.getConfig("crawlingFilter", "0"));
-
- int crawlingIfOlder = (int) env.getConfigLong("crawlingIfOlder", -1);
- prop.put("crawlingIfOlderCheck", (crawlingIfOlder == -1) ? "0" : "1");
- prop.put("crawlingIfOlderUnitYearCheck", "0");
- prop.put("crawlingIfOlderUnitMonthCheck", "0");
- prop.put("crawlingIfOlderUnitDayCheck", "0");
- prop.put("crawlingIfOlderUnitHourCheck", "0");
- prop.put("crawlingIfOlderUnitMinuteCheck", "0");
- if ((crawlingIfOlder == -1) || (crawlingIfOlder == Integer.MAX_VALUE)) {
- prop.put("crawlingIfOlderNumber", "-1");
- prop.put("crawlingIfOlderUnitYearCheck", "1");
- } else if (crawlingIfOlder >= 60*24*365) {
- prop.put("crawlingIfOlderNumber", Math.round((float)crawlingIfOlder / (float)(60*24*365)));
- prop.put("crawlingIfOlderUnitYearCheck", "1");
- } else if (crawlingIfOlder >= 60*24*30) {
- prop.put("crawlingIfOlderNumber", Math.round((float)crawlingIfOlder / (float)(60*24*30)));
- prop.put("crawlingIfOlderUnitMonthCheck", "1");
- } else if (crawlingIfOlder >= 60*24) {
- prop.put("crawlingIfOlderNumber", Math.round((float)crawlingIfOlder / (float)(60*24)));
- prop.put("crawlingIfOlderUnitDayCheck", "1");
- } else if (crawlingIfOlder >= 60) {
- prop.put("crawlingIfOlderNumber", Math.round(crawlingIfOlder / 60f));
- prop.put("crawlingIfOlderUnitHourCheck", "1");
- } else {
- prop.put("crawlingIfOlderNumber", crawlingIfOlder);
- prop.put("crawlingIfOlderUnitMinuteCheck", "1");
- }
- int crawlingDomFilterDepth = (int) env.getConfigLong("crawlingDomFilterDepth", -1);
- prop.put("crawlingDomFilterCheck", (crawlingDomFilterDepth == -1) ? "0" : "1");
- prop.put("crawlingDomFilterDepth", (crawlingDomFilterDepth == -1) ? 1 : crawlingDomFilterDepth);
- int crawlingDomMaxPages = (int) env.getConfigLong("crawlingDomMaxPages", -1);
- prop.put("crawlingDomMaxCheck", (crawlingDomMaxPages == -1) ? "0" : "1");
- prop.put("crawlingDomMaxPages", (crawlingDomMaxPages == -1) ? 10000 : crawlingDomMaxPages);
- prop.put("crawlingQChecked", env.getConfigBool("crawlingQ", false) ? "1" : "0");
- prop.put("storeHTCacheChecked", env.getConfigBool("storeHTCache", false) ? "1" : "0");
- prop.put("indexingTextChecked", env.getConfigBool("indexText", false) ? "1" : "0");
- prop.put("indexingMediaChecked", env.getConfigBool("indexMedia", false) ? "1" : "0");
- prop.put("crawlOrderChecked", env.getConfigBool("crawlOrder", false) ? "1" : "0");
-
- long LCbusySleep = env.getConfigLong(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL_BUSYSLEEP, 100L);
- int LCppm = (LCbusySleep == 0) ? 1000 : (int) (60000L / LCbusySleep);
- prop.put("crawlingSpeedMaxChecked", (LCppm >= 1000) ? "1" : "0");
- prop.put("crawlingSpeedCustChecked", ((LCppm > 10) && (LCppm < 1000)) ? "1" : "0");
- prop.put("crawlingSpeedMinChecked", (LCppm <= 10) ? "1" : "0");
- prop.put("customPPMdefault", ((LCppm > 10) && (LCppm < 1000)) ? Integer.toString(LCppm) : "");
-
- prop.put("xsstopwChecked", env.getConfigBool("xsstopw", false) ? "1" : "0");
- prop.put("xdstopwChecked", env.getConfigBool("xdstopw", false) ? "1" : "0");
- prop.put("xpstopwChecked", env.getConfigBool("xpstopw", false) ? "1" : "0");
-
- // return rewrite properties
- return prop;
- }
-}
\ No newline at end of file