From b55ea2197f2c7c313242aa05881f0a5730b8171f Mon Sep 17 00:00:00 2001 From: orbiter Date: Tue, 13 Nov 2012 10:54:21 +0100 Subject: [PATCH] - redesign of crawl start servlet - for domain-limited crawls, the domain is deleted now by default before the crawl is started --- htroot/CrawlStartExpert_p.html | 160 +++++++++++++++++---------------- htroot/CrawlStartSite_p.html | 1 + htroot/Crawler_p.java | 5 +- 3 files changed, 86 insertions(+), 80 deletions(-) diff --git a/htroot/CrawlStartExpert_p.html b/htroot/CrawlStartExpert_p.html index 32247dc88..56a0ca907 100644 --- a/htroot/CrawlStartExpert_p.html +++ b/htroot/CrawlStartExpert_p.html @@ -47,17 +47,17 @@ - + @@ -71,13 +71,13 @@ - + + + + + + + + + + + + + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + - + - + - + - + - +
:: - +
- +
: - +
:
@@ -99,7 +99,7 @@     also all linked non-parsable documents
- Unlimited crawl depth for URLs matching with: + Unlimited crawl depth for URLs matching with:
This defines how often the Crawler will follow links (of links..) embedded in websites. @@ -109,6 +109,75 @@
: + + + + + + + +
on URLs for Crawling:
+ Restrict to start domain(s)
+ Restrict to sub-path(s)
+ Use filter
on IPs for Crawling:
on URLs for Indexing
+
The filter is a regular expression
that must match the URLs that are to be crawled; the default is 'catch all'.
Example: to allow only URLs that contain the word 'science', set the filter to '.*science.*'.
You can also use an automatic domain-restriction to fully crawl a single domain.
: + + + + +
on URLs for Crawling:
on IPs for Crawling:
on URLs for Indexing:
+
The filter is a regular expression
that must not match a URL for the content of that URL to be indexed.
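Note: as a rough illustration of how such a must-match / must-not-match pair behaves, the sketch below applies the two regular expressions to a candidate URL. It is a minimal standalone example with a hypothetical UrlFilterSketch class, not the actual CrawlProfile code:

    import java.util.regex.Pattern;

    // Minimal sketch (hypothetical class, not YaCy's CrawlProfile): a URL is
    // accepted for crawling if it matches the must-match filter and does not
    // match the must-not-match filter.
    public class UrlFilterSketch {
        private static final String MATCH_ALL = ".*";   // default "catch all"

        private final Pattern mustMatch;
        private final Pattern mustNotMatch;             // empty pattern = never match

        public UrlFilterSketch(String mustMatch, String mustNotMatch) {
            this.mustMatch = Pattern.compile(mustMatch.isEmpty() ? MATCH_ALL : mustMatch);
            this.mustNotMatch = Pattern.compile(mustNotMatch);
        }

        public boolean accept(String url) {
            if (!this.mustMatch.matcher(url).matches()) return false;
            if (!this.mustNotMatch.pattern().isEmpty()
                    && this.mustNotMatch.matcher(url).matches()) return false;
            return true;
        }

        public static void main(String[] args) {
            UrlFilterSketch f = new UrlFilterSketch(".*science.*", "");
            System.out.println(f.accept("http://example.org/science/x.html")); // true
            System.out.println(f.accept("http://example.org/sports/x.html"));  // false
        }
    }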
Document Deletion +
+
No Deletion
+
Do not delete any document before the crawl is started.
+
Delete start host
+
For each host in the start URL list, delete all documents from that host.
+
Delete only old
+
Treat documents that were loaded

 ago as stale and delete them before the crawl is started.
+
+
After a crawl has been performed in the past, documents may become stale and may eventually also be deleted on the target host.
To remove such old files from the search index it is not sufficient to just consider them for re-load; it may also be necessary
to delete them because they simply do not exist any more. Use this in combination with the re-crawl option, where this time should be the longer of the two.
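To make the 'Delete only old' option above concrete, here is a minimal standalone sketch (hypothetical names, not the servlet code) of the age check: a document counts as stale when its load date lies further in the past than the configured age:

    import java.util.Date;
    import java.util.concurrent.TimeUnit;

    // Hypothetical sketch of the "delete only old" age check: a document is
    // stale if it was loaded more than 'number' 'unit's ago.
    public class StaleDocumentCheck {
        public static boolean isStale(Date loadDate, long number, TimeUnit unit) {
            long cutoff = System.currentTimeMillis() - unit.toMillis(number);
            return loadDate.getTime() < cutoff;
        }

        public static void main(String[] args) {
            Date loadedTwoYearsAgo = new Date(System.currentTimeMillis() - TimeUnit.DAYS.toMillis(730));
            System.out.println(isStale(loadedTwoYearsAgo, 365, TimeUnit.DAYS)); // true
            System.out.println(isStale(new Date(), 365, TimeUnit.DAYS));        // false
        }
    }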
Document Double-Check
@@ -139,74 +208,7 @@ to use that check the 're-load' option.
: - Use filter   -
- Restrict to start domain
- Restrict to sub-path
- Delete all old documents in domain/subpath -
- The filter is a regular expression - that must match with the URLs which are used to be crawled; default is 'catch all'. - Example: to allow only urls that contain the word 'science', set the filter to '.*science.*'. - You can also use an automatic domain-restriction to fully crawl a single domain. -
: - - - The filter is a regular expression - that must not match to allow that the page is accepted for crawling. - The empty string is a never-match filter which should do well for most cases. - If you don't know what this means, please leave this field empty. -
: -
-
- The filter is a regular expression - that must match with the URLs to allow that the content of the url is indexed. -
: - - - The filter is a regular expression - that must not match with the URLs to allow that the content of the url is indexed. -
: - - - Like the MUST-Match Filter for URLs this filter must match, but only for the IP of the host. - YaCy performs a DNS lookup for each host and this filter restricts the crawl to specific IPs -
: - - - This filter must not match on the IP of the crawled host. -
: 
Use filter  
@@ -218,7 +220,7 @@
the IP of the server that hosts the page. The filter is not a regular expression but a list of country codes, separated by comma.
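For illustration, the country-code list could be evaluated along these lines (a hypothetical sketch, not the servlet's actual implementation): the filter is split at the commas and the country code resolved from the host IP is checked against the resulting set:

    import java.util.Arrays;
    import java.util.HashSet;
    import java.util.Set;

    // Hypothetical sketch: membership test for a comma-separated country-code
    // list such as "de,at,ch" against the country code resolved for a host IP.
    public class CountryFilterSketch {
        public static boolean accept(String countryCodeList, String hostCountryCode) {
            Set<String> allowed = new HashSet<String>(
                    Arrays.asList(countryCodeList.toLowerCase().split("\\s*,\\s*")));
            return allowed.contains(hostCountryCode.toLowerCase());
        }

        public static void main(String[] args) {
            System.out.println(accept("de,at,ch", "AT")); // true
            System.out.println(accept("de,at,ch", "fr")); // false
        }
    }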
Maximum Pages per Domain: : @@ -232,7 +234,7 @@ the given depth. Domains outside the given depth are then sorted-out anyway.
: @@ -240,14 +242,14 @@ is accessed with URLs containing question marks. If you are unsure, do not check this to avoid crawl loops.
: This option is used by default for proxy prefetch, but is not needed for explicit crawling.
: 
no cache   
@@ -263,7 +265,7 @@
cache only: never go online, use all content from cache. If no cache exists, treat content as unavailable
Do Local Indexing: : @@ -276,7 +278,7 @@ Document Cache without indexing.
: diff --git a/htroot/CrawlStartSite_p.html b/htroot/CrawlStartSite_p.html index 4ca2864f5..1238c5d97 100644 --- a/htroot/CrawlStartSite_p.html +++ b/htroot/CrawlStartSite_p.html @@ -80,6 +80,7 @@ allow query-strings (urls with a '?' in the path) + diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java index a612e9b42..b5958da94 100644 --- a/htroot/Crawler_p.java +++ b/htroot/Crawler_p.java @@ -151,7 +151,10 @@ public class Crawler_p { if (newcrawlingMustMatch.length() < 2) newcrawlingMustMatch = CrawlProfile.MATCH_ALL_STRING; // avoid that all urls are filtered out if bad value was submitted final boolean fullDomain = "domain".equals(post.get("range", "wide")); // special property in simple crawl start final boolean subPath = "subpath".equals(post.get("range", "wide")); // special property in simple crawl start - final boolean deleteold = (fullDomain || subPath || !CrawlProfile.MATCH_ALL_STRING.equals(newcrawlingMustMatch)) && post.getBoolean("deleteold"); + + final boolean restrictedcrawl = fullDomain || subPath || !CrawlProfile.MATCH_ALL_STRING.equals(newcrawlingMustMatch); + final boolean deleteold = restrictedcrawl && post.getBoolean("deleteold"); + final boolean deleteage = restrictedcrawl && "age".equals(post.get("deleteold","off")); String crawlingStart0 = post.get("crawlingURL","").trim(); // the crawljob start url String[] rootURLs0 = crawlingStart0.indexOf('\n') > 0 || crawlingStart0.indexOf('\r') > 0 ? crawlingStart0.split("[\\r\\n]+") : crawlingStart0.split(Pattern.quote("|"));
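For reference, a standalone sketch of how the new 'deleteold' radio value ('off', 'on' for delete start host, 'age' for delete only old) maps onto the two flags introduced above; it uses plain strings instead of YaCy's serverObjects and hypothetical variable names, so it only approximates the servlet logic:

    // Hypothetical standalone sketch, approximating the flag derivation in Crawler_p.java.
    public class DeleteOldFlagsSketch {
        public static void main(String[] args) {
            // true for a domain- or subpath-limited crawl, or a non-catch-all must-match filter
            boolean restrictedcrawl = true;
            // value of the 'deleteold' radio group: "off", "on" or "age"
            String deleteoldParam = "age";

            boolean deleteold = restrictedcrawl && "on".equals(deleteoldParam);
            boolean deleteage = restrictedcrawl && "age".equals(deleteoldParam);

            System.out.println("deleteold=" + deleteold + ", deleteage=" + deleteage);
        }
    }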