From d126d6c1b55ebdc8b27a0f03c574ba3a5f817fa5 Mon Sep 17 00:00:00 2001 From: orbiter Date: Tue, 12 Jan 2010 10:05:28 +0000 Subject: [PATCH] renamed the servlet WatchCrawler_p to Crawler_p this was done because that servlet may be used for wget/cronjob triggered crawl starts and it appears to be confusing that the name of the crawl start servlet looks like a pure monitoring tool. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6568 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/ConfigPHPBB3Search.html | 2 +- htroot/ConfigWikiSearch.html | 2 +- htroot/CrawlStart_p.html | 2 +- .../{WatchCrawler_p.html => Crawler_p.html} | 6 +- .../{WatchCrawler_p.java => Crawler_p.java} | 8 +- htroot/IndexCreate_p.html.new-layout | 355 ------------------ htroot/Status.html | 2 +- htroot/env/base.css | 2 +- htroot/env/templates/header.template | 2 +- .../templates/submenuCrawlMonitor.template | 2 +- htroot/js/{WatchCrawler.js => Crawler.js} | 6 +- htroot/js/IndexCreate.js | 2 +- locales/de.lng | 4 +- locales/fr.lng | 2 +- .../anomic/http/server/HTTPDFileHandler.java | 2 +- 15 files changed, 22 insertions(+), 377 deletions(-) rename htroot/{WatchCrawler_p.html => Crawler_p.html} (97%) rename htroot/{WatchCrawler_p.java => Crawler_p.java} (99%) delete mode 100644 htroot/IndexCreate_p.html.new-layout rename htroot/js/{WatchCrawler.js => Crawler.js} (98%) diff --git a/htroot/ConfigPHPBB3Search.html b/htroot/ConfigPHPBB3Search.html index 575258602..45a3b6594 100644 --- a/htroot/ConfigPHPBB3Search.html +++ b/htroot/ConfigPHPBB3Search.html @@ -28,7 +28,7 @@ to this page to read the integration hints below.

-
+
URL of the phpBB3 forum main page
This is a crawl start point
diff --git a/htroot/ConfigWikiSearch.html b/htroot/ConfigWikiSearch.html index f4bd8d35b..5b2860e4b 100644 --- a/htroot/ConfigWikiSearch.html +++ b/htroot/ConfigWikiSearch.html @@ -20,7 +20,7 @@ to this page to read the integration hints below.

- +
URL of the wiki main page
This is a crawl start point
diff --git a/htroot/CrawlStart_p.html b/htroot/CrawlStart_p.html index 61e218e53..165404a5c 100644 --- a/htroot/CrawlStart_p.html +++ b/htroot/CrawlStart_p.html @@ -26,7 +26,7 @@ You can define URLs as start points for Web page crawling and start crawling here. "Crawling" means that YaCy will download the given website, extract all links in it and then download the content behind these links. This is repeated as long as specified under "Crawling Depth".

- + diff --git a/htroot/WatchCrawler_p.html b/htroot/Crawler_p.html similarity index 97% rename from htroot/WatchCrawler_p.html rename to htroot/Crawler_p.html index a6c7e32d8..543c0bfd0 100644 --- a/htroot/WatchCrawler_p.html +++ b/htroot/Crawler_p.html @@ -6,8 +6,8 @@ - - + + #%env/templates/header.template%# #%env/templates/submenuCrawlMonitor.template%#

Crawler Queues

@@ -67,7 +67,7 @@
Attribut
- + diff --git a/htroot/WatchCrawler_p.java b/htroot/Crawler_p.java similarity index 99% rename from htroot/WatchCrawler_p.java rename to htroot/Crawler_p.java index 61b0bb9e5..83eb53e2e 100644 --- a/htroot/WatchCrawler_p.java +++ b/htroot/Crawler_p.java @@ -1,4 +1,4 @@ -// WatchCrawler_p.java +// Crawler_p.java // (C) 2006 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany // first published 18.12.2006 on http://www.anomic.de // this file was created using the an implementation from IndexCreate_p.java, published 02.12.2004 @@ -57,13 +57,13 @@ import de.anomic.server.serverSwitch; import de.anomic.yacy.yacyNewsPool; import de.anomic.yacy.yacyNewsRecord; -public class WatchCrawler_p { +public class Crawler_p { public static final String CRAWLING_MODE_URL = "url"; public static final String CRAWLING_MODE_FILE = "file"; public static final String CRAWLING_MODE_SITEMAP = "sitemap"; - // this servlet does NOT create the WatchCrawler page content! + // this servlet does NOT create the Crawler servlet page content! // this servlet starts a web crawl. The interface for entering the web crawl parameters is in IndexCreate_p.html public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) { @@ -131,7 +131,7 @@ public class WatchCrawler_p { prop.put("info", "3"); } else { // log a GET url for this crawl start for possible use in cronjobs - Log.logInfo("CRAWLSTART-URL", "http://localhost:" + sb.getConfig("port", "8080") + "/WatchCrawler_p.html?" + post.toString()); + Log.logInfo("CRAWLSTART-URL", "http://localhost:" + sb.getConfig("port", "8080") + "/Crawler_p.html?" 
+ post.toString()); // set new properties final boolean fullDomain = post.get("range", "wide").equals("domain"); // special property in simple crawl start diff --git a/htroot/IndexCreate_p.html.new-layout b/htroot/IndexCreate_p.html.new-layout deleted file mode 100644 index 2bf9a7999..000000000 --- a/htroot/IndexCreate_p.html.new-layout +++ /dev/null @@ -1,355 +0,0 @@ - - - - YaCy '#[clientname]#': Index Creation - #%env/templates/metas.template%# - - - - - #%env/templates/header.template%# - #%env/templates/submenuIndexCreate.template%# -

Index Creation

- -

- Start Crawling Job:  - You can define URLs as start points for Web page crawling and start crawling here. "Crawling" means that YaCy will download the given website, extract all links in it and then download the content behind these links. This is repeated as long as specified under "Crawling Depth". -

- - -
Crawling Depth -

- This defines how often the Crawler will follow links embedded in websites.
- A minimum of 0 is recommended and means that the page you enter under "Starting Point" will be added to - the index, but no linked content is indexed. 2-4 is good for normal indexing. - Be careful with the depth. Consider a branching factor of average 20; - A prefetch-depth of 8 would index 25.600.000.000 pages, maybe this is the whole WWW. -

-
-
:
-
-
-
- -
Crawling Filter -

- This is an emacs-like regular expression that must match with the URLs which are used to be crawled. - Use this i.e. to crawl a single domain. If you set this filter it makes sense to increase - the crawling depth. -

-
-
:
-
-
-
- -
Re-Crawl Option -

- If you use this option, web pages that are already existent in your database are crawled and indexed again. - It depends on the age of the last crawl if this is done or not: if the last crawl is older than the given - date, the page is crawled again, otherwise it is treated as 'double' and not loaded or indexed again. -

-
-
:
-
-
:
-
- - -
-
-
- -
Auto-Dom-Filter -

- This option will automatically create a domain-filter which limits the crawl on domains the crawler - will find on the given depth. You can use this option i.e. to crawl a page with bookmarks while - restricting the crawl on only those domains that appear on the bookmark-page. The adequate depth - for this example would be 1.
- The default value 0 gives no restrictions. -

-
-
:
-
-
:
-
-
-
- -
Maximum Pages per Domain -

- You can limit the maximum number of pages that are fetched and indexed from a single domain with this option. - You can combine this limitation with the 'Auto-Dom-Filter', so that the limit is applied to all the domains within - the given depth. Domains outside the given depth are then sorted-out anyway. -

-
-
:
-
-
:
-
-
-
- -
Accept dynamic URLs -

- A question mark is usually a hint for a dynamic page. URLs pointing to dynamic content should usually not be crawled. However, there are sometimes web pages with static content that - is accessed with URLs containing question marks. If you are unsure, do not check this to avoid crawl loops. -

-
-
:
-
-
-
- -
Store to Proxy Cache -

- This option is used by default for proxy prefetch, but is not needed for explicit crawling. - We recommend to leave this switched off unless you want to control the crawl results with the - Cache Monitor. -

-
-
:
-
-
-
- -
Local Indexing -

- This enables indexing of the web pages the crawler will download. This should be switched on by default, unless you want to crawl only to fill the - Proxy Cache without indexing. -

-
-
:
-
-
:
-
-
-
- -
Remote Indexing -

- If checked, the crawler will contact other peers and use them as remote indexers for your crawl. - If you need your crawling results locally, you should switch this off. - Only senior and principal peers can initiate or receive remote crawls. - A YaCyNews message will be created to inform all peers about a global crawl, so they can omit starting a crawl with the same start point. -

-
-
:
-
-
:
-
- - This message will appear in the 'Other Peer Crawl Start' table of other peers. -
-
-
- -
Exclude static Stop-Words -

- This can be useful to circumvent that extremely common words are added to the database, i.e. "the", "he", "she", "it"... - To exclude all words given in the file yacy.stopwords from indexing, check this box. -

-
-
:
-
-
-
- - -
Starting Point -

- Existing start URLs are re-crawled. - Other already visited URLs are sorted out as "double". - A complete re-crawl will be available soon. -

-
-
:
-
- - -
-
:
-
- - -
-
 
-
- - -
-
-
- -
- -
- - - -

- Distributed Indexing: - Crawling and indexing can be done by remote peers. - Your peer can search and index for other peers and they can search for you. -

- -
- - - - - - - - - - - - - - - - - - - - -
- - - Accept remote crawling requests and perform crawl at maximum load -
- - - Accept remote crawling requests and perform crawl at maximum of - Pages Per Minute (minimum is 1, low system load usually at PPM ≥ 30) -
- - - Do not accept remote crawling requests (please set this only if you cannot accept to crawl only one page per minute; see option above) -
- - -
- - -

- #(info)# - :: - Crawling paused successfully. - :: - Continue crawling. - #(/info)# -

- - #(refreshbutton)# - :: -
-
- -
-
- #(/refreshbutton)# -
-
- #(crawler-paused)# - - :: - - #(/crawler-paused)# -
-
- -

Recently started remote crawls in progress:

- - - - - - - - - - #{otherCrawlStartInProgress}# - - - - - - - - - #{/otherCrawlStartInProgress}# -
Start TimePeer NameStart URLIntention/DescriptionDepthAccept '?' URLs
#[cre]##[peername]##[startURL]##[intention]##[generalDepth]##(crawlingQ)#no::yes#(/crawlingQ)#
-

Recently started remote crawls, finished:

- - - - - - - - - - #{otherCrawlStartFinished}# - - - - - - - - - #{/otherCrawlStartFinished}# -
Start TimePeer NameStart URLIntention/DescriptionDepthAccept '?' URLs
#[cre]##[peername]##[startURL]##[intention]##[generalDepth]##(crawlingQ)#no::yes#(/crawlingQ)#
-

Remote Crawling Peers: 

- #(remoteCrawlPeers)# -

No remote crawl peers available.

- :: -

#[num]# peers available for remote crawling.

- - - - - - - - - - - - - -
Idle Peers - #{available}##[name]# (#[due]# seconds due)   #{/available}# -
Busy Peers - #{busy}##[name]# (#[due]# seconds due)  #{/busy}# -
- #(/remoteCrawlPeers)# - - #%env/templates/footer.template%# - - diff --git a/htroot/Status.html b/htroot/Status.html index 8265e176e..43fd1a7cd 100644 --- a/htroot/Status.html +++ b/htroot/Status.html @@ -146,7 +146,7 @@ #(hintCrawlMonitor)#::
idea
-
Your Web Page Indexer is busy. You can monitor your web crawl here. +
Your Web Page Indexer is busy. You can monitor your web crawl here.
#(/hintCrawlMonitor)#