diff --git a/htroot/CrawlStartExpert_p.html b/htroot/CrawlStartExpert_p.html new file mode 100644 index 000000000..e9047ab6c --- /dev/null +++ b/htroot/CrawlStartExpert_p.html @@ -0,0 +1,306 @@ + + + + YaCy '#[clientname]#': Crawl Start + #%env/templates/metas.template%# + + + + + + + #%env/templates/header.template%# + #%env/templates/submenuIndexCreate.template%# +

Expert Crawl Start

+ +

+ Start Crawling Job:  + You can define URLs as start points for Web page crawling and start crawling here. "Crawling" means that YaCy will download the given website, extract all links in it and then download the content behind these links. This is repeated as long as specified under "Crawling Depth". +

+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
AttributValueDescription
Starting Point: + + + + + + + + + + + + + + + + + + + +
: + +
: + +
:
+ +
+ empty +
+
+ Existing start URLs are always re-crawled. + Other already visited URLs are sorted out as "double", if they are not allowed using the re-crawl option. +
: + This defines how often the Crawler will follow links (of links..) embedded in websites. + 0 means that only the page you enter under "Starting Point" will be added + to the index. 2-4 is good for normal indexing. Values over 8 are not useful, since a depth-8 crawl will + index approximately 25.600.000.000 pages, maybe this is the whole WWW. +
Scheduled re-crawl +
+
no doubles
+
run this crawl once and never load any page that is already known, only the start-url may be loaded again.
+
re-load
+
run this crawl once, but treat urls that are known since
+ + not as double and load them again. No scheduled re-crawl. +
+
scheduled
+
after starting this crawl, repeat the crawl every
+ + automatically. +
+
+
+ A web crawl performs a double-check on all links found in the internet against the internal database. If the same url is found again, + then the url is treated as double when you check the 'no doubles' option. A url may be loaded again when it has reached a specific age; + to use that, check the 're-load' option. If you want this web crawl to be repeated automatically, then check the 'scheduled' option. + In this case the crawl is repeated after the given time and no url from the previous crawl is omitted as double. +
: + Use filter   +
+ Restrict to start domain
+ Restrict to sub-path +
+ The filter is a regular expression + that must match the URLs which are to be crawled; default is 'catch all'. + Example: to allow only urls that contain the word 'science', set the filter to '.*science.*'. + You can also use an automatic domain-restriction to fully crawl a single domain. +
: + + + This filter must not match for a page to be accepted for crawling. + The empty string is a never-match filter which should do well for most cases. + If you don't know what this means, please leave this field empty. +
Auto-Dom-Filter: + : +    + : + + + This option will automatically create a domain-filter which limits the crawl on domains the crawler + will find on the given depth. You can use this option i.e. to crawl a page with bookmarks while + restricting the crawl on only those domains that appear on the bookmark-page. The adequate depth + for this example would be 1.
+ The default value 0 gives no restrictions. +
Maximum Pages per Domain: + : +    + : + + + You can limit the maximum number of pages that are fetched and indexed from a single domain with this option. + You can combine this limitation with the 'Auto-Dom-Filter', so that the limit is applied to all the domains within + the given depth. Domains outside the given depth are then sorted-out anyway. +
: + A question mark is usually a hint for a dynamic page. URLs pointing to dynamic content should usually not be crawled. However, there are sometimes web pages with static content that + is accessed with URLs containing question marks. If you are unsure, do not check this to avoid crawl loops. +
: + This option is used by default for proxy prefetch, but is not needed for explicit crawling. +
: + no cache    + if fresh    + if exist    + cache only + + The caching policy states when to use the cache during crawling: + no cache: never use the cache, all content from fresh internet source; + if fresh: use the cache if the cache exists and is fresh using the proxy-fresh rules; + if exist: use the cache if the cache exist. Do no check freshness. Otherwise use online source; + cache only: never go online, use all content from cache. If no cache exist, treat content as unavailable +
Do Local Indexing: + : +     + : + + + This enables indexing of the wepages the crawler will download. This should be switched on by default, unless you want to crawl only to fill the + Document Cache without indexing. +
: + + + + + +
+ + + :
+
+ This message will appear in the 'Other Peer Crawl Start' table of other peers. +
+
+ If checked, the crawler will contact other peers and use them as remote indexers for your crawl. + If you need your crawling results locally, you should switch this off. + Only senior and principal peers can initiate or receive remote crawls. + A YaCyNews message will be created to inform all peers about a global crawl, + so they can omit starting a crawl with the same start point. +
: + This can be useful to prevent extremely common words from being added to the database, e.g. "the", "he", "she", "it"... To exclude all words given in the file yacy.stopwords from indexing, + check this box. +
Create Bookmark + : + +    (works with "Starting Point: From URL" only) +

+ :    +

+ : + +
  +
+ This option lets you create a bookmark from your crawl start URL. +
+
+ + #%env/templates/footer.template%# + + diff --git a/htroot/CrawlStartExpert_p.java b/htroot/CrawlStartExpert_p.java new file mode 100644 index 000000000..82ceaaeb5 --- /dev/null +++ b/htroot/CrawlStartExpert_p.java @@ -0,0 +1,83 @@ +// CrawlStartExpert_p.java +// (C) 2004 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany +// first published 02.12.2004 as IndexCreate_p.java on http://yacy.net +// +// This is a part of YaCy, a peer-to-peer based web search engine +// +// $LastChangedDate: 2010-08-23 14:32:02 +0200 (Mo, 23 Aug 2010) $ +// $LastChangedRevision: 7068 $ +// $LastChangedBy: orbiter $ +// +// LICENSE +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. 
+// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +import net.yacy.cora.protocol.RequestHeader; +import de.anomic.crawler.CrawlProfile; +import de.anomic.search.SwitchboardConstants; +import de.anomic.server.serverObjects; +import de.anomic.server.serverSwitch; + +public class CrawlStartExpert_p { + + public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) { + // return variable that accumulates replacements + //final Switchboard sb = (Switchboard) env; + final serverObjects prop = new serverObjects(); + + // define visible variables + //String a = sb.peers.mySeed().getPublicAddress(); + //boolean intranet = sb.getConfig(SwitchboardConstants.NETWORK_NAME, "").equals("intranet"); + //String repository = "http://" + ((a == null) ? "localhost:" + sb.getConfig("port", "8080") : a) + "/repository/"; + prop.put("starturl", /*(intranet) ? repository :*/ "http://"); + prop.put("proxyPrefetchDepth", env.getConfig("proxyPrefetchDepth", "0")); + prop.put("crawlingDepth", env.getConfig("crawlingDepth", "0")); + prop.put("mustmatch", /*(intranet) ? repository + ".*" :*/ CrawlProfile.MATCH_ALL); + prop.put("mustnotmatch", CrawlProfile.MATCH_NEVER); + + prop.put("crawlingIfOlderCheck", "0"); + prop.put("crawlingIfOlderUnitYearCheck", "0"); + prop.put("crawlingIfOlderUnitMonthCheck", "0"); + prop.put("crawlingIfOlderUnitDayCheck", "1"); + prop.put("crawlingIfOlderUnitHourCheck", "0"); + prop.put("crawlingIfOlderNumber", "7"); + + final int crawlingDomFilterDepth = (int) env.getConfigLong("crawlingDomFilterDepth", -1); + prop.put("crawlingDomFilterCheck", (crawlingDomFilterDepth == -1) ? "0" : "1"); + prop.put("crawlingDomFilterDepth", (crawlingDomFilterDepth == -1) ? 
1 : crawlingDomFilterDepth); + final int crawlingDomMaxPages = (int) env.getConfigLong("crawlingDomMaxPages", -1); + prop.put("crawlingDomMaxCheck", (crawlingDomMaxPages == -1) ? "0" : "1"); + prop.put("crawlingDomMaxPages", (crawlingDomMaxPages == -1) ? 10000 : crawlingDomMaxPages); + prop.put("crawlingQChecked", env.getConfig("crawlingQ", "").equals("true") ? "1" : "0"); + prop.put("storeHTCacheChecked", env.getConfig("storeHTCache", "").equals("true") ? "1" : "0"); + prop.put("indexingTextChecked", env.getConfig("indexText", "").equals("true") ? "1" : "0"); + prop.put("indexingMediaChecked", env.getConfig("indexMedia", "").equals("true") ? "1" : "0"); + prop.put("crawlOrderChecked", env.getConfig("crawlOrder", "").equals("true") ? "1" : "0"); + + final long LCbusySleep = Integer.parseInt(env.getConfig(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL_BUSYSLEEP, "100")); + final int LCppm = (LCbusySleep == 0) ? 1000 : (int) (60000L / LCbusySleep); + prop.put("crawlingSpeedMaxChecked", (LCppm >= 1000) ? "1" : "0"); + prop.put("crawlingSpeedCustChecked", ((LCppm > 10) && (LCppm < 1000)) ? "1" : "0"); + prop.put("crawlingSpeedMinChecked", (LCppm <= 10) ? "1" : "0"); + prop.put("customPPMdefault", ((LCppm > 10) && (LCppm < 1000)) ? Integer.toString(LCppm) : ""); + + prop.put("xsstopwChecked", env.getConfig("xsstopw", "").equals("true") ? "1" : "0"); + prop.put("xdstopwChecked", env.getConfig("xdstopw", "").equals("true") ? "1" : "0"); + prop.put("xpstopwChecked", env.getConfig("xpstopw", "").equals("true") ? "1" : "0"); + + // return rewrite properties + return prop; + } +} diff --git a/htroot/CrawlStart_p.html b/htroot/CrawlStart_p.html index 0ee51c9ae..7a8753910 100644 --- a/htroot/CrawlStart_p.html +++ b/htroot/CrawlStart_p.html @@ -19,92 +19,40 @@ #%env/templates/header.template%# #%env/templates/submenuIndexCreate.template%# -

Crawl Start

+

Site Crawling

- Start Crawling Job:  - You can define URLs as start points for Web page crawling and start crawling here. "Crawling" means that YaCy will download the given website, extract all links in it and then download the content behind these links. This is repeated as long as specified under "Crawling Depth". + Site Crawler:  + Download all web pages from a given domain or base URL.

-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
AttributValueDescription
Starting Point: - - - - - - - - - - - - - - - - - - - -
: - -
: - -
:
- -
- empty -
-
- Existing start URLs are always re-crawled. - Other already visited URLs are sorted out as "double", if they are not allowed using the re-crawl option. -
: - This defines how often the Crawler will follow links (of links..) embedded in websites. - 0 means that only the page you enter under "Starting Point" will be added - to the index. 2-4 is good for normal indexing. Values over 8 are not useful, since a depth-8 crawl will - index approximately 25.600.000.000 pages, maybe this is the whole WWW. -
Scheduled re-crawl -
-
no doubles
-
run this crawl once and never load any page that is already known, only the start-url may be loaded again.
-
re-load
-
run this crawl once, but treat urls that are known since
- - not as double and load them again. No scheduled re-crawl. -
-
scheduled
-
after starting this crawl, repeat the crawl every
+
+ + + + +
+
+
+ + + + + + +
Start URL +
+ +
+ + empty +
Sitemap URL

+
+ +
+
+ run this crawl once
+ scheduled, repeat the crawl every automatically. -
-
-
- A web crawl performs a double-check on all links found in the internet against the internal database. If the same url is found again, - then the url is treated as double when you check the 'no doubles' option. A url may be loaded again when it has reached a specific age, - to use that check the 'once' option. When you want that this web crawl is repeated automatically, then check the 'scheduled' option. - In this case the crawl is repeated after the given time and no url from the previous crawl is omitted as double. -
: - Use filter   -
- Restrict to start domain
- Restrict to sub-path -
- The filter is a regular expression - that must match with the URLs which are used to be crawled; default is 'catch all'. - Example: to allow only urls that contain the word 'science', set the filter to '.*science.*'. - You can also use an automatic domain-restriction to fully crawl a single domain. -
: - - - This filter must not match to allow that the page is accepted for crawling. - The empty string is a never-match filter which should do well for most cases. - If you don't know what this means, please leave this field empty. -
Auto-Dom-Filter: - : -    - : - - - This option will automatically create a domain-filter which limits the crawl on domains the crawler - will find on the given depth. You can use this option i.e. to crawl a page with bookmarks while - restricting the crawl on only those domains that appear on the bookmark-page. The adequate depth - for this example would be 1.
- The default value 0 gives no restrictions. -
Maximum Pages per Domain: - : -    - : - - - You can limit the maximum number of pages that are fetched and indexed from a single domain with this option. - You can combine this limitation with the 'Auto-Dom-Filter', so that the limit is applied to all the domains within - the given depth. Domains outside the given depth are then sorted-out anyway. -
: - A questionmark is usually a hint for a dynamic page. URLs pointing to dynamic content should usually not be crawled. However, there are sometimes web pages with static content that - is accessed with URLs containing question marks. If you are unsure, do not check this to avoid crawl loops. -
: - This option is used by default for proxy prefetch, but is not needed for explicit crawling. -
: - no cache    - if fresh    - if exist    - cache only - - The caching policy states when to use the cache during crawling: - no cache: never use the cache, all content from fresh internet source; - if fresh: use the cache if the cache exists and is fresh using the proxy-fresh rules; - if exist: use the cache if the cache exist. Do no check freshness. Otherwise use online source; - cache only: never go online, use all content from cache. If no cache exist, treat content as unavailable -
Do Local Indexing: - : -     - : - - - This enables indexing of the wepages the crawler will download. This should be switched on by default, unless you want to crawl only to fill the - Document Cache without indexing. -
: - - - - - -
- - - :
-
- This message will appear in the 'Other Peer Crawl Start' table of other peers. -
-
- If checked, the crawler will contact other peers and use them as remote indexers for your crawl. - If you need your crawling results locally, you should switch this off. - Only senior and principal peers can initiate or receive remote crawls. - A YaCyNews message will be created to inform all peers about a global crawl, - so they can omit starting a crawl with the same start point. -
: - This can be useful to circumvent that extremely common words are added to the database, i.e. "the", "he", "she", "it"... To exclude all words given in the file yacy.stopwords from indexing, - check this box. -
Create Bookmark - : - -    (works with "Starting Point: From URL" only) -

- :    -

- : - -
  -
- This option lets you create a bookmark from your crawl start URL. -
+ +
+
+ full domain
+ only sub-path of given url +
+ + + +
+
+ + + +
not more than documents
+
+
+
allow '?' in path +
+ + + + + + + + + +
+
+
+
+ +
+ #%env/templates/footer.template%# diff --git a/htroot/CrawlStart_p.java b/htroot/CrawlStart_p.java index 212d3de71..ff0d14251 100644 --- a/htroot/CrawlStart_p.java +++ b/htroot/CrawlStart_p.java @@ -1,4 +1,4 @@ -// CrawlStartExpert_p.java +// CrawlStart_p.java // (C) 2004 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany // first published 02.12.2004 as IndexCreate_p.java on http://yacy.net // @@ -25,59 +25,12 @@ // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA import net.yacy.cora.protocol.RequestHeader; -import de.anomic.crawler.CrawlProfile; -import de.anomic.search.SwitchboardConstants; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; public class CrawlStart_p { public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) { - // return variable that accumulates replacements - //final Switchboard sb = (Switchboard) env; - final serverObjects prop = new serverObjects(); - - // define visible variables - //String a = sb.peers.mySeed().getPublicAddress(); - //boolean intranet = sb.getConfig(SwitchboardConstants.NETWORK_NAME, "").equals("intranet"); - //String repository = "http://" + ((a == null) ? "localhost:" + sb.getConfig("port", "8080") : a) + "/repository/"; - prop.put("starturl", /*(intranet) ? repository :*/ "http://"); - prop.put("proxyPrefetchDepth", env.getConfig("proxyPrefetchDepth", "0")); - prop.put("crawlingDepth", env.getConfig("crawlingDepth", "0")); - prop.put("mustmatch", /*(intranet) ? 
repository + ".*" :*/ CrawlProfile.MATCH_ALL); - prop.put("mustnotmatch", CrawlProfile.MATCH_NEVER); - - prop.put("crawlingIfOlderCheck", "0"); - prop.put("crawlingIfOlderUnitYearCheck", "0"); - prop.put("crawlingIfOlderUnitMonthCheck", "0"); - prop.put("crawlingIfOlderUnitDayCheck", "1"); - prop.put("crawlingIfOlderUnitHourCheck", "0"); - prop.put("crawlingIfOlderNumber", "7"); - - final int crawlingDomFilterDepth = (int) env.getConfigLong("crawlingDomFilterDepth", -1); - prop.put("crawlingDomFilterCheck", (crawlingDomFilterDepth == -1) ? "0" : "1"); - prop.put("crawlingDomFilterDepth", (crawlingDomFilterDepth == -1) ? 1 : crawlingDomFilterDepth); - final int crawlingDomMaxPages = (int) env.getConfigLong("crawlingDomMaxPages", -1); - prop.put("crawlingDomMaxCheck", (crawlingDomMaxPages == -1) ? "0" : "1"); - prop.put("crawlingDomMaxPages", (crawlingDomMaxPages == -1) ? 10000 : crawlingDomMaxPages); - prop.put("crawlingQChecked", env.getConfig("crawlingQ", "").equals("true") ? "1" : "0"); - prop.put("storeHTCacheChecked", env.getConfig("storeHTCache", "").equals("true") ? "1" : "0"); - prop.put("indexingTextChecked", env.getConfig("indexText", "").equals("true") ? "1" : "0"); - prop.put("indexingMediaChecked", env.getConfig("indexMedia", "").equals("true") ? "1" : "0"); - prop.put("crawlOrderChecked", env.getConfig("crawlOrder", "").equals("true") ? "1" : "0"); - - final long LCbusySleep = Integer.parseInt(env.getConfig(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL_BUSYSLEEP, "100")); - final int LCppm = (LCbusySleep == 0) ? 1000 : (int) (60000L / LCbusySleep); - prop.put("crawlingSpeedMaxChecked", (LCppm >= 1000) ? "1" : "0"); - prop.put("crawlingSpeedCustChecked", ((LCppm > 10) && (LCppm < 1000)) ? "1" : "0"); - prop.put("crawlingSpeedMinChecked", (LCppm <= 10) ? "1" : "0"); - prop.put("customPPMdefault", ((LCppm > 10) && (LCppm < 1000)) ? Integer.toString(LCppm) : ""); - - prop.put("xsstopwChecked", env.getConfig("xsstopw", "").equals("true") ? 
"1" : "0"); - prop.put("xdstopwChecked", env.getConfig("xdstopw", "").equals("true") ? "1" : "0"); - prop.put("xpstopwChecked", env.getConfig("xpstopw", "").equals("true") ? "1" : "0"); - - // return rewrite properties - return prop; + return CrawlStartExpert_p.respond(header, post, env); } } diff --git a/htroot/env/templates/submenuIndexCreate.template b/htroot/env/templates/submenuIndexCreate.template index 896c31919..581f19419 100644 --- a/htroot/env/templates/submenuIndexCreate.template +++ b/htroot/env/templates/submenuIndexCreate.template @@ -5,7 +5,8 @@