diff --git a/htroot/CrawlStartExpert_p.html b/htroot/CrawlStartExpert_p.html
new file mode 100644
index 000000000..e9047ab6c
--- /dev/null
+++ b/htroot/CrawlStartExpert_p.html
@@ -0,0 +1,306 @@
+
+
+Start Crawling Job:
+You can define URLs as start points for Web page crawling and start crawling here. "Crawling" means that YaCy will download the given website, extract all links in it and then download the content behind these links. This is repeated as long as specified under "Crawling Depth".
+
+
+
+#%env/templates/footer.template%#
+
+
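The "Crawling Depth" behaviour described above amounts to a depth-bounded, breadth-first traversal: download a page, extract its links, and follow them only while the configured depth has not been reached. The sketch below illustrates that loop in plain Java; fetchAndExtractLinks is a hypothetical stand-in for YaCy's real fetch/parse pipeline, not an actual YaCy API.

import java.util.ArrayDeque;
import java.util.HashSet;
import java.util.List;
import java.util.Queue;
import java.util.Set;

final class DepthLimitedCrawlSketch {

    // A URL together with the depth at which it was discovered.
    record Entry(String url, int depth) {}

    // Hypothetical stand-in for YaCy's fetch-and-parse step: download a
    // page and return the links extracted from it.
    static List<String> fetchAndExtractLinks(String url) {
        return List.of(); // stub for illustration
    }

    static void crawl(String startUrl, int crawlingDepth) {
        final Queue<Entry> frontier = new ArrayDeque<>();
        final Set<String> seen = new HashSet<>();
        frontier.add(new Entry(startUrl, 0));
        seen.add(startUrl);
        while (!frontier.isEmpty()) {
            final Entry e = frontier.poll();
            final List<String> links = fetchAndExtractLinks(e.url()); // download + extract
            if (e.depth() >= crawlingDepth) {
                continue; // depth bound reached: fetch the page, but follow no further links
            }
            for (final String link : links) {
                if (seen.add(link)) {
                    frontier.add(new Entry(link, e.depth() + 1));
                }
            }
        }
    }
}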
diff --git a/htroot/CrawlStartExpert_p.java b/htroot/CrawlStartExpert_p.java
new file mode 100644
index 000000000..82ceaaeb5
--- /dev/null
+++ b/htroot/CrawlStartExpert_p.java
@@ -0,0 +1,83 @@
+// CrawlStartExpert_p.java
+// (C) 2004 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
+// first published 02.12.2004 as IndexCreate_p.java on http://yacy.net
+//
+// This is a part of YaCy, a peer-to-peer based web search engine
+//
+// $LastChangedDate: 2010-08-23 14:32:02 +0200 (Mo, 23 Aug 2010) $
+// $LastChangedRevision: 7068 $
+// $LastChangedBy: orbiter $
+//
+// LICENSE
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+import net.yacy.cora.protocol.RequestHeader;
+import de.anomic.crawler.CrawlProfile;
+import de.anomic.search.SwitchboardConstants;
+import de.anomic.server.serverObjects;
+import de.anomic.server.serverSwitch;
+
+public class CrawlStartExpert_p {
+
+    public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
+        // return variable that accumulates replacements
+        //final Switchboard sb = (Switchboard) env;
+        final serverObjects prop = new serverObjects();
+
+        // define visible variables
+        //String a = sb.peers.mySeed().getPublicAddress();
+        //boolean intranet = sb.getConfig(SwitchboardConstants.NETWORK_NAME, "").equals("intranet");
+        //String repository = "http://" + ((a == null) ? "localhost:" + sb.getConfig("port", "8080") : a) + "/repository/";
+        prop.put("starturl", /*(intranet) ? repository :*/ "http://");
+        prop.put("proxyPrefetchDepth", env.getConfig("proxyPrefetchDepth", "0"));
+        prop.put("crawlingDepth", env.getConfig("crawlingDepth", "0"));
+        prop.put("mustmatch", /*(intranet) ? repository + ".*" :*/ CrawlProfile.MATCH_ALL);
+        prop.put("mustnotmatch", CrawlProfile.MATCH_NEVER);
+
+        prop.put("crawlingIfOlderCheck", "0");
+        prop.put("crawlingIfOlderUnitYearCheck", "0");
+        prop.put("crawlingIfOlderUnitMonthCheck", "0");
+        prop.put("crawlingIfOlderUnitDayCheck", "1");
+        prop.put("crawlingIfOlderUnitHourCheck", "0");
+        prop.put("crawlingIfOlderNumber", "7");
+
+        final int crawlingDomFilterDepth = (int) env.getConfigLong("crawlingDomFilterDepth", -1);
+        prop.put("crawlingDomFilterCheck", (crawlingDomFilterDepth == -1) ? "0" : "1");
+        prop.put("crawlingDomFilterDepth", (crawlingDomFilterDepth == -1) ? 1 : crawlingDomFilterDepth);
+        final int crawlingDomMaxPages = (int) env.getConfigLong("crawlingDomMaxPages", -1);
+        prop.put("crawlingDomMaxCheck", (crawlingDomMaxPages == -1) ? "0" : "1");
+        prop.put("crawlingDomMaxPages", (crawlingDomMaxPages == -1) ? 10000 : crawlingDomMaxPages);
+        prop.put("crawlingQChecked", env.getConfig("crawlingQ", "").equals("true") ? "1" : "0");
+        prop.put("storeHTCacheChecked", env.getConfig("storeHTCache", "").equals("true") ? "1" : "0");
+        prop.put("indexingTextChecked", env.getConfig("indexText", "").equals("true") ? "1" : "0");
+        prop.put("indexingMediaChecked", env.getConfig("indexMedia", "").equals("true") ? "1" : "0");
+        prop.put("crawlOrderChecked", env.getConfig("crawlOrder", "").equals("true") ? "1" : "0");
+
+        final long LCbusySleep = Integer.parseInt(env.getConfig(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL_BUSYSLEEP, "100"));
+        final int LCppm = (LCbusySleep == 0) ? 1000 : (int) (60000L / LCbusySleep);
+        prop.put("crawlingSpeedMaxChecked", (LCppm >= 1000) ? "1" : "0");
+        prop.put("crawlingSpeedCustChecked", ((LCppm > 10) && (LCppm < 1000)) ? "1" : "0");
+        prop.put("crawlingSpeedMinChecked", (LCppm <= 10) ? "1" : "0");
+        prop.put("customPPMdefault", ((LCppm > 10) && (LCppm < 1000)) ? Integer.toString(LCppm) : "");
+
+        prop.put("xsstopwChecked", env.getConfig("xsstopw", "").equals("true") ? "1" : "0");
+        prop.put("xdstopwChecked", env.getConfig("xdstopw", "").equals("true") ? "1" : "0");
+        prop.put("xpstopwChecked", env.getConfig("xpstopw", "").equals("true") ? "1" : "0");
+
+        // return rewrite properties
+        return prop;
+    }
+}
diff --git a/htroot/CrawlStart_p.html b/htroot/CrawlStart_p.html
index 0ee51c9ae..7a8753910 100644
--- a/htroot/CrawlStart_p.html
+++ b/htroot/CrawlStart_p.html
@@ -19,92 +19,40 @@
 #%env/templates/header.template%#
 #%env/templates/submenuIndexCreate.template%#
-
-Start Crawling Job:
-You can define URLs as start points for Web page crawling and start crawling here. "Crawling" means that YaCy will download the given website, extract all links in it and then download the content behind these links. This is repeated as long as specified under "Crawling Depth".
+Site Crawler:
+Download all web pages from a given domain or base URL.
-
+
 #%env/templates/footer.template%#
+
+
diff --git a/htroot/CrawlStart_p.java b/htroot/CrawlStart_p.java
index 212d3de71..ff0d14251 100644
--- a/htroot/CrawlStart_p.java
+++ b/htroot/CrawlStart_p.java
@@ -1,4 +1,4 @@
-// CrawlStartExpert_p.java
+// CrawlStart_p.java
 // (C) 2004 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
 // first published 02.12.2004 as IndexCreate_p.java on http://yacy.net
 //
@@ -25,59 +25,12 @@
 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 
 import net.yacy.cora.protocol.RequestHeader;
-import de.anomic.crawler.CrawlProfile;
-import de.anomic.search.SwitchboardConstants;
 import de.anomic.server.serverObjects;
 import de.anomic.server.serverSwitch;
 
 public class CrawlStart_p {
 
     public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
-        // return variable that accumulates replacements
-        //final Switchboard sb = (Switchboard) env;
-        final serverObjects prop = new serverObjects();
-
-        // define visible variables
-        //String a = sb.peers.mySeed().getPublicAddress();
-        //boolean intranet = sb.getConfig(SwitchboardConstants.NETWORK_NAME, "").equals("intranet");
-        //String repository = "http://" + ((a == null) ? "localhost:" + sb.getConfig("port", "8080") : a) + "/repository/";
-        prop.put("starturl", /*(intranet) ? repository :*/ "http://");
-        prop.put("proxyPrefetchDepth", env.getConfig("proxyPrefetchDepth", "0"));
-        prop.put("crawlingDepth", env.getConfig("crawlingDepth", "0"));
-        prop.put("mustmatch", /*(intranet) ? repository + ".*" :*/ CrawlProfile.MATCH_ALL);
-        prop.put("mustnotmatch", CrawlProfile.MATCH_NEVER);
-
-        prop.put("crawlingIfOlderCheck", "0");
-        prop.put("crawlingIfOlderUnitYearCheck", "0");
-        prop.put("crawlingIfOlderUnitMonthCheck", "0");
-        prop.put("crawlingIfOlderUnitDayCheck", "1");
-        prop.put("crawlingIfOlderUnitHourCheck", "0");
-        prop.put("crawlingIfOlderNumber", "7");
-
-        final int crawlingDomFilterDepth = (int) env.getConfigLong("crawlingDomFilterDepth", -1);
-        prop.put("crawlingDomFilterCheck", (crawlingDomFilterDepth == -1) ? "0" : "1");
-        prop.put("crawlingDomFilterDepth", (crawlingDomFilterDepth == -1) ? 1 : crawlingDomFilterDepth);
-        final int crawlingDomMaxPages = (int) env.getConfigLong("crawlingDomMaxPages", -1);
-        prop.put("crawlingDomMaxCheck", (crawlingDomMaxPages == -1) ? "0" : "1");
-        prop.put("crawlingDomMaxPages", (crawlingDomMaxPages == -1) ? 10000 : crawlingDomMaxPages);
-        prop.put("crawlingQChecked", env.getConfig("crawlingQ", "").equals("true") ? "1" : "0");
-        prop.put("storeHTCacheChecked", env.getConfig("storeHTCache", "").equals("true") ? "1" : "0");
-        prop.put("indexingTextChecked", env.getConfig("indexText", "").equals("true") ? "1" : "0");
-        prop.put("indexingMediaChecked", env.getConfig("indexMedia", "").equals("true") ? "1" : "0");
-        prop.put("crawlOrderChecked", env.getConfig("crawlOrder", "").equals("true") ? "1" : "0");
-
-        final long LCbusySleep = Integer.parseInt(env.getConfig(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL_BUSYSLEEP, "100"));
-        final int LCppm = (LCbusySleep == 0) ? 1000 : (int) (60000L / LCbusySleep);
-        prop.put("crawlingSpeedMaxChecked", (LCppm >= 1000) ? "1" : "0");
-        prop.put("crawlingSpeedCustChecked", ((LCppm > 10) && (LCppm < 1000)) ? "1" : "0");
-        prop.put("crawlingSpeedMinChecked", (LCppm <= 10) ? "1" : "0");
-        prop.put("customPPMdefault", ((LCppm > 10) && (LCppm < 1000)) ? Integer.toString(LCppm) : "");
-
-        prop.put("xsstopwChecked", env.getConfig("xsstopw", "").equals("true") ? "1" : "0");
-        prop.put("xdstopwChecked", env.getConfig("xdstopw", "").equals("true") ? "1" : "0");
-        prop.put("xpstopwChecked", env.getConfig("xpstopw", "").equals("true") ? "1" : "0");
-
-        // return rewrite properties
-        return prop;
+        return CrawlStartExpert_p.respond(header, post, env);
     }
 }
diff --git a/htroot/env/templates/submenuIndexCreate.template b/htroot/env/templates/submenuIndexCreate.template
index 896c31919..581f19419 100644
--- a/htroot/env/templates/submenuIndexCreate.template
+++ b/htroot/env/templates/submenuIndexCreate.template
@@ -5,7 +5,8 @@