enhanced input options for crawl start

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1978 6c8d7289-2bf4-0310-a012-ef5d649a1542
commit 860a7b545b
parent d181d3fde7
Author: orbiter

@@ -46,8 +46,8 @@ You can define URLs as start points for Web page crawling and start crawling her
<tr valign="top" class="TableCellLight">
<td class=small>Re-Crawl Option:</td>
<td class=small>
<input type="checkbox" name="crawlingIfOlderCheck" align="top" #(crawlingIfOlderCheck)#::checked#(/crawlingIfOlderCheck)#>
<input name="crawlingIfOlderNumber" type="text" size="7" maxlength="7" value="#[crawlingIfOlderNumber]#"><br>
Use:<input type="checkbox" name="crawlingIfOlderCheck" align="top" #(crawlingIfOlderCheck)#::checked#(/crawlingIfOlderCheck)#>&nbsp;&nbsp;
Time:<input name="crawlingIfOlderNumber" type="text" size="7" maxlength="7" value="#[crawlingIfOlderNumber]#"><br>
<input type="radio" name="crawlingIfOlderUnit" value="year" #(crawlingIfOlderUnitYearCheck)#::checked#(/crawlingIfOlderUnitYearCheck)#>Year(s)&nbsp;&nbsp;
<input type="radio" name="crawlingIfOlderUnit" value="month" #(crawlingIfOlderUnitMonthCheck)#::checked#(/crawlingIfOlderUnitMonthCheck)#>Month(s)&nbsp;&nbsp;
<input type="radio" name="crawlingIfOlderUnit" value="day" #(crawlingIfOlderUnitDayCheck)#::checked#(/crawlingIfOlderUnitDayCheck)#>Day(s)&nbsp;&nbsp;
@@ -60,17 +60,26 @@ You can define URLs as start points for Web page crawling and start crawling her
</td>
</tr>
<tr valign="top" class="TableCellDark">
<td class=small>Auto-Dom-Filter Depth:</td>
<td class=small><input name="crawlingDomFilterDepth" type="text" size="2" maxlength="2" value="#[crawlingDomFilterDepth]#"></td>
<td class=small>Auto-Dom-Filter:</td>
<td class=small>
Use:<input type="checkbox" name="crawlingDomFilterCheck" align="top" #(crawlingDomFilterCheck)#::checked#(/crawlingDomFilterCheck)#>&nbsp;&nbsp;
Depth:<input name="crawlingDomFilterDepth" type="text" size="2" maxlength="2" value="#[crawlingDomFilterDepth]#"></td>
<td class=small>
This option causes the creation of a domain-list during indexing. The list is filled only with domains that
appear at the given depth during crawling. The domain-list is then used to filter out all domains that appear
at depths greater than the given depth but are not contained in the domain-list. You can use this option e.g.
to crawl pages with bookmarks while restricting the crawl to only those domains that appear on the bookmark-page.
</td>
</tr>
<tr valign="top" class="TableCellLight">
<td class=small>Maximum Pages per Domain:</td>
<td class=small><input name="crawlingDomMaxPages" type="text" size="6" maxlength="6" value="#[crawlingDomMaxPages]#"></td>
<td class=small>
Use:<input type="checkbox" name="crawlingDomMaxCheck" align="top" #(crawlingDomMaxCheck)#::checked#(/crawlingDomMaxCheck)#>&nbsp;&nbsp;
Page-Count:<input name="crawlingDomMaxPages" type="text" size="6" maxlength="6" value="#[crawlingDomMaxPages]#"></td>
<td class=small>
You can limit the maximum number of pages that are fetched and indexed from a single domain with this option.
You can combine this limitation with the 'Auto-Dom-Filter', so that the limit is applied to all the domains within
the given depth. Domains outside the given depth are sorted out anyway.
</td>
</tr>
<tr valign="top" class="TableCellDark">
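The two form descriptions above explain how the new Auto-Dom-Filter and the per-domain page limit are meant to interact during a crawl. Below is a minimal, hypothetical sketch of that filtering idea; the class and method names (DomainCrawlFilter, accept) are invented for illustration and are not part of the YaCy sources, and the real crawler logic differs in detail.

import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

// Hypothetical illustration of the filtering described in the form texts above.
public class DomainCrawlFilter {

    private final int filterDepth;        // depth up to which new domains are collected (-1 = option disabled)
    private final int maxPagesPerDomain;  // per-domain page limit (-1 = unlimited)
    private final Set<String> allowedDomains = new HashSet<String>();
    private final Map<String, Integer> pagesPerDomain = new HashMap<String, Integer>();

    public DomainCrawlFilter(int filterDepth, int maxPagesPerDomain) {
        this.filterDepth = filterDepth;
        this.maxPagesPerDomain = maxPagesPerDomain;
    }

    // Decide whether a URL on the given host, found at the given crawl depth, may be fetched.
    public boolean accept(String host, int depth) {
        if (filterDepth >= 0) {
            if (depth <= filterDepth) {
                // within the filter depth: remember the domain for later
                allowedDomains.add(host);
            } else if (!allowedDomains.contains(host)) {
                // deeper than the filter depth and the domain was never seen before: drop it
                return false;
            }
        }
        if (maxPagesPerDomain >= 0) {
            Integer seen = pagesPerDomain.get(host);
            int count = (seen == null) ? 0 : seen.intValue();
            if (count >= maxPagesPerDomain) return false;  // per-domain page limit reached
            pagesPerDomain.put(host, Integer.valueOf(count + 1));
        }
        return true;
    }
}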

@@ -93,30 +93,43 @@ public class IndexCreate_p {
// set new properties
String newcrawlingfilter = post.get("crawlingFilter", ".*");
env.setConfig("crawlingFilter", newcrawlingfilter);
int newcrawlingdepth = Integer.parseInt(post.get("crawlingDepth", "0"));
env.setConfig("crawlingDepth", Integer.toString(newcrawlingdepth));
boolean crawlingIfOlderCheck = post.get("crawlingIfOlderCheck", "").equals("on");
boolean crawlingIfOlderCheck = post.get("crawlingIfOlderCheck", "off").equals("on");
int crawlingIfOlderNumber = Integer.parseInt(post.get("crawlingIfOlderNumber", "-1"));
String crawlingIfOlderUnit = post.get("crawlingIfOlderUnit","year");
int crawlingIfOlder = recrawlIfOlderC(crawlingIfOlderCheck, crawlingIfOlderNumber, crawlingIfOlderUnit);
env.setConfig("crawlingIfOlder", crawlingIfOlder);
int domFilterDepth = Integer.parseInt(post.get("crawlingDomFilterDepth", "-1"));
env.setConfig("crawlingDomFilterDepth", Integer.toString(domFilterDepth));
int domMaxPages = Integer.parseInt(post.get("crawlingDomMaxPages", "-1"));
env.setConfig("crawlingDomMaxPages", Integer.toString(domMaxPages));
boolean crawlingQ = post.get("crawlingQ", "").equals("on");
boolean crawlingDomFilterCheck = post.get("crawlingDomFilterCheck", "off").equals("on");
int crawlingDomFilterDepth = (crawlingDomFilterCheck) ? Integer.parseInt(post.get("crawlingDomFilterDepth", "-1")) : -1;
env.setConfig("crawlingDomFilterDepth", Integer.toString(crawlingDomFilterDepth));
boolean crawlingDomMaxCheck = post.get("crawlingDomMaxCheck", "off").equals("on");
int crawlingDomMaxPages = (crawlingDomMaxCheck) ? Integer.parseInt(post.get("crawlingDomMaxPages", "-1")) : -1;
env.setConfig("crawlingDomMaxPages", Integer.toString(crawlingDomMaxPages));
boolean crawlingQ = post.get("crawlingQ", "off").equals("on");
env.setConfig("crawlingQ", (crawlingQ) ? "true" : "false");
boolean storeHTCache = post.get("storeHTCache", "").equals("on");
boolean storeHTCache = post.get("storeHTCache", "off").equals("on");
env.setConfig("storeHTCache", (storeHTCache) ? "true" : "false");
boolean localIndexing = post.get("localIndexing", "").equals("on");
boolean localIndexing = post.get("localIndexing", "off").equals("on");
env.setConfig("localIndexing", (localIndexing) ? "true" : "false");
boolean crawlOrder = post.get("crawlOrder", "").equals("on");
boolean crawlOrder = post.get("crawlOrder", "off").equals("on");
env.setConfig("crawlOrder", (crawlOrder) ? "true" : "false");
boolean xsstopw = post.get("xsstopw", "").equals("on");
boolean xsstopw = post.get("xsstopw", "off").equals("on");
env.setConfig("xsstopw", (xsstopw) ? "true" : "false");
boolean xdstopw = post.get("xdstopw", "").equals("on");
boolean xdstopw = post.get("xdstopw", "off").equals("on");
env.setConfig("xdstopw", (xdstopw) ? "true" : "false");
boolean xpstopw = post.get("xpstopw", "").equals("on");
boolean xpstopw = post.get("xpstopw", "off").equals("on");
env.setConfig("xpstopw", (xpstopw) ? "true" : "false");
String crawlingMode = post.get("crawlingMode","url");
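The re-crawl form above supplies a number plus a unit (year/month/day), while the code stores a single crawlingIfOlder value computed by recrawlIfOlderC, with -1 meaning the option is disabled and minutes apparently being the base unit (see the unit handling in the last hunk below). The actual recrawlIfOlderC method is not part of this diff; a rough sketch of such a conversion, under those assumptions, could look like this:

// Hypothetical sketch only: recrawlIfOlderC itself is not shown in this diff.
// Assumptions: the stored age is in minutes, and -1 means "re-crawl check disabled".
public class RecrawlAgeSketch {
    public static int toMinutes(boolean enabled, int number, String unit) {
        if (!enabled || number < 0) return -1;            // checkbox not set or no value given
        if ("year".equals(unit))  return number * 60 * 24 * 365;
        if ("month".equals(unit)) return number * 60 * 24 * 30;
        if ("day".equals(unit))   return number * 60 * 24;
        return number;                                     // treat anything else as minutes
    }
}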
@@ -154,7 +167,7 @@ public class IndexCreate_p {
switchboard.urlPool.errorURL.remove(urlhash);
// stack url
plasmaCrawlProfile.entry pe = switchboard.profiles.newEntry(crawlingStartURL.getHost(), crawlingStart, newcrawlingfilter, newcrawlingfilter, newcrawlingdepth, newcrawlingdepth, crawlingIfOlder, domFilterDepth, domMaxPages, crawlingQ, storeHTCache, true, localIndexing, crawlOrder, xsstopw, xdstopw, xpstopw);
plasmaCrawlProfile.entry pe = switchboard.profiles.newEntry(crawlingStartURL.getHost(), crawlingStart, newcrawlingfilter, newcrawlingfilter, newcrawlingdepth, newcrawlingdepth, crawlingIfOlder, crawlingDomFilterDepth, crawlingDomMaxPages, crawlingQ, storeHTCache, true, localIndexing, crawlOrder, xsstopw, xdstopw, xpstopw);
String reasonString = switchboard.sbStackCrawlThread.stackCrawl(crawlingStart, null, yacyCore.seedDB.mySeed.hash, "CRAWLING-ROOT", new Date(), 0, pe);
if (reasonString == null) {
@@ -215,7 +228,7 @@ public class IndexCreate_p {
HashMap hyperlinks = (HashMap) scraper.getAnchors();
// creating a crawler profile
plasmaCrawlProfile.entry profile = switchboard.profiles.newEntry(fileName, file.toURL().toString(), newcrawlingfilter, newcrawlingfilter, newcrawlingdepth, newcrawlingdepth, crawlingIfOlder, domFilterDepth, domMaxPages, crawlingQ, storeHTCache, true, localIndexing, crawlOrder, xsstopw, xdstopw, xpstopw);
plasmaCrawlProfile.entry profile = switchboard.profiles.newEntry(fileName, file.toURL().toString(), newcrawlingfilter, newcrawlingfilter, newcrawlingdepth, newcrawlingdepth, crawlingIfOlder, crawlingDomFilterDepth, crawlingDomMaxPages, crawlingQ, storeHTCache, true, localIndexing, crawlOrder, xsstopw, xdstopw, xpstopw);
// loop through the contained links
Iterator interator = hyperlinks.entrySet().iterator();
@@ -306,7 +319,7 @@ public class IndexCreate_p {
prop.put("crawlingFilter", env.getConfig("crawlingFilter", "0"));
int crawlingIfOlder = (int) env.getConfigLong("crawlingIfOlder", -1);
prop.put("crawlingIfOlderCheck", (crawlingIfOlder == Integer.MAX_VALUE) ? 0 : 1);
prop.put("crawlingIfOlderCheck", (crawlingIfOlder == -1) ? 0 : 1);
prop.put("crawlingIfOlderUnitYearCheck", 0);
prop.put("crawlingIfOlderUnitMonthCheck", 0);
prop.put("crawlingIfOlderUnitDayCheck", 0);
@@ -329,9 +342,12 @@ public class IndexCreate_p {
prop.put("crawlingIfOlderNumber", crawlingIfOlder);
prop.put("crawlingIfOlderUnitMinuteCheck", 1);
}
//prop.put("crawlingIfOlder", crawlingIfOlder);
prop.put("crawlingDomFilterDepth", env.getConfig("crawlingDomFilterDepth", "-1"));
prop.put("crawlingDomMaxPages", env.getConfig("crawlingDomMaxPages", "-1"));
int crawlingDomFilterDepth = (int) env.getConfigLong("crawlingDomFilterDepth", -1);
prop.put("crawlingDomFilterCheck", (crawlingDomFilterDepth == -1) ? 0 : 1);
prop.put("crawlingDomFilterDepth", (crawlingDomFilterDepth == -1) ? 1 : crawlingDomFilterDepth);
int crawlingDomMaxPages = (int) env.getConfigLong("crawlingDomMaxPages", -1);
prop.put("crawlingDomMaxCheck", (crawlingDomMaxPages == -1) ? 0 : 1);
prop.put("crawlingDomMaxPages", (crawlingDomMaxPages == -1) ? 10000 : crawlingDomMaxPages);
prop.put("crawlingQChecked", env.getConfig("crawlingQ", "").equals("true") ? 1 : 0);
prop.put("storeHTCacheChecked", env.getConfig("storeHTCache", "").equals("true") ? 1 : 0);
prop.put("localIndexingChecked", env.getConfig("localIndexing", "").equals("true") ? 1 : 0);
