enhanced re-crawl settings

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1960 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 19 years ago
parent 708cc6c8d9
commit 0c9b61820e

@ -45,9 +45,18 @@ You can define URLs as start points for Web page crawling and start crawling her
</tr>
<tr valign="top" class="TableCellLight">
<td class=small>Re-Crawl Option:</td>
<td class=small><input name="crawlingIfOlder" type="text" size="7" maxlength="7" value="#[crawlingIfOlder]#"></td>
<td class=small>
<input type="checkbox" name="crawlingIfOlderCheck" align="top" #(crawlingIfOlderCheck)#::checked#(/crawlingIfOlderCheck)#>
<input name="crawlingIfOlderNumber" type="text" size="7" maxlength="7" value="#[crawlingIfOlderNumber]#"></td>
Year(s)&nbsp;<input type="radio" name="crawlingIfOlderUnit" value="year" #(crawlingIfOlderUnitYearCheck)#::checked#(/crawlingIfOlderUnitYearCheck)#>
Month(s)&nbsp;<input type="radio" name="crawlingIfOlderUnit" value="month" #(crawlingIfOlderUnitMonthCheck)#::checked#(/crawlingIfOlderUnitMonthCheck)#>
Day(s)&nbsp;<input type="radio" name="crawlingIfOlderUnit" value="day" #(crawlingIfOlderUnitDayCheck)#::checked#(/crawlingIfOlderUnitDayCheck)#>
Hour(s)&nbsp;<input type="radio" name="crawlingIfOlderUnit" value="hour" #(crawlingIfOlderUnitHourCheck)#::checked#(/crawlingIfOlderUnitHourCheck)#>
Minute(s)&nbsp;<input type="radio" name="crawlingIfOlderUnit" value="minute" #(crawlingIfOlderUnitMinuteCheck)#::checked#(/crawlingIfOlderUnitMinuteCheck)#>
<td class=small>
If you use this option, web pages that already exist in your database are crawled and indexed again.
Whether this is done depends on the age of the last crawl: if the last crawl is older than the given
date, the page is crawled again; otherwise it is treated as 'double' and not loaded or indexed again.
</td>
</tr>
<tr valign="top" class="TableCellDark">

@ -95,8 +95,11 @@ public class IndexCreate_p {
env.setConfig("crawlingFilter", newcrawlingfilter);
int newcrawlingdepth = Integer.parseInt(post.get("crawlingDepth", "0"));
env.setConfig("crawlingDepth", Integer.toString(newcrawlingdepth));
int recrawlIfOlder = Integer.parseInt(post.get("crawlingIfOlder", "-1"));
env.setConfig("crawlingIfOlder", recrawlIfOlder);
boolean crawlingIfOlderCheck = post.get("crawlingIfOlderCheck", "").equals("on");
int crawlingIfOlderNumber = Integer.parseInt(post.get("crawlingIfOlderNumber", "-1"));
String crawlingIfOlderUnit = post.get("crawlingIfOlderUnit","year");
int crawlingIfOlder = recrawlIfOlderC(crawlingIfOlderCheck, crawlingIfOlderNumber, crawlingIfOlderUnit);
env.setConfig("crawlingIfOlder", crawlingIfOlder);
int domFilterDepth = Integer.parseInt(post.get("crawlingDomFilterDepth", "-1"));
env.setConfig("crawlingDomFilterDepth", Integer.toString(domFilterDepth));
int domMaxPages = Integer.parseInt(post.get("crawlingDomMaxPages", "-1"));
@ -151,7 +154,7 @@ public class IndexCreate_p {
switchboard.urlPool.errorURL.remove(urlhash);
// stack url
plasmaCrawlProfile.entry pe = switchboard.profiles.newEntry(crawlingStartURL.getHost(), crawlingStart, newcrawlingfilter, newcrawlingfilter, newcrawlingdepth, newcrawlingdepth, recrawlIfOlder, domFilterDepth, domMaxPages, crawlingQ, storeHTCache, true, localIndexing, crawlOrder, xsstopw, xdstopw, xpstopw);
plasmaCrawlProfile.entry pe = switchboard.profiles.newEntry(crawlingStartURL.getHost(), crawlingStart, newcrawlingfilter, newcrawlingfilter, newcrawlingdepth, newcrawlingdepth, crawlingIfOlder, domFilterDepth, domMaxPages, crawlingQ, storeHTCache, true, localIndexing, crawlOrder, xsstopw, xdstopw, xpstopw);
String reasonString = switchboard.sbStackCrawlThread.stackCrawl(crawlingStart, null, yacyCore.seedDB.mySeed.hash, "CRAWLING-ROOT", new Date(), 0, pe);
if (reasonString == null) {
@ -212,7 +215,7 @@ public class IndexCreate_p {
HashMap hyperlinks = (HashMap) scraper.getAnchors();
// creating a crawler profile
plasmaCrawlProfile.entry profile = switchboard.profiles.newEntry(fileName, file.toURL().toString(), newcrawlingfilter, newcrawlingfilter, newcrawlingdepth, newcrawlingdepth, recrawlIfOlder, domFilterDepth, domMaxPages, crawlingQ, storeHTCache, true, localIndexing, crawlOrder, xsstopw, xdstopw, xpstopw);
plasmaCrawlProfile.entry profile = switchboard.profiles.newEntry(fileName, file.toURL().toString(), newcrawlingfilter, newcrawlingfilter, newcrawlingdepth, newcrawlingdepth, crawlingIfOlder, domFilterDepth, domMaxPages, crawlingQ, storeHTCache, true, localIndexing, crawlOrder, xsstopw, xdstopw, xpstopw);
// loop through the contained links
Iterator interator = hyperlinks.entrySet().iterator();
@ -301,7 +304,32 @@ public class IndexCreate_p {
prop.put("proxyPrefetchDepth", env.getConfig("proxyPrefetchDepth", "0"));
prop.put("crawlingDepth", env.getConfig("crawlingDepth", "0"));
prop.put("crawlingFilter", env.getConfig("crawlingFilter", "0"));
prop.put("crawlingIfOlder", env.getConfig("crawlingIfOlder", "-1"));
int crawlingIfOlder = (int) env.getConfigLong("crawlingIfOlder", -1);
prop.put("crawlingIfOlderCheck", (crawlingIfOlder == Integer.MAX_VALUE) ? 0 : 1);
prop.put("crawlingIfOlderUnitYearCheck", 0);
prop.put("crawlingIfOlderUnitMonthCheck", 0);
prop.put("crawlingIfOlderUnitDayCheck", 0);
prop.put("crawlingIfOlderUnitHourCheck", 0);
prop.put("crawlingIfOlderUnitMinuteCheck", 0);
if (crawlingIfOlder == Integer.MAX_VALUE) {
} else if (crawlingIfOlder >= 60*24*365) {
prop.put("crawlingIfOlderNumber", crawlingIfOlder / 60*24*365);
prop.put("crawlingIfOlderUnitYearCheck", 1);
} else if (crawlingIfOlder >= 60*24*30) {
prop.put("crawlingIfOlderNumber", crawlingIfOlder / 60*24*30);
prop.put("crawlingIfOlderUnitMonthCheck", 1);
} else if (crawlingIfOlder >= 60*24) {
prop.put("crawlingIfOlderNumber", crawlingIfOlder / 60*24);
prop.put("crawlingIfOlderUnitDayCheck", 1);
} else if (crawlingIfOlder >= 60) {
prop.put("crawlingIfOlderNumber", crawlingIfOlder / 60);
prop.put("crawlingIfOlderUnitHourCheck", 1);
} else {
prop.put("crawlingIfOlderNumber", crawlingIfOlder);
prop.put("crawlingIfOlderUnitMinuteCheck", 1);
}
//prop.put("crawlingIfOlder", crawlingIfOlder);
prop.put("crawlingDomFilterDepth", env.getConfig("crawlingDomFilterDepth", "-1"));
prop.put("crawlingDomMaxPages", env.getConfig("crawlingDomMaxPages", "-1"));
prop.put("crawlingQChecked", env.getConfig("crawlingQ", "").equals("true") ? 1 : 0);
@ -476,7 +504,16 @@ public class IndexCreate_p {
// return rewrite properties
return prop;
}
/**
 * Converts the re-crawl age entered in the crawl start form (a number plus
 * a time unit) into a single age value expressed in minutes.
 *
 * @param recrawlIfOlderCheck  whether the re-crawl option is switched on
 * @param recrawlIfOlderNumber the number of time units entered
 * @param crawlingIfOlderUnit  the selected unit: "year", "month", "day",
 *                             "hour" or "minute"
 * @return the age in minutes, saturated to the int range; -1 if the option
 *         is switched off or the unit is null or not one of the known values
 */
private static int recrawlIfOlderC(boolean recrawlIfOlderCheck, int recrawlIfOlderNumber, String crawlingIfOlderUnit) {
    if (!recrawlIfOlderCheck) return -1;

    // constant-first equals() so that a missing (null) unit takes the
    // "unknown unit" path instead of throwing a NullPointerException
    final long minutesPerUnit;
    if ("year".equals(crawlingIfOlderUnit)) {
        minutesPerUnit = 60L * 24L * 365L; // 365 days; the former 356 was a digit transposition
    } else if ("month".equals(crawlingIfOlderUnit)) {
        minutesPerUnit = 60L * 24L * 30L;
    } else if ("day".equals(crawlingIfOlderUnit)) {
        minutesPerUnit = 60L * 24L;
    } else if ("hour".equals(crawlingIfOlderUnit)) {
        minutesPerUnit = 60L;
    } else if ("minute".equals(crawlingIfOlderUnit)) {
        minutesPerUnit = 1L;
    } else {
        return -1;
    }

    // multiply in long and saturate: a large number of years would
    // otherwise silently wrap around in int arithmetic
    final long minutes = recrawlIfOlderNumber * minutesPerUnit;
    if (minutes > Integer.MAX_VALUE) return Integer.MAX_VALUE;
    if (minutes < Integer.MIN_VALUE) return Integer.MIN_VALUE;
    return (int) minutes;
}
}

@ -37,13 +37,7 @@
<tr><td class="MenuItem">&nbsp;<img border="0" src="/env/grafics/lock.gif" align="top">&nbsp;<a href="/IndexCreate_p.html" class="MenuItemLink">Index Create</a></td></tr>
<tr><td class="MenuItem">&nbsp;<img border="0" src="/env/grafics/lock.gif" align="top">&nbsp;<a href="/IndexControl_p.html" class="MenuItemLink">Index Control</a></td></tr>
<tr><td class="MenuItem">&nbsp;<a href="/IndexMonitor.html" class="MenuItemLink">Index Monitor</a></td></tr>
<tr><td class="MenuSpacer"></td></tr>
<tr><td class="MenuHeader">&nbsp;Local Proxy</td></tr>
<tr><td class="MenuItem">&nbsp;<img border="0" src="/env/grafics/lock.gif" align="top">&nbsp;<a href="/Blacklist_p.html" class="MenuItemLink">Blacklist</a></td></tr>
<tr><td class="MenuItem">&nbsp;<img border="0" src="/env/grafics/lock.gif" align="top">&nbsp;<a href="/ProxyIndexingMonitor_p.html" class="MenuItemLink">Proxy Indexing</a></td></tr>
<tr><td class="MenuItem">&nbsp;<img border="0" src="/env/grafics/lock.gif" align="top">&nbsp;<a href="/CacheAdmin_p.html" class="MenuItemLink">Cache Monitor</a></td></tr>
<tr><td class="MenuItem">&nbsp;<img border="0" src="/env/grafics/lock.gif" align="top">&nbsp;<a href="/CookieMonitorIncoming_p.html" class="MenuItemLink">Cookie Monitor</a></td></tr>
<tr><td class="MenuSpacer"></td></tr>
<tr><td class="MenuHeader">&nbsp;Communication / Publication</td></tr>
@ -65,6 +59,12 @@
<tr><td class="MenuItem">&nbsp;<img border="0" src="/env/grafics/lock.gif" align="top">&nbsp;<a href="/Connections_p.html" class="MenuItemLink">Connections</a></td></tr>
<tr><td class="MenuSpacer"></td></tr>
<tr><td class="MenuHeader">&nbsp;Local Proxy</td></tr>
<tr><td class="MenuItem">&nbsp;<img border="0" src="/env/grafics/lock.gif" align="top">&nbsp;<a href="/ProxyIndexingMonitor_p.html" class="MenuItemLink">Proxy Indexing</a></td></tr>
<tr><td class="MenuItem">&nbsp;<img border="0" src="/env/grafics/lock.gif" align="top">&nbsp;<a href="/CacheAdmin_p.html" class="MenuItemLink">Cache Monitor</a></td></tr>
<tr><td class="MenuItem">&nbsp;<img border="0" src="/env/grafics/lock.gif" align="top">&nbsp;<a href="/CookieMonitorIncoming_p.html" class="MenuItemLink">Cookie Monitor</a></td></tr>
<tr><td class="MenuSpacer"></td></tr>
<tr><td class="MenuHeader">&nbsp;The Project</td></tr>
<tr><td class="MenuItem">&nbsp;<a href="http://www.yacy.net/yacy/" class="MenuItemLink">Project Home</a></td></tr>
<tr><td class="MenuItem">&nbsp;<a href="http://www.yacy.net/yacy/News.html" class="MenuItemLink">Project News</a></td></tr>

@ -138,7 +138,7 @@ public final class yacy {
private static float version = (float) 0.1;
private static final String vDATE = "@REPL_DATE@";
private static final String copyright = "[ YACY Proxy v" + vString + ", build " + vDATE + " by Michael Christen / www.yacy.net ]";
private static final String copyright = "[ YaCy v" + vString + ", build " + vDATE + " by Michael Christen / www.yacy.net ]";
private static final String hline = "-------------------------------------------------------------------------------";
/**
@ -163,10 +163,10 @@ public final class yacy {
}
/**
* Combines the version of the proxy with the versionnumber from SVN to a
* Combines the version of YaCy with the versionnumber from SVN to a
* combined version
*
* @param version Current given version for this proxy.
* @param version Current given version.
* @param svn Current version given from svn.
* @return String with the combined version
*/

Loading…
Cancel
Save