changed recrawl

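Use a fixed cutoff date, computed once when the crawl is started, to decide which documents count as old (previously each document's age was compared against a time span).
This fixes an unwanted recrawl loop during a running crawl: documents loaded by the crawl itself are never older than the cutoff, so a long-running crawl no longer re-queues them.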

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5081 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
lotus 17 years ago
parent da1b0b2fc6
commit 480497f7c9

@ -144,7 +144,7 @@ public class WatchCrawler_p {
final boolean crawlingIfOlderCheck = post.get("crawlingIfOlderCheck", "off").equals("on");
final int crawlingIfOlderNumber = Integer.parseInt(post.get("crawlingIfOlderNumber", "-1"));
final String crawlingIfOlderUnit = post.get("crawlingIfOlderUnit","year");
final int crawlingIfOlder = recrawlIfOlderC(crawlingIfOlderCheck, crawlingIfOlderNumber, crawlingIfOlderUnit);
final long crawlingIfOlder = recrawlIfOlderC(crawlingIfOlderCheck, crawlingIfOlderNumber, crawlingIfOlderUnit);
env.setConfig("crawlingIfOlder", crawlingIfOlder);
final boolean crawlingDomFilterCheck = post.get("crawlingDomFilterCheck", "off").equals("on");
@ -394,13 +394,13 @@ public class WatchCrawler_p {
return prop;
}
private static int recrawlIfOlderC(final boolean recrawlIfOlderCheck, final int recrawlIfOlderNumber, final String crawlingIfOlderUnit) {
if (!recrawlIfOlderCheck) return -1;
if (crawlingIfOlderUnit.equals("year")) return recrawlIfOlderNumber * 60 * 24 * 365;
if (crawlingIfOlderUnit.equals("month")) return recrawlIfOlderNumber * 60 * 24 * 30;
if (crawlingIfOlderUnit.equals("day")) return recrawlIfOlderNumber * 60 * 24;
if (crawlingIfOlderUnit.equals("hour")) return recrawlIfOlderNumber * 60;
return recrawlIfOlderNumber;
/**
 * Computes the cutoff date for the "re-crawl if older than" crawl option.
 *
 * The result is an absolute point in time in milliseconds since the epoch
 * (current time minus the requested span), not a duration. The value 0L
 * means that no cutoff applies.
 *
 * @param recrawlIfOlderCheck  whether the option was switched on
 * @param recrawlIfOlderNumber number of units that make a document "old"
 * @param crawlingIfOlderUnit  "year" (365 days), "month" (30 days), "day" or "hour";
 *                             any other value is interpreted as minutes
 * @return 0L if the option is off, the number is negative, or the span reaches
 *         back beyond the epoch; otherwise now minus the requested span
 */
private static long recrawlIfOlderC(final boolean recrawlIfOlderCheck, final int recrawlIfOlderNumber, final String crawlingIfOlderUnit) {
    // A negative number would move the cutoff into the future; treat it like "switched off".
    if (!recrawlIfOlderCheck || recrawlIfOlderNumber < 0) return 0L;

    final long minuteMillis = 60L * 1000L;
    final long unitMillis;
    if ("year".equals(crawlingIfOlderUnit)) {
        unitMillis = minuteMillis * 60L * 24L * 365L;
    } else if ("month".equals(crawlingIfOlderUnit)) {
        unitMillis = minuteMillis * 60L * 24L * 30L;
    } else if ("day".equals(crawlingIfOlderUnit)) {
        unitMillis = minuteMillis * 60L * 24L;
    } else if ("hour".equals(crawlingIfOlderUnit)) {
        unitMillis = minuteMillis * 60L;
    } else {
        // Unrecognised unit: fall back to minutes instead of raw milliseconds.
        unitMillis = minuteMillis;
    }

    final long now = System.currentTimeMillis();
    // Guard against long overflow and against cutoffs before the epoch.
    if (recrawlIfOlderNumber > now / unitMillis) return 0L;
    return now - recrawlIfOlderNumber * unitMillis;
}
private static void setPerformance(final plasmaSwitchboard sb, final serverObjects post) {

@ -146,7 +146,7 @@ public class CrawlProfile {
public entry newEntry(final String name, final yacyURL startURL, final String generalFilter, final String specificFilter,
final int generalDepth, final int specificDepth,
final int recrawlIfOlder /*minutes*/, final int domFilterDepth, final int domMaxPages,
final long recrawlIfOlder /*date*/, final int domFilterDepth, final int domMaxPages,
final boolean crawlingQ,
final boolean indexText, final boolean indexMedia,
final boolean storeHTCache, final boolean storeTXCache,
@ -244,7 +244,7 @@ public class CrawlProfile {
public entry(final String name, final yacyURL startURL, final String generalFilter, final String specificFilter,
final int generalDepth, final int specificDepth,
final int recrawlIfOlder /*minutes*/, final int domFilterDepth, final int domMaxPages,
final long recrawlIfOlder /*date*/, final int domFilterDepth, final int domMaxPages,
final boolean crawlingQ,
final boolean indexText, final boolean indexMedia,
final boolean storeHTCache, final boolean storeTXCache,
@ -260,7 +260,7 @@ public class CrawlProfile {
mem.put(SPECIFIC_FILTER, (specificFilter == null) ? ".*" : specificFilter);
mem.put(GENERAL_DEPTH, Integer.toString(generalDepth));
mem.put(SPECIFIC_DEPTH, Integer.toString(specificDepth));
mem.put(RECRAWL_IF_OLDER, Integer.toString(recrawlIfOlder));
mem.put(RECRAWL_IF_OLDER, Long.toString(recrawlIfOlder));
mem.put(DOM_FILTER_DEPTH, Integer.toString(domFilterDepth));
mem.put(DOM_MAX_PAGES, Integer.toString(domMaxPages));
mem.put(CRAWLING_Q, Boolean.toString(crawlingQ)); // crawling of urls with '?'
@ -339,14 +339,14 @@ public class CrawlProfile {
}
public long recrawlIfOlder() {
// returns a long (millis) that is the minimum age that
// an antry must have to be re-crawled
// an entry must have to be re-crawled
// NOTE(review): this span comes from a commit view and appears to show the
// removed and the added lines of the change side by side.
// NOTE(review): the comment above still speaks of a "minimum age". The added
// lines parse the stored RECRAWL_IF_OLDER value without the former * 60000L
// scaling, and the caller shown in CrawlStacker compares the result directly
// with loaddate().getTime(), so the value looks like an absolute timestamp in
// milliseconds rather than an age - confirm and reword.
// In the added lines a missing, negative or unparsable value yields 0L
// (the removed lines yielded Long.MAX_VALUE in those cases).
final String r = mem.get(RECRAWL_IF_OLDER);
if (r == null) return Long.MAX_VALUE;
if (r == null) return 0L;
try {
final long l = Long.parseLong(r) * 60000L;
return (l < 0) ? Long.MAX_VALUE : l;
final long l = Long.parseLong(r);
return (l < 0) ? 0L : l;
} catch (final NumberFormatException e) {
return Long.MAX_VALUE;
return 0L;
}
}
public int domFilterDepth() {

@ -452,7 +452,7 @@ public final class CrawlStacker extends Thread {
// check if the url is double registered
final String dbocc = sb.crawlQueues.urlExists(entry.url().hash());
final indexURLReference oldEntry = this.sb.webIndex.getURL(entry.url().hash(), null, 0);
final boolean recrawl = (oldEntry != null) && ((System.currentTimeMillis() - oldEntry.loaddate().getTime()) > profile.recrawlIfOlder());
final boolean recrawl = (oldEntry != null) && (profile.recrawlIfOlder() > oldEntry.loaddate().getTime());
// do double-check
if ((dbocc != null) && (!recrawl)) {
reason = ErrorURL.DOUBLE_REGISTERED + dbocc + ")";

Loading…
Cancel
Save