changed recrawl

Use a specific date to define old documents.
This solves an unwanted recrawl loop during a running crawl.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5081 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
lotus 17 years ago
parent da1b0b2fc6
commit 480497f7c9

@@ -144,7 +144,7 @@ public class WatchCrawler_p {
final boolean crawlingIfOlderCheck = post.get("crawlingIfOlderCheck", "off").equals("on"); final boolean crawlingIfOlderCheck = post.get("crawlingIfOlderCheck", "off").equals("on");
final int crawlingIfOlderNumber = Integer.parseInt(post.get("crawlingIfOlderNumber", "-1")); final int crawlingIfOlderNumber = Integer.parseInt(post.get("crawlingIfOlderNumber", "-1"));
final String crawlingIfOlderUnit = post.get("crawlingIfOlderUnit","year"); final String crawlingIfOlderUnit = post.get("crawlingIfOlderUnit","year");
final int crawlingIfOlder = recrawlIfOlderC(crawlingIfOlderCheck, crawlingIfOlderNumber, crawlingIfOlderUnit); final long crawlingIfOlder = recrawlIfOlderC(crawlingIfOlderCheck, crawlingIfOlderNumber, crawlingIfOlderUnit);
env.setConfig("crawlingIfOlder", crawlingIfOlder); env.setConfig("crawlingIfOlder", crawlingIfOlder);
final boolean crawlingDomFilterCheck = post.get("crawlingDomFilterCheck", "off").equals("on"); final boolean crawlingDomFilterCheck = post.get("crawlingDomFilterCheck", "off").equals("on");
@@ -394,13 +394,13 @@ public class WatchCrawler_p {
return prop; return prop;
} }
private static int recrawlIfOlderC(final boolean recrawlIfOlderCheck, final int recrawlIfOlderNumber, final String crawlingIfOlderUnit) { private static long recrawlIfOlderC(final boolean recrawlIfOlderCheck, final int recrawlIfOlderNumber, final String crawlingIfOlderUnit) {
if (!recrawlIfOlderCheck) return -1; if (!recrawlIfOlderCheck) return 0L;
if (crawlingIfOlderUnit.equals("year")) return recrawlIfOlderNumber * 60 * 24 * 365; if (crawlingIfOlderUnit.equals("year")) return System.currentTimeMillis() - (long) recrawlIfOlderNumber * 1000L * 60L * 60L * 24L * 365L;
if (crawlingIfOlderUnit.equals("month")) return recrawlIfOlderNumber * 60 * 24 * 30; if (crawlingIfOlderUnit.equals("month")) return System.currentTimeMillis() - (long) recrawlIfOlderNumber * 1000L * 60L * 60L * 24L * 30L;
if (crawlingIfOlderUnit.equals("day")) return recrawlIfOlderNumber * 60 * 24; if (crawlingIfOlderUnit.equals("day")) return System.currentTimeMillis() - (long) recrawlIfOlderNumber * 1000L * 60L * 60L * 24L;
if (crawlingIfOlderUnit.equals("hour")) return recrawlIfOlderNumber * 60; if (crawlingIfOlderUnit.equals("hour")) return System.currentTimeMillis() - (long) recrawlIfOlderNumber * 1000L * 60L * 60L;
return recrawlIfOlderNumber; return System.currentTimeMillis() - (long) recrawlIfOlderNumber;
} }
private static void setPerformance(final plasmaSwitchboard sb, final serverObjects post) { private static void setPerformance(final plasmaSwitchboard sb, final serverObjects post) {

@@ -146,7 +146,7 @@ public class CrawlProfile {
public entry newEntry(final String name, final yacyURL startURL, final String generalFilter, final String specificFilter, public entry newEntry(final String name, final yacyURL startURL, final String generalFilter, final String specificFilter,
final int generalDepth, final int specificDepth, final int generalDepth, final int specificDepth,
final int recrawlIfOlder /*minutes*/, final int domFilterDepth, final int domMaxPages, final long recrawlIfOlder /*date*/, final int domFilterDepth, final int domMaxPages,
final boolean crawlingQ, final boolean crawlingQ,
final boolean indexText, final boolean indexMedia, final boolean indexText, final boolean indexMedia,
final boolean storeHTCache, final boolean storeTXCache, final boolean storeHTCache, final boolean storeTXCache,
@@ -244,7 +244,7 @@ public class CrawlProfile {
public entry(final String name, final yacyURL startURL, final String generalFilter, final String specificFilter, public entry(final String name, final yacyURL startURL, final String generalFilter, final String specificFilter,
final int generalDepth, final int specificDepth, final int generalDepth, final int specificDepth,
final int recrawlIfOlder /*minutes*/, final int domFilterDepth, final int domMaxPages, final long recrawlIfOlder /*date*/, final int domFilterDepth, final int domMaxPages,
final boolean crawlingQ, final boolean crawlingQ,
final boolean indexText, final boolean indexMedia, final boolean indexText, final boolean indexMedia,
final boolean storeHTCache, final boolean storeTXCache, final boolean storeHTCache, final boolean storeTXCache,
@@ -260,7 +260,7 @@ public class CrawlProfile {
mem.put(SPECIFIC_FILTER, (specificFilter == null) ? ".*" : specificFilter); mem.put(SPECIFIC_FILTER, (specificFilter == null) ? ".*" : specificFilter);
mem.put(GENERAL_DEPTH, Integer.toString(generalDepth)); mem.put(GENERAL_DEPTH, Integer.toString(generalDepth));
mem.put(SPECIFIC_DEPTH, Integer.toString(specificDepth)); mem.put(SPECIFIC_DEPTH, Integer.toString(specificDepth));
mem.put(RECRAWL_IF_OLDER, Integer.toString(recrawlIfOlder)); mem.put(RECRAWL_IF_OLDER, Long.toString(recrawlIfOlder));
mem.put(DOM_FILTER_DEPTH, Integer.toString(domFilterDepth)); mem.put(DOM_FILTER_DEPTH, Integer.toString(domFilterDepth));
mem.put(DOM_MAX_PAGES, Integer.toString(domMaxPages)); mem.put(DOM_MAX_PAGES, Integer.toString(domMaxPages));
mem.put(CRAWLING_Q, Boolean.toString(crawlingQ)); // crawling of urls with '?' mem.put(CRAWLING_Q, Boolean.toString(crawlingQ)); // crawling of urls with '?'
@@ -339,14 +339,14 @@ public class CrawlProfile {
} }
public long recrawlIfOlder() { public long recrawlIfOlder() {
// returns a long (millis) that is the minimum age that // returns a long (millis) that is the minimum age that
// an antry must have to be re-crawled // an entry must have to be re-crawled
final String r = mem.get(RECRAWL_IF_OLDER); final String r = mem.get(RECRAWL_IF_OLDER);
if (r == null) return Long.MAX_VALUE; if (r == null) return 0L;
try { try {
final long l = Long.parseLong(r) * 60000L; final long l = Long.parseLong(r);
return (l < 0) ? Long.MAX_VALUE : l; return (l < 0) ? 0L : l;
} catch (final NumberFormatException e) { } catch (final NumberFormatException e) {
return Long.MAX_VALUE; return 0L;
} }
} }
public int domFilterDepth() { public int domFilterDepth() {

@@ -452,7 +452,7 @@ public final class CrawlStacker extends Thread {
// check if the url is double registered // check if the url is double registered
final String dbocc = sb.crawlQueues.urlExists(entry.url().hash()); final String dbocc = sb.crawlQueues.urlExists(entry.url().hash());
final indexURLReference oldEntry = this.sb.webIndex.getURL(entry.url().hash(), null, 0); final indexURLReference oldEntry = this.sb.webIndex.getURL(entry.url().hash(), null, 0);
final boolean recrawl = (oldEntry != null) && ((System.currentTimeMillis() - oldEntry.loaddate().getTime()) > profile.recrawlIfOlder()); final boolean recrawl = (oldEntry != null) && (profile.recrawlIfOlder() > oldEntry.loaddate().getTime());
// do double-check // do double-check
if ((dbocc != null) && (!recrawl)) { if ((dbocc != null) && (!recrawl)) {
reason = ErrorURL.DOUBLE_REGISTERED + dbocc + ")"; reason = ErrorURL.DOUBLE_REGISTERED + dbocc + ")";

Loading…
Cancel
Save