changed handling of the crawl profile field crawlingIfOlder: this shall be
filled with the date at which a URL is recognized as outdated. The field
was partly misinterpreted and a time interval was filled in instead. In
case all URLs in the index shall be treated as outdated, the field is now
filled with Long.MAX_VALUE, because then all crawl dates lie before that
date and are therefore outdated.
Michael Peter Christen 11 years ago
parent 4eec1a7452
commit 542c20a597
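A minimal sketch of the semantics this commit establishes (variable names hypothetical, not part of the patch): a document counts as outdated, and is therefore recrawled, when its last crawl date lies before the crawlingIfOlder date, so Long.MAX_VALUE marks every indexed document as outdated.

    import java.util.Date;

    public class RecrawlSketch {
        public static void main(String[] args) {
            // "treat all urls in the index as outdated"
            Date crawlingIfOlder = new Date(Long.MAX_VALUE);
            // a document crawled one day ago (example value)
            Date lastCrawl = new Date(System.currentTimeMillis() - 24L * 60L * 60L * 1000L);
            // every real crawl date lies before Long.MAX_VALUE, hence outdated
            boolean outdated = lastCrawl.before(crawlingIfOlder);
            System.out.println("outdated = " + outdated); // prints: outdated = true
        }
    }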

@@ -210,8 +210,7 @@ public class Crawler_p {
         final boolean deleteage = restrictedcrawl && "age".equals(post.get("deleteold","off"));
         Date deleteageDate = null;
         if (deleteage) {
-            long t = timeParser(true, post.getInt("deleteIfOlderNumber", -1), post.get("deleteIfOlderUnit","year")); // year, month, day, hour
-            if (t > 0) deleteageDate = new Date(t);
+            deleteageDate = timeParser(true, post.getInt("deleteIfOlderNumber", -1), post.get("deleteIfOlderUnit","year")); // year, month, day, hour
         }
         final boolean deleteold = (deleteage && deleteageDate != null) || (restrictedcrawl && post.getBoolean("deleteold"));
@@ -289,11 +288,11 @@ public class Crawler_p {
         // recrawl
         final String recrawl = post.get("recrawl", "nodoubles"); // nodoubles, reload, scheduler
-        long crawlingIfOlder = 0;
+        Date crawlingIfOlder = null;
         if ("reload".equals(recrawl)) {
             crawlingIfOlder = timeParser(true, post.getInt("reloadIfOlderNumber", -1), post.get("reloadIfOlderUnit","year")); // year, month, day, hour
         }
-        env.setConfig("crawlingIfOlder", crawlingIfOlder);
+        env.setConfig("crawlingIfOlder", crawlingIfOlder == null ? Long.MAX_VALUE : crawlingIfOlder.getTime());

         // store this call as api call
         sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + ((rootURLs.size() == 0) ? post.get("crawlingFile", "") : rootURLs.iterator().next().toNormalform(true)));
@@ -672,14 +671,14 @@ public class Crawler_p {
         return prop;
     }

-    private static long timeParser(final boolean recrawlIfOlderCheck, final int number, final String unit) {
-        if (!recrawlIfOlderCheck) return 0L;
-        if ("year".equals(unit)) return System.currentTimeMillis() - number * 1000L * 60L * 60L * 24L * 365L;
-        if ("month".equals(unit)) return System.currentTimeMillis() - number * 1000L * 60L * 60L * 24L * 30L;
-        if ("day".equals(unit)) return System.currentTimeMillis() - number * 1000L * 60L * 60L * 24L;
-        if ("hour".equals(unit)) return System.currentTimeMillis() - number * 1000L * 60L * 60L;
-        if ("minute".equals(unit)) return System.currentTimeMillis() - number * 1000L * 60L;
-        return 0L;
+    private static Date timeParser(final boolean recrawlIfOlderCheck, final int number, final String unit) {
+        if (!recrawlIfOlderCheck) return null;
+        if ("year".equals(unit)) return new Date(System.currentTimeMillis() - number * 1000L * 60L * 60L * 24L * 365L);
+        if ("month".equals(unit)) return new Date(System.currentTimeMillis() - number * 1000L * 60L * 60L * 24L * 30L);
+        if ("day".equals(unit)) return new Date(System.currentTimeMillis() - number * 1000L * 60L * 60L * 24L);
+        if ("hour".equals(unit)) return new Date(System.currentTimeMillis() - number * 1000L * 60L * 60L);
+        if ("minute".equals(unit)) return new Date(System.currentTimeMillis() - number * 1000L * 60L);
+        return null;
     }
 }
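A hedged usage sketch for the reworked timeParser (the method is private to Crawler_p, so this only mirrors how the handler above calls it): asking for documents older than seven days yields a cut-off Date seven days in the past, while a disabled check yields null, which the caller maps to Long.MAX_VALUE.

    // assumed to run inside Crawler_p, where timeParser and env are visible
    Date cutoff = timeParser(true, 7, "day");     // now minus seven days
    Date disabled = timeParser(false, 7, "day");  // null: no age restriction
    env.setConfig("crawlingIfOlder", disabled == null ? Long.MAX_VALUE : disabled.getTime());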

@@ -146,7 +146,7 @@ public class QuickCrawlLink_p {
                 CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
                 CrawlingDepth,
                 true,
-                60 * 24 * 30, // recrawlIfOlder (minutes); here: one month
+                CrawlProfile.getRecrawlDate(60 * 24 * 30), // recrawlIfOlder (minutes); here: one month
                 -1, // domMaxPages, if negative: no count restriction
                 crawlingQ, followFrames,
                 obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,

@@ -315,7 +315,7 @@ public final class CrawlSwitchboard {
                 CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
                 0,
                 false,
-                -1,
+                null,
                 -1,
                 true, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
                 true,
@@ -522,7 +522,7 @@ public final class CrawlSwitchboard {
                 CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
                 0,
                 false,
-                System.currentTimeMillis(),
+                null,
                 -1,
                 true, true, false, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
                 true,

@@ -27,6 +27,7 @@ package net.yacy.crawler.data;
 import java.text.DateFormat;
 import java.util.Collection;
+import java.util.Date;
 import java.util.HashMap;
 import java.util.Iterator;
 import java.util.LinkedHashMap;
@@ -111,8 +112,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
      * @param indexContentMustNotMatch content which match this regex will be ignored for indexing
      * @param depth height of the tree which will be created by the crawler
      * @param directDocByURL if true, then linked documents that cannot be parsed are indexed as document
-     * @param recrawlIfOlder documents which have been indexed in the past will
-     *        be indexed again if they are older than the time (ms) in this parameter
+     * @param recrawlIfOlder documents which have been indexed in the past will be indexed again if they are older than the given date
      * @param domMaxPages maximum number from one domain which will be indexed
      * @param crawlingQ true if URLs containing questionmarks shall be indexed
      * @param indexText true if text content of URL shall be indexed
@@ -134,7 +134,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
                  final String indexContentMustMatch, final String indexContentMustNotMatch,
                  final int depth,
                  final boolean directDocByURL,
-                 final long recrawlIfOlder /*date*/,
+                 final Date recrawlIfOlder /*date*/,
                  final int domMaxPages,
                  final boolean crawlingQ, final boolean followFrames,
                  final boolean obeyHtmlRobotsNoindex, final boolean obeyHtmlRobotsNofollow,
@@ -167,7 +167,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
         put(INDEXING_CONTENT_MUSTNOTMATCH, (indexContentMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : indexContentMustNotMatch);
         put(DEPTH, depth);
         put(DIRECT_DOC_BY_URL, directDocByURL);
-        put(RECRAWL_IF_OLDER, recrawlIfOlder);
+        put(RECRAWL_IF_OLDER, recrawlIfOlder == null ? Long.MAX_VALUE : recrawlIfOlder.getTime());
         put(DOM_MAX_PAGES, domMaxPages);
         put(CRAWLING_Q, crawlingQ); // crawling of urls with '?'
         put(FOLLOW_FRAMES, followFrames); // load pages contained in frames or ifames
@@ -487,8 +487,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
     }

     /**
-     * Gets the minimum age that an entry must have to be re-crawled.
-     * @return time in ms
+     * Gets the minimum date that an entry must have to be re-crawled.
+     * @return time in ms representing a date
      */
     public long recrawlIfOlder() {
         // returns a long (millis) that is the minimum age that
@@ -566,8 +566,13 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
         return (r.equals(Boolean.TRUE.toString()));
     }

-    public static long getRecrawlDate(final long oldTimeMinutes) {
-        return System.currentTimeMillis() - (60000L * oldTimeMinutes);
+    /**
+     * get a recrawl date for a given age in minutes
+     * @param oldTimeMinutes
+     * @return a Date representing the recrawl date limit
+     */
+    public static Date getRecrawlDate(final long oldTimeMinutes) {
+        return new Date(System.currentTimeMillis() - (60000L * oldTimeMinutes));
     }

     public static String siteFilter(final Collection<? extends MultiProtocolURL> urls) {
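For the new getRecrawlDate, the arithmetic from the QuickCrawlLink_p call above works out as follows (a sketch, assuming net.yacy.crawler.data.CrawlProfile is on the classpath):

    // 60 * 24 * 30 = 43200 minutes, i.e. 30 days
    java.util.Date oneMonthAgo = CrawlProfile.getRecrawlDate(60 * 24 * 30);
    long stored = oneMonthAgo.getTime(); // the millis value written into RECRAWL_IF_OLDER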

@@ -2092,31 +2092,31 @@ public final class Switchboard extends serverSwitch {
                 }
                 boolean insert = false;
                 if ( selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_PROXY) ) {
-                    selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_PROXY_RECRAWL_CYCLE)));
+                    selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_PROXY_RECRAWL_CYCLE).getTime()));
                     insert = true;
                 }
                 if ( selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT) ) {
-                    selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE)));
+                    selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE).getTime()));
                     insert = true;
                 }
                 if ( selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT) ) {
-                    selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE)));
+                    selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE).getTime()));
                     insert = true;
                 }
                 if ( selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_GREEDY_LEARNING_TEXT) ) {
-                    selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_GREEDY_LEARNING_TEXT_RECRAWL_CYCLE)));
+                    selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_GREEDY_LEARNING_TEXT_RECRAWL_CYCLE).getTime()));
                     insert = true;
                 }
                 if ( selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA) ) {
-                    selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE)));
+                    selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE).getTime()));
                     insert = true;
                 }
                 if ( selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA) ) {
-                    selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE)));
+                    selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE).getTime()));
                     insert = true;
                 }
                 if ( selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SURROGATE) ) {
-                    selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE)));
+                    selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE).getTime()));
                     insert = true;
                 }
                 if ( insert ) {
