changed handling of the crawl profile field crawlingIfOlder: this field should hold the date before which a URL is considered outdated. The field was partly misinterpreted and a time interval was stored in it instead. If all URLs in the index shall be treated as outdated, the field is now set to Long.MAX_VALUE, because every crawl date lies before that date and is therefore outdated.
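
As a minimal sketch of the new semantics (the helper isOutdated and all names in it are illustrative only, not taken from the YaCy sources): the field stores a cut-off date as epoch milliseconds, a document whose last crawl date lies before that cut-off is outdated, and Long.MAX_VALUE therefore marks every indexed document as outdated.

    // illustrative sketch, not YaCy code: date-based "outdated" check
    import java.util.Date;

    public class RecrawlCheckSketch {

        /** A document is outdated if its last crawl date lies before the
         *  recrawlIfOlder cut-off; with Long.MAX_VALUE every crawl date is
         *  before the cut-off, so all indexed documents count as outdated. */
        static boolean isOutdated(final Date lastCrawlDate, final long recrawlIfOlderMillis) {
            return lastCrawlDate.getTime() < recrawlIfOlderMillis;
        }

        public static void main(String[] args) {
            final Date crawledYesterday = new Date(System.currentTimeMillis() - 24L * 60L * 60L * 1000L);
            final long oneHourAgo = System.currentTimeMillis() - 60L * 60L * 1000L;
            System.out.println(isOutdated(crawledYesterday, oneHourAgo));     // true: crawled before the cut-off
            System.out.println(isOutdated(crawledYesterday, Long.MAX_VALUE)); // true: everything is outdated
            System.out.println(isOutdated(crawledYesterday, 0L));             // false: nothing is outdated
        }
    }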
Michael Peter Christen 11 years ago
parent 4eec1a7452
commit 542c20a597

@@ -210,8 +210,7 @@ public class Crawler_p {
         final boolean deleteage = restrictedcrawl && "age".equals(post.get("deleteold","off"));
         Date deleteageDate = null;
         if (deleteage) {
-            long t = timeParser(true, post.getInt("deleteIfOlderNumber", -1), post.get("deleteIfOlderUnit","year")); // year, month, day, hour
-            if (t > 0) deleteageDate = new Date(t);
+            deleteageDate = timeParser(true, post.getInt("deleteIfOlderNumber", -1), post.get("deleteIfOlderUnit","year")); // year, month, day, hour
         }
         final boolean deleteold = (deleteage && deleteageDate != null) || (restrictedcrawl && post.getBoolean("deleteold"));
@@ -289,11 +288,11 @@ public class Crawler_p {
         // recrawl
         final String recrawl = post.get("recrawl", "nodoubles"); // nodoubles, reload, scheduler
-        long crawlingIfOlder = 0;
+        Date crawlingIfOlder = null;
         if ("reload".equals(recrawl)) {
             crawlingIfOlder = timeParser(true, post.getInt("reloadIfOlderNumber", -1), post.get("reloadIfOlderUnit","year")); // year, month, day, hour
         }
-        env.setConfig("crawlingIfOlder", crawlingIfOlder);
+        env.setConfig("crawlingIfOlder", crawlingIfOlder == null ? Long.MAX_VALUE : crawlingIfOlder.getTime());
         // store this call as api call
         sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + ((rootURLs.size() == 0) ? post.get("crawlingFile", "") : rootURLs.iterator().next().toNormalform(true)));
@@ -672,14 +671,14 @@ public class Crawler_p {
         return prop;
     }
-    private static long timeParser(final boolean recrawlIfOlderCheck, final int number, final String unit) {
-        if (!recrawlIfOlderCheck) return 0L;
-        if ("year".equals(unit)) return System.currentTimeMillis() - number * 1000L * 60L * 60L * 24L * 365L;
-        if ("month".equals(unit)) return System.currentTimeMillis() - number * 1000L * 60L * 60L * 24L * 30L;
-        if ("day".equals(unit)) return System.currentTimeMillis() - number * 1000L * 60L * 60L * 24L;
-        if ("hour".equals(unit)) return System.currentTimeMillis() - number * 1000L * 60L * 60L;
-        if ("minute".equals(unit)) return System.currentTimeMillis() - number * 1000L * 60L;
-        return 0L;
+    private static Date timeParser(final boolean recrawlIfOlderCheck, final int number, final String unit) {
+        if (!recrawlIfOlderCheck) return null;
+        if ("year".equals(unit)) return new Date(System.currentTimeMillis() - number * 1000L * 60L * 60L * 24L * 365L);
+        if ("month".equals(unit)) return new Date(System.currentTimeMillis() - number * 1000L * 60L * 60L * 24L * 30L);
+        if ("day".equals(unit)) return new Date(System.currentTimeMillis() - number * 1000L * 60L * 60L * 24L);
+        if ("hour".equals(unit)) return new Date(System.currentTimeMillis() - number * 1000L * 60L * 60L);
+        if ("minute".equals(unit)) return new Date(System.currentTimeMillis() - number * 1000L * 60L);
+        return null;
     }
 }

@@ -146,7 +146,7 @@ public class QuickCrawlLink_p {
                 CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
                 CrawlingDepth,
                 true,
-                60 * 24 * 30, // recrawlIfOlder (minutes); here: one month
+                CrawlProfile.getRecrawlDate(60 * 24 * 30), // recrawlIfOlder (minutes); here: one month
                 -1, // domMaxPages, if negative: no count restriction
                 crawlingQ, followFrames,
                 obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,

@@ -315,7 +315,7 @@ public final class CrawlSwitchboard {
                 CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
                 0,
                 false,
-                -1,
+                null,
                 -1,
                 true, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
                 true,
@@ -522,7 +522,7 @@ public final class CrawlSwitchboard {
                 CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
                 0,
                 false,
-                System.currentTimeMillis(),
+                null,
                 -1,
                 true, true, false, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
                 true,

@@ -27,6 +27,7 @@ package net.yacy.crawler.data;
 import java.text.DateFormat;
 import java.util.Collection;
+import java.util.Date;
 import java.util.HashMap;
 import java.util.Iterator;
 import java.util.LinkedHashMap;
@@ -111,8 +112,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
      * @param indexContentMustNotMatch content which match this regex will be ignored for indexing
      * @param depth height of the tree which will be created by the crawler
      * @param directDocByURL if true, then linked documents that cannot be parsed are indexed as document
-     * @param recrawlIfOlder documents which have been indexed in the past will
-     *        be indexed again if they are older than the time (ms) in this parameter
+     * @param recrawlIfOlder documents which have been indexed in the past will be indexed again if they are older than the given date
      * @param domMaxPages maximum number from one domain which will be indexed
      * @param crawlingQ true if URLs containing questionmarks shall be indexed
      * @param indexText true if text content of URL shall be indexed
@@ -134,7 +134,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
                  final String indexContentMustMatch, final String indexContentMustNotMatch,
                  final int depth,
                  final boolean directDocByURL,
-                 final long recrawlIfOlder /*date*/,
+                 final Date recrawlIfOlder /*date*/,
                  final int domMaxPages,
                  final boolean crawlingQ, final boolean followFrames,
                  final boolean obeyHtmlRobotsNoindex, final boolean obeyHtmlRobotsNofollow,
@@ -167,7 +167,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
         put(INDEXING_CONTENT_MUSTNOTMATCH, (indexContentMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : indexContentMustNotMatch);
         put(DEPTH, depth);
         put(DIRECT_DOC_BY_URL, directDocByURL);
-        put(RECRAWL_IF_OLDER, recrawlIfOlder);
+        put(RECRAWL_IF_OLDER, recrawlIfOlder == null ? Long.MAX_VALUE : recrawlIfOlder.getTime());
         put(DOM_MAX_PAGES, domMaxPages);
         put(CRAWLING_Q, crawlingQ); // crawling of urls with '?'
         put(FOLLOW_FRAMES, followFrames); // load pages contained in frames or ifames
@@ -487,8 +487,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
     }
     /**
-     * Gets the minimum age that an entry must have to be re-crawled.
-     * @return time in ms
+     * Gets the minimum date that an entry must have to be re-crawled.
+     * @return time in ms representing a date
      */
     public long recrawlIfOlder() {
         // returns a long (millis) that is the minimum age that
@@ -566,8 +566,13 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
         return (r.equals(Boolean.TRUE.toString()));
     }
-    public static long getRecrawlDate(final long oldTimeMinutes) {
-        return System.currentTimeMillis() - (60000L * oldTimeMinutes);
+    /**
+     * get a recrawl date for a given age in minutes
+     * @param oldTimeMinutes
+     * @return a Date representing the recrawl date limit
+     */
+    public static Date getRecrawlDate(final long oldTimeMinutes) {
+        return new Date(System.currentTimeMillis() - (60000L * oldTimeMinutes));
     }
     public static String siteFilter(final Collection<? extends MultiProtocolURL> urls) {
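
For illustration, a small standalone usage sketch of the changed getRecrawlDate helper (it re-states the formula from the hunk above; the surrounding class and variable names are hypothetical, not part of the YaCy sources): a one-month cycle yields the date one month before now, documents indexed before that date are due for re-crawling, and the profile stores that date as epoch milliseconds under RECRAWL_IF_OLDER.

    // illustrative sketch, not YaCy code: working with a date-based recrawl limit
    import java.util.Date;

    public class GetRecrawlDateSketch {

        // same formula as CrawlProfile.getRecrawlDate in the hunk above
        static Date getRecrawlDate(final long oldTimeMinutes) {
            return new Date(System.currentTimeMillis() - (60000L * oldTimeMinutes));
        }

        public static void main(String[] args) {
            final Date limit = getRecrawlDate(60 * 24 * 30); // one month, as in QuickCrawlLink_p
            final Date indexedTwoMonthsAgo = new Date(System.currentTimeMillis() - 60L * 24L * 60L * 60000L);
            System.out.println(indexedTwoMonthsAgo.before(limit)); // true: due for re-crawling
            System.out.println(limit.getTime());                   // value stored under RECRAWL_IF_OLDER
        }
    }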

@@ -2092,31 +2092,31 @@ public final class Switchboard extends serverSwitch {
             }
             boolean insert = false;
             if ( selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_PROXY) ) {
-                selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_PROXY_RECRAWL_CYCLE)));
+                selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_PROXY_RECRAWL_CYCLE).getTime()));
                 insert = true;
             }
             if ( selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT) ) {
-                selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE)));
+                selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE).getTime()));
                 insert = true;
             }
             if ( selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT) ) {
-                selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE)));
+                selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE).getTime()));
                 insert = true;
             }
             if ( selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_GREEDY_LEARNING_TEXT) ) {
-                selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_GREEDY_LEARNING_TEXT_RECRAWL_CYCLE)));
+                selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_GREEDY_LEARNING_TEXT_RECRAWL_CYCLE).getTime()));
                 insert = true;
             }
             if ( selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA) ) {
-                selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE)));
+                selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE).getTime()));
                 insert = true;
             }
             if ( selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA) ) {
-                selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE)));
+                selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE).getTime()));
                 insert = true;
             }
             if ( selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SURROGATE) ) {
-                selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE)));
+                selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE).getTime()));
                 insert = true;
             }
             if ( insert ) {
