From 542c20a5972b843dadf2f9959940a12d79ae339e Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Tue, 22 Jul 2014 00:23:17 +0200 Subject: [PATCH] changed handling of crawl profile field crawlingIfOlder: this field should hold the date before which a url is considered outdated. The field was partly misinterpreted and a time interval was stored instead. If all urls in the index shall be treated as outdated, the field is now filled with Long.MAX_VALUE, because every crawl date is before that date and therefore outdated. --- htroot/Crawler_p.java | 23 +++++++++---------- htroot/QuickCrawlLink_p.java | 2 +- source/net/yacy/crawler/CrawlSwitchboard.java | 4 ++-- .../net/yacy/crawler/data/CrawlProfile.java | 21 ++++++++++------- source/net/yacy/search/Switchboard.java | 14 +++++------ 5 files changed, 34 insertions(+), 30 deletions(-) diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java index c83c4aba0..316c4e118 100644 --- a/htroot/Crawler_p.java +++ b/htroot/Crawler_p.java @@ -210,8 +210,7 @@ public class Crawler_p { final boolean deleteage = restrictedcrawl && "age".equals(post.get("deleteold","off")); Date deleteageDate = null; if (deleteage) { - long t = timeParser(true, post.getInt("deleteIfOlderNumber", -1), post.get("deleteIfOlderUnit","year")); // year, month, day, hour - if (t > 0) deleteageDate = new Date(t); + deleteageDate = timeParser(true, post.getInt("deleteIfOlderNumber", -1), post.get("deleteIfOlderUnit","year")); // year, month, day, hour } final boolean deleteold = (deleteage && deleteageDate != null) || (restrictedcrawl && post.getBoolean("deleteold")); @@ -289,11 +288,11 @@ public class Crawler_p { // recrawl final String recrawl = post.get("recrawl", "nodoubles"); // nodoubles, reload, scheduler - long crawlingIfOlder = 0; + Date crawlingIfOlder = null; if ("reload".equals(recrawl)) { crawlingIfOlder = timeParser(true, post.getInt("reloadIfOlderNumber", -1), post.get("reloadIfOlderUnit","year")); // 
year, month, day, hour } - env.setConfig("crawlingIfOlder", crawlingIfOlder); + env.setConfig("crawlingIfOlder", crawlingIfOlder == null ? Long.MAX_VALUE : crawlingIfOlder.getTime()); // store this call as api call sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + ((rootURLs.size() == 0) ? post.get("crawlingFile", "") : rootURLs.iterator().next().toNormalform(true))); @@ -672,14 +671,14 @@ public class Crawler_p { return prop; } - private static long timeParser(final boolean recrawlIfOlderCheck, final int number, final String unit) { - if (!recrawlIfOlderCheck) return 0L; - if ("year".equals(unit)) return System.currentTimeMillis() - number * 1000L * 60L * 60L * 24L * 365L; - if ("month".equals(unit)) return System.currentTimeMillis() - number * 1000L * 60L * 60L * 24L * 30L; - if ("day".equals(unit)) return System.currentTimeMillis() - number * 1000L * 60L * 60L * 24L; - if ("hour".equals(unit)) return System.currentTimeMillis() - number * 1000L * 60L * 60L; - if ("minute".equals(unit)) return System.currentTimeMillis() - number * 1000L * 60L; - return 0L; + private static Date timeParser(final boolean recrawlIfOlderCheck, final int number, final String unit) { + if (!recrawlIfOlderCheck) return null; + if ("year".equals(unit)) return new Date(System.currentTimeMillis() - number * 1000L * 60L * 60L * 24L * 365L); + if ("month".equals(unit)) return new Date(System.currentTimeMillis() - number * 1000L * 60L * 60L * 24L * 30L); + if ("day".equals(unit)) return new Date(System.currentTimeMillis() - number * 1000L * 60L * 60L * 24L); + if ("hour".equals(unit)) return new Date(System.currentTimeMillis() - number * 1000L * 60L * 60L); + if ("minute".equals(unit)) return new Date(System.currentTimeMillis() - number * 1000L * 60L); + return null; } } diff --git a/htroot/QuickCrawlLink_p.java b/htroot/QuickCrawlLink_p.java index 46fef42fc..756b0a4b9 100644 --- a/htroot/QuickCrawlLink_p.java +++ 
b/htroot/QuickCrawlLink_p.java @@ -146,7 +146,7 @@ public class QuickCrawlLink_p { CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch CrawlingDepth, true, - 60 * 24 * 30, // recrawlIfOlder (minutes); here: one month + CrawlProfile.getRecrawlDate(60 * 24 * 30), // recrawlIfOlder (minutes); here: one month -1, // domMaxPages, if negative: no count restriction crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow, diff --git a/source/net/yacy/crawler/CrawlSwitchboard.java b/source/net/yacy/crawler/CrawlSwitchboard.java index 69bf338ea..e37659830 100644 --- a/source/net/yacy/crawler/CrawlSwitchboard.java +++ b/source/net/yacy/crawler/CrawlSwitchboard.java @@ -315,7 +315,7 @@ public final class CrawlSwitchboard { CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch 0, false, - -1, + null, -1, true, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow, true, @@ -522,7 +522,7 @@ public final class CrawlSwitchboard { CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch 0, false, - System.currentTimeMillis(), + null, -1, true, true, false, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow, true, diff --git a/source/net/yacy/crawler/data/CrawlProfile.java b/source/net/yacy/crawler/data/CrawlProfile.java index 98c979758..6002bd9ff 100644 --- a/source/net/yacy/crawler/data/CrawlProfile.java +++ b/source/net/yacy/crawler/data/CrawlProfile.java @@ -27,6 +27,7 @@ package net.yacy.crawler.data; import java.text.DateFormat; import java.util.Collection; +import java.util.Date; import java.util.HashMap; import java.util.Iterator; import java.util.LinkedHashMap; @@ -111,8 +112,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M * @param indexContentMustNotMatch content which match this regex will be ignored for indexing * @param depth height of the tree which will be created by the crawler * @param directDocByURL if true, then linked documents that cannot be 
parsed are indexed as document - * @param recrawlIfOlder documents which have been indexed in the past will - * be indexed again if they are older than the time (ms) in this parameter + * @param recrawlIfOlder documents which have been indexed in the past will be indexed again if they are older than the given date * @param domMaxPages maximum number from one domain which will be indexed * @param crawlingQ true if URLs containing questionmarks shall be indexed * @param indexText true if text content of URL shall be indexed @@ -134,7 +134,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M final String indexContentMustMatch, final String indexContentMustNotMatch, final int depth, final boolean directDocByURL, - final long recrawlIfOlder /*date*/, + final Date recrawlIfOlder /*date*/, final int domMaxPages, final boolean crawlingQ, final boolean followFrames, final boolean obeyHtmlRobotsNoindex, final boolean obeyHtmlRobotsNofollow, @@ -167,7 +167,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M put(INDEXING_CONTENT_MUSTNOTMATCH, (indexContentMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : indexContentMustNotMatch); put(DEPTH, depth); put(DIRECT_DOC_BY_URL, directDocByURL); - put(RECRAWL_IF_OLDER, recrawlIfOlder); + put(RECRAWL_IF_OLDER, recrawlIfOlder == null ? Long.MAX_VALUE : recrawlIfOlder.getTime()); put(DOM_MAX_PAGES, domMaxPages); put(CRAWLING_Q, crawlingQ); // crawling of urls with '?' put(FOLLOW_FRAMES, followFrames); // load pages contained in frames or ifames @@ -487,8 +487,8 @@ public class CrawlProfile extends ConcurrentHashMap implements M } /** - * Gets the minimum age that an entry must have to be re-crawled. - * @return time in ms + * Gets the minimum date that an entry must have to be re-crawled. 
+ * @return time in ms representing a date */ public long recrawlIfOlder() { // returns a long (millis) that is the minimum age that @@ -566,8 +566,13 @@ public class CrawlProfile extends ConcurrentHashMap implements M return (r.equals(Boolean.TRUE.toString())); } - public static long getRecrawlDate(final long oldTimeMinutes) { - return System.currentTimeMillis() - (60000L * oldTimeMinutes); + /** + * get a recrawl date for a given age in minutes + * @param oldTimeMinutes + * @return a Date representing the recrawl date limit + */ + public static Date getRecrawlDate(final long oldTimeMinutes) { + return new Date(System.currentTimeMillis() - (60000L * oldTimeMinutes)); } public static String siteFilter(final Collection urls) { diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 2be7fbe4d..0afc77644 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -2092,31 +2092,31 @@ public final class Switchboard extends serverSwitch { } boolean insert = false; if ( selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_PROXY) ) { - selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_PROXY_RECRAWL_CYCLE))); + selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_PROXY_RECRAWL_CYCLE).getTime())); insert = true; } if ( selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT) ) { - selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE))); + selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE).getTime())); insert = true; } if ( selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT) ) { - selentry.put(CrawlProfile.RECRAWL_IF_OLDER, 
Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE))); + selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE).getTime())); insert = true; } if ( selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_GREEDY_LEARNING_TEXT) ) { - selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_GREEDY_LEARNING_TEXT_RECRAWL_CYCLE))); + selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_GREEDY_LEARNING_TEXT_RECRAWL_CYCLE).getTime())); insert = true; } if ( selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA) ) { - selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE))); + selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE).getTime())); insert = true; } if ( selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA) ) { - selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE))); + selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE).getTime())); insert = true; } if ( selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SURROGATE) ) { - selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE))); + selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE).getTime())); insert = true; } if ( insert ) {