diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java index c83c4aba0..316c4e118 100644 --- a/htroot/Crawler_p.java +++ b/htroot/Crawler_p.java @@ -210,8 +210,7 @@ public class Crawler_p { final boolean deleteage = restrictedcrawl && "age".equals(post.get("deleteold","off")); Date deleteageDate = null; if (deleteage) { - long t = timeParser(true, post.getInt("deleteIfOlderNumber", -1), post.get("deleteIfOlderUnit","year")); // year, month, day, hour - if (t > 0) deleteageDate = new Date(t); + deleteageDate = timeParser(true, post.getInt("deleteIfOlderNumber", -1), post.get("deleteIfOlderUnit","year")); // year, month, day, hour } final boolean deleteold = (deleteage && deleteageDate != null) || (restrictedcrawl && post.getBoolean("deleteold")); @@ -289,11 +288,11 @@ public class Crawler_p { // recrawl final String recrawl = post.get("recrawl", "nodoubles"); // nodoubles, reload, scheduler - long crawlingIfOlder = 0; + Date crawlingIfOlder = null; if ("reload".equals(recrawl)) { crawlingIfOlder = timeParser(true, post.getInt("reloadIfOlderNumber", -1), post.get("reloadIfOlderUnit","year")); // year, month, day, hour } - env.setConfig("crawlingIfOlder", crawlingIfOlder); + env.setConfig("crawlingIfOlder", crawlingIfOlder == null ? Long.MAX_VALUE : crawlingIfOlder.getTime()); // store this call as api call sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + ((rootURLs.size() == 0) ? post.get("crawlingFile", "") : rootURLs.iterator().next().toNormalform(true))); @@ -672,14 +671,14 @@ public class Crawler_p { return prop; } - private static long timeParser(final boolean recrawlIfOlderCheck, final int number, final String unit) { - if (!recrawlIfOlderCheck) return 0L; - if ("year".equals(unit)) return System.currentTimeMillis() - number * 1000L * 60L * 60L * 24L * 365L; - if ("month".equals(unit)) return System.currentTimeMillis() - number * 1000L * 60L * 60L * 24L * 30L; - if ("day".equals(unit)) return System.currentTimeMillis() - number * 1000L * 60L * 60L * 24L; - if ("hour".equals(unit)) return System.currentTimeMillis() - number * 1000L * 60L * 60L; - if ("minute".equals(unit)) return System.currentTimeMillis() - number * 1000L * 60L; - return 0L; + private static Date timeParser(final boolean recrawlIfOlderCheck, final int number, final String unit) { + if (!recrawlIfOlderCheck) return null; + if ("year".equals(unit)) return new Date(System.currentTimeMillis() - number * 1000L * 60L * 60L * 24L * 365L); + if ("month".equals(unit)) return new Date(System.currentTimeMillis() - number * 1000L * 60L * 60L * 24L * 30L); + if ("day".equals(unit)) return new Date(System.currentTimeMillis() - number * 1000L * 60L * 60L * 24L); + if ("hour".equals(unit)) return new Date(System.currentTimeMillis() - number * 1000L * 60L * 60L); + if ("minute".equals(unit)) return new Date(System.currentTimeMillis() - number * 1000L * 60L); + return null; } } diff --git a/htroot/QuickCrawlLink_p.java b/htroot/QuickCrawlLink_p.java index 46fef42fc..756b0a4b9 100644 --- a/htroot/QuickCrawlLink_p.java +++ b/htroot/QuickCrawlLink_p.java @@ -146,7 +146,7 @@ public class QuickCrawlLink_p { CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch CrawlingDepth, true, - 60 * 24 * 30, // recrawlIfOlder (minutes); here: one month + CrawlProfile.getRecrawlDate(60 * 24 * 30), // recrawlIfOlder (minutes); here: one month -1, // domMaxPages, if negative: no count restriction crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow, diff --git a/source/net/yacy/crawler/CrawlSwitchboard.java b/source/net/yacy/crawler/CrawlSwitchboard.java index 69bf338ea..e37659830 100644 --- a/source/net/yacy/crawler/CrawlSwitchboard.java +++ b/source/net/yacy/crawler/CrawlSwitchboard.java @@ -315,7 +315,7 @@ public final class CrawlSwitchboard { CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch 0, false, - -1, + null, -1, true, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow, true, @@ -522,7 +522,7 @@ public final class CrawlSwitchboard { CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch 0, false, - System.currentTimeMillis(), + null, -1, true, true, false, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow, true, diff --git a/source/net/yacy/crawler/data/CrawlProfile.java b/source/net/yacy/crawler/data/CrawlProfile.java index 98c979758..6002bd9ff 100644 --- a/source/net/yacy/crawler/data/CrawlProfile.java +++ b/source/net/yacy/crawler/data/CrawlProfile.java @@ -27,6 +27,7 @@ package net.yacy.crawler.data; import java.text.DateFormat; import java.util.Collection; +import java.util.Date; import java.util.HashMap; import java.util.Iterator; import java.util.LinkedHashMap; @@ -111,8 +112,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M * @param indexContentMustNotMatch content which match this regex will be ignored for indexing * @param depth height of the tree which will be created by the crawler * @param directDocByURL if true, then linked documents that cannot be parsed are indexed as document - * @param recrawlIfOlder documents which have been indexed in the past will - * be indexed again if they are older than the time (ms) in this parameter + * @param recrawlIfOlder documents which have been indexed in the past will be indexed again if they are older than the given date * @param domMaxPages maximum number from one domain which will be indexed * @param crawlingQ true if URLs containing questionmarks shall be indexed * @param indexText true if text content of URL shall be indexed @@ -134,7 +134,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M final String indexContentMustMatch, final String indexContentMustNotMatch, final int depth, final boolean directDocByURL, - final long recrawlIfOlder /*date*/, + final Date recrawlIfOlder /*date*/, final int domMaxPages, final boolean crawlingQ, final boolean followFrames, final boolean obeyHtmlRobotsNoindex, final boolean obeyHtmlRobotsNofollow, @@ -167,7 +167,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M put(INDEXING_CONTENT_MUSTNOTMATCH, (indexContentMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : indexContentMustNotMatch); put(DEPTH, depth); put(DIRECT_DOC_BY_URL, directDocByURL); - put(RECRAWL_IF_OLDER, recrawlIfOlder); + put(RECRAWL_IF_OLDER, recrawlIfOlder == null ? Long.MAX_VALUE : recrawlIfOlder.getTime()); put(DOM_MAX_PAGES, domMaxPages); put(CRAWLING_Q, crawlingQ); // crawling of urls with '?' put(FOLLOW_FRAMES, followFrames); // load pages contained in frames or ifames @@ -487,8 +487,8 @@ public class CrawlProfile extends ConcurrentHashMap implements M } /** - * Gets the minimum age that an entry must have to be re-crawled. - * @return time in ms + * Gets the minimum date that an entry must have to be re-crawled. + * @return time in ms representing a date */ public long recrawlIfOlder() { // returns a long (millis) that is the minimum age that @@ -566,8 +566,13 @@ public class CrawlProfile extends ConcurrentHashMap implements M return (r.equals(Boolean.TRUE.toString())); } - public static long getRecrawlDate(final long oldTimeMinutes) { - return System.currentTimeMillis() - (60000L * oldTimeMinutes); + /** + * get a recrawl date for a given age in minutes + * @param oldTimeMinutes + * @return a Date representing the recrawl date limit + */ + public static Date getRecrawlDate(final long oldTimeMinutes) { + return new Date(System.currentTimeMillis() - (60000L * oldTimeMinutes)); } public static String siteFilter(final Collection urls) { diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 2be7fbe4d..0afc77644 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -2092,31 +2092,31 @@ public final class Switchboard extends serverSwitch { } boolean insert = false; if ( selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_PROXY) ) { - selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_PROXY_RECRAWL_CYCLE))); + selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_PROXY_RECRAWL_CYCLE).getTime())); insert = true; } if ( selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT) ) { - selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE))); + selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE).getTime())); insert = true; } if ( selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT) ) { - selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE))); + selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE).getTime())); insert = true; } if ( selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_GREEDY_LEARNING_TEXT) ) { - selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_GREEDY_LEARNING_TEXT_RECRAWL_CYCLE))); + selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_GREEDY_LEARNING_TEXT_RECRAWL_CYCLE).getTime())); insert = true; } if ( selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA) ) { - selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE))); + selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE).getTime())); insert = true; } if ( selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA) ) { - selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE))); + selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE).getTime())); insert = true; } if ( selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SURROGATE) ) { - selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE))); + selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE).getTime())); insert = true; } if ( insert ) {