changed handling of the crawl profile field crawlingIfOlder: this field should hold the date before which a URL is considered outdated. The field was partly misinterpreted and a time interval was stored in it instead. If all URLs in the index shall be treated as outdated, the field is now set to Long.MAX_VALUE, because every crawl date lies before that date and is therefore outdated.
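
As a minimal sketch of the new semantics (the helper isOutdated and all names in it are illustrative only, not taken from the YaCy sources): the field stores a cut-off date as epoch milliseconds, a document whose last crawl date lies before that cut-off is outdated, and Long.MAX_VALUE therefore marks every indexed document as outdated.

    // illustrative sketch, not YaCy code: date-based "outdated" check
    import java.util.Date;

    public class RecrawlCheckSketch {

        /** A document is outdated if its last crawl date lies before the
         *  recrawlIfOlder cut-off; with Long.MAX_VALUE every crawl date is
         *  before the cut-off, so all indexed documents count as outdated. */
        static boolean isOutdated(final Date lastCrawlDate, final long recrawlIfOlderMillis) {
            return lastCrawlDate.getTime() < recrawlIfOlderMillis;
        }

        public static void main(String[] args) {
            final Date crawledYesterday = new Date(System.currentTimeMillis() - 24L * 60L * 60L * 1000L);
            final long oneHourAgo = System.currentTimeMillis() - 60L * 60L * 1000L;
            System.out.println(isOutdated(crawledYesterday, oneHourAgo));     // true: crawled before the cut-off
            System.out.println(isOutdated(crawledYesterday, Long.MAX_VALUE)); // true: everything is outdated
            System.out.println(isOutdated(crawledYesterday, 0L));             // false: nothing is outdated
        }
    }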
Michael Peter Christen 11 years ago
parent 4eec1a7452
commit 542c20a597

@@ -210,8 +210,7 @@ public class Crawler_p {
         final boolean deleteage = restrictedcrawl && "age".equals(post.get("deleteold","off"));
         Date deleteageDate = null;
         if (deleteage) {
-            long t = timeParser(true, post.getInt("deleteIfOlderNumber", -1), post.get("deleteIfOlderUnit","year")); // year, month, day, hour
-            if (t > 0) deleteageDate = new Date(t);
+            deleteageDate = timeParser(true, post.getInt("deleteIfOlderNumber", -1), post.get("deleteIfOlderUnit","year")); // year, month, day, hour
         }
         final boolean deleteold = (deleteage && deleteageDate != null) || (restrictedcrawl && post.getBoolean("deleteold"));
@@ -289,11 +288,11 @@ public class Crawler_p {
         // recrawl
         final String recrawl = post.get("recrawl", "nodoubles"); // nodoubles, reload, scheduler
-        long crawlingIfOlder = 0;
+        Date crawlingIfOlder = null;
         if ("reload".equals(recrawl)) {
             crawlingIfOlder = timeParser(true, post.getInt("reloadIfOlderNumber", -1), post.get("reloadIfOlderUnit","year")); // year, month, day, hour
         }
-        env.setConfig("crawlingIfOlder", crawlingIfOlder);
+        env.setConfig("crawlingIfOlder", crawlingIfOlder == null ? Long.MAX_VALUE : crawlingIfOlder.getTime());
         // store this call as api call
         sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + ((rootURLs.size() == 0) ? post.get("crawlingFile", "") : rootURLs.iterator().next().toNormalform(true)));
@@ -672,14 +671,14 @@ public class Crawler_p {
         return prop;
     }
-    private static long timeParser(final boolean recrawlIfOlderCheck, final int number, final String unit) {
-        if (!recrawlIfOlderCheck) return 0L;
-        if ("year".equals(unit)) return System.currentTimeMillis() - number * 1000L * 60L * 60L * 24L * 365L;
-        if ("month".equals(unit)) return System.currentTimeMillis() - number * 1000L * 60L * 60L * 24L * 30L;
-        if ("day".equals(unit)) return System.currentTimeMillis() - number * 1000L * 60L * 60L * 24L;
-        if ("hour".equals(unit)) return System.currentTimeMillis() - number * 1000L * 60L * 60L;
-        if ("minute".equals(unit)) return System.currentTimeMillis() - number * 1000L * 60L;
-        return 0L;
+    private static Date timeParser(final boolean recrawlIfOlderCheck, final int number, final String unit) {
+        if (!recrawlIfOlderCheck) return null;
+        if ("year".equals(unit)) return new Date(System.currentTimeMillis() - number * 1000L * 60L * 60L * 24L * 365L);
+        if ("month".equals(unit)) return new Date(System.currentTimeMillis() - number * 1000L * 60L * 60L * 24L * 30L);
+        if ("day".equals(unit)) return new Date(System.currentTimeMillis() - number * 1000L * 60L * 60L * 24L);
+        if ("hour".equals(unit)) return new Date(System.currentTimeMillis() - number * 1000L * 60L * 60L);
+        if ("minute".equals(unit)) return new Date(System.currentTimeMillis() - number * 1000L * 60L);
+        return null;
     }
 }

@@ -146,7 +146,7 @@ public class QuickCrawlLink_p {
                 CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
                 CrawlingDepth,
                 true,
-                60 * 24 * 30, // recrawlIfOlder (minutes); here: one month
+                CrawlProfile.getRecrawlDate(60 * 24 * 30), // recrawlIfOlder (minutes); here: one month
                 -1, // domMaxPages, if negative: no count restriction
                 crawlingQ, followFrames,
                 obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,

@@ -315,7 +315,7 @@ public final class CrawlSwitchboard {
                 CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
                 0,
                 false,
-                -1,
+                null,
                 -1,
                 true, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
                 true,
@@ -522,7 +522,7 @@ public final class CrawlSwitchboard {
                 CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
                 0,
                 false,
-                System.currentTimeMillis(),
+                null,
                 -1,
                 true, true, false, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
                 true,

@@ -27,6 +27,7 @@ package net.yacy.crawler.data;
 import java.text.DateFormat;
 import java.util.Collection;
+import java.util.Date;
 import java.util.HashMap;
 import java.util.Iterator;
 import java.util.LinkedHashMap;
@@ -111,8 +112,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
      * @param indexContentMustNotMatch content which match this regex will be ignored for indexing
      * @param depth height of the tree which will be created by the crawler
      * @param directDocByURL if true, then linked documents that cannot be parsed are indexed as document
-     * @param recrawlIfOlder documents which have been indexed in the past will
-     *        be indexed again if they are older than the time (ms) in this parameter
+     * @param recrawlIfOlder documents which have been indexed in the past will be indexed again if they are older than the given date
      * @param domMaxPages maximum number from one domain which will be indexed
      * @param crawlingQ true if URLs containing questionmarks shall be indexed
      * @param indexText true if text content of URL shall be indexed
@@ -134,7 +134,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
                  final String indexContentMustMatch, final String indexContentMustNotMatch,
                  final int depth,
                  final boolean directDocByURL,
-                 final long recrawlIfOlder /*date*/,
+                 final Date recrawlIfOlder /*date*/,
                  final int domMaxPages,
                  final boolean crawlingQ, final boolean followFrames,
                  final boolean obeyHtmlRobotsNoindex, final boolean obeyHtmlRobotsNofollow,
@@ -167,7 +167,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
         put(INDEXING_CONTENT_MUSTNOTMATCH, (indexContentMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : indexContentMustNotMatch);
         put(DEPTH, depth);
         put(DIRECT_DOC_BY_URL, directDocByURL);
-        put(RECRAWL_IF_OLDER, recrawlIfOlder);
+        put(RECRAWL_IF_OLDER, recrawlIfOlder == null ? Long.MAX_VALUE : recrawlIfOlder.getTime());
         put(DOM_MAX_PAGES, domMaxPages);
         put(CRAWLING_Q, crawlingQ); // crawling of urls with '?'
         put(FOLLOW_FRAMES, followFrames); // load pages contained in frames or ifames
@@ -487,8 +487,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
     }
     /**
-     * Gets the minimum age that an entry must have to be re-crawled.
-     * @return time in ms
+     * Gets the minimum date that an entry must have to be re-crawled.
+     * @return time in ms representing a date
      */
     public long recrawlIfOlder() {
         // returns a long (millis) that is the minimum age that
@@ -566,8 +566,13 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
         return (r.equals(Boolean.TRUE.toString()));
     }
-    public static long getRecrawlDate(final long oldTimeMinutes) {
-        return System.currentTimeMillis() - (60000L * oldTimeMinutes);
+    /**
+     * get a recrawl date for a given age in minutes
+     * @param oldTimeMinutes
+     * @return a Date representing the recrawl date limit
+     */
+    public static Date getRecrawlDate(final long oldTimeMinutes) {
+        return new Date(System.currentTimeMillis() - (60000L * oldTimeMinutes));
     }
     public static String siteFilter(final Collection<? extends MultiProtocolURL> urls) {
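
For illustration, a small standalone usage sketch of the changed getRecrawlDate helper (it re-states the formula from the hunk above; the surrounding class and variable names are hypothetical, not part of the YaCy sources): a one-month cycle yields the date one month before now, documents indexed before that date are due for re-crawling, and the profile stores that date as epoch milliseconds under RECRAWL_IF_OLDER.

    // illustrative sketch, not YaCy code: working with a date-based recrawl limit
    import java.util.Date;

    public class GetRecrawlDateSketch {

        // same formula as CrawlProfile.getRecrawlDate in the hunk above
        static Date getRecrawlDate(final long oldTimeMinutes) {
            return new Date(System.currentTimeMillis() - (60000L * oldTimeMinutes));
        }

        public static void main(String[] args) {
            final Date limit = getRecrawlDate(60 * 24 * 30); // one month, as in QuickCrawlLink_p
            final Date indexedTwoMonthsAgo = new Date(System.currentTimeMillis() - 60L * 24L * 60L * 60000L);
            System.out.println(indexedTwoMonthsAgo.before(limit)); // true: due for re-crawling
            System.out.println(limit.getTime());                   // value stored under RECRAWL_IF_OLDER
        }
    }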

@@ -2092,31 +2092,31 @@ public final class Switchboard extends serverSwitch {
             }
             boolean insert = false;
             if ( selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_PROXY) ) {
-                selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_PROXY_RECRAWL_CYCLE)));
+                selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_PROXY_RECRAWL_CYCLE).getTime()));
                 insert = true;
             }
             if ( selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT) ) {
-                selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE)));
+                selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE).getTime()));
                 insert = true;
             }
             if ( selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT) ) {
-                selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE)));
+                selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE).getTime()));
                 insert = true;
             }
             if ( selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_GREEDY_LEARNING_TEXT) ) {
-                selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_GREEDY_LEARNING_TEXT_RECRAWL_CYCLE)));
+                selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_GREEDY_LEARNING_TEXT_RECRAWL_CYCLE).getTime()));
                 insert = true;
             }
             if ( selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA) ) {
-                selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE)));
+                selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE).getTime()));
                 insert = true;
             }
             if ( selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA) ) {
-                selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE)));
+                selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE).getTime()));
                 insert = true;
             }
             if ( selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SURROGATE) ) {
-                selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE)));
+                selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE).getTime()));
                 insert = true;
             }
             if ( insert ) {
