diff --git a/htroot/CrawlProfileEditor_p.java b/htroot/CrawlProfileEditor_p.java index 21ce6c013..6b321d91c 100644 --- a/htroot/CrawlProfileEditor_p.java +++ b/htroot/CrawlProfileEditor_p.java @@ -25,6 +25,7 @@ // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA import java.io.IOException; +import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Iterator; @@ -215,7 +216,7 @@ public class CrawlProfileEditor_p { prop.put("crawlProfiles_" + count + "_handle", profile.handle()); prop.put("crawlProfiles_" + count + "_depth", profile.generalDepth()); prop.put("crawlProfiles_" + count + "_filter", profile.generalFilter()); - prop.put("crawlProfiles_" + count + "_crawlingIfOlder", (profile.recrawlIfOlder() == Long.MAX_VALUE) ? "no re-crawl" : ""+profile.recrawlIfOlder()); + prop.put("crawlProfiles_" + count + "_crawlingIfOlder", (profile.recrawlIfOlder() == 0L) ? "no re-crawl" : ""+ SimpleDateFormat.getDateTimeInstance().format(profile.recrawlIfOlder())); prop.put("crawlProfiles_" + count + "_crawlingDomFilterDepth", (profile.domFilterDepth() == Integer.MAX_VALUE) ? "inactive" : Integer.toString(profile.domFilterDepth())); // start contrib [MN] diff --git a/source/de/anomic/crawler/CrawlProfile.java b/source/de/anomic/crawler/CrawlProfile.java index 69f82de97..73b84ffd5 100644 --- a/source/de/anomic/crawler/CrawlProfile.java +++ b/source/de/anomic/crawler/CrawlProfile.java @@ -1,4 +1,4 @@ -// plasmaCrawlProfile.java +// CrawlProfile.java // ------------------------ // part of YaCy // (C) by Michael Peter Christen; mc@yacy.net @@ -199,6 +199,10 @@ public class CrawlProfile { profileTable.put(e.handle(), e.mem); } + public long getRecrawlDate(final long oldTimeMinutes) { + return System.currentTimeMillis() - (60000L * oldTimeMinutes); + } + public static class DomProfile { public String referrer; diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 894f2007e..50dac3278 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -122,6 +122,7 @@ import de.anomic.crawler.ResultImages; import de.anomic.crawler.ResultURLs; import de.anomic.crawler.RobotsTxt; import de.anomic.crawler.ZURL; +import de.anomic.crawler.CrawlProfile.entry; import de.anomic.data.URLLicense; import de.anomic.data.blogBoard; import de.anomic.data.blogBoardComments; @@ -1295,6 +1296,31 @@ public final class plasmaSwitchboard extends serverAbstractSwitch it = webIndex.profilesActiveCrawls.profiles(true); + entry selentry; + while (it.hasNext()) { + selentry = it.next(); + if (selentry.name().equals(plasmaWordIndex.CRAWL_PROFILE_PROXY)) + webIndex.profilesActiveCrawls.changeEntry(selentry, CrawlProfile.entry.RECRAWL_IF_OLDER, + Long.toString(webIndex.profilesActiveCrawls.getRecrawlDate(plasmaWordIndex.CRAWL_PROFILE_PROXY_RECRAWL_CYCLE))); + // if (selentry.name().equals(plasmaWordIndex.CRAWL_PROFILE_REMOTE)); + if (selentry.name().equals(plasmaWordIndex.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT)) + webIndex.profilesActiveCrawls.changeEntry(selentry, CrawlProfile.entry.RECRAWL_IF_OLDER, + Long.toString(webIndex.profilesActiveCrawls.getRecrawlDate(plasmaWordIndex.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE))); + if (selentry.name().equals(plasmaWordIndex.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT)) + webIndex.profilesActiveCrawls.changeEntry(selentry, CrawlProfile.entry.RECRAWL_IF_OLDER, + Long.toString(webIndex.profilesActiveCrawls.getRecrawlDate(plasmaWordIndex.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE))); + if (selentry.name().equals(plasmaWordIndex.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA)) + webIndex.profilesActiveCrawls.changeEntry(selentry, CrawlProfile.entry.RECRAWL_IF_OLDER, + Long.toString(webIndex.profilesActiveCrawls.getRecrawlDate(plasmaWordIndex.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE))); + if (selentry.name().equals(plasmaWordIndex.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA)) + webIndex.profilesActiveCrawls.changeEntry(selentry, CrawlProfile.entry.RECRAWL_IF_OLDER, + Long.toString(webIndex.profilesActiveCrawls.getRecrawlDate(plasmaWordIndex.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE))); + } + } catch (final IOException e) {}; + // close unused connections JakartaCommonsHttpClient.cleanup(); diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java index c6d2d1e76..2e70fc4f8 100644 --- a/source/de/anomic/plasma/plasmaWordIndex.java +++ b/source/de/anomic/plasma/plasmaWordIndex.java @@ -90,6 +90,12 @@ public final class plasmaWordIndex implements indexRI { public static final String DBFILE_ACTIVE_CRAWL_PROFILES = "crawlProfilesActive.db"; public static final String DBFILE_PASSIVE_CRAWL_PROFILES = "crawlProfilesPassive.db"; + public static final long CRAWL_PROFILE_PROXY_RECRAWL_CYCLE = 60L * 24L; + public static final long CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE = 60L * 24L * 30L; + public static final long CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE = 60L * 24L * 30L; + public static final long CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE = 60L * 24L * 30L; + public static final long CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE = 60L * 24L * 30L; + private final kelondroByteOrder indexOrder = kelondroBase64Order.enhancedCoder; private final indexRAMRI dhtOutCache, dhtInCache; @@ -249,7 +255,7 @@ public final class plasmaWordIndex implements indexRI { this.defaultProxyProfile = this.profilesActiveCrawls.newEntry("proxy", null, ".*", ".*", 0 /*Integer.parseInt(getConfig(PROXY_PREFETCH_DEPTH, "0"))*/, 0 /*Integer.parseInt(getConfig(PROXY_PREFETCH_DEPTH, "0"))*/, - 60 * 24, -1, -1, false, + this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE), -1, -1, false, true /*getConfigBool(PROXY_INDEXING_LOCAL_TEXT, true)*/, true /*getConfigBool(PROXY_INDEXING_LOCAL_MEDIA, true)*/, true, true, @@ -263,22 +269,22 @@ public final class plasmaWordIndex implements indexRI { if (this.defaultTextSnippetLocalProfile == null) { // generate new default entry for snippet fetch and optional crawling defaultTextSnippetLocalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, ".*", ".*", 0, 0, - 60 * 24 * 30, -1, -1, true, false, false, false, false, false, true, true, false); + this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, -1, true, false, false, false, false, false, true, true, false); } if (this.defaultTextSnippetGlobalProfile == null) { // generate new default entry for snippet fetch and optional crawling defaultTextSnippetGlobalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, null, ".*", ".*", 0, 0, - 60 * 24 * 30, -1, -1, true, true, true, true, true, false, true, true, false); + this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), -1, -1, true, true, true, true, true, false, true, true, false); } if (this.defaultMediaSnippetLocalProfile == null) { // generate new default entry for snippet fetch and optional crawling defaultMediaSnippetLocalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, null, ".*", ".*", 0, 0, - 60 * 24 * 30, -1, -1, true, false, false, false, false, false, true, true, false); + this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE), -1, -1, true, false, false, false, false, false, true, true, false); } if (this.defaultMediaSnippetGlobalProfile == null) { // generate new default entry for snippet fetch and optional crawling defaultMediaSnippetGlobalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, ".*", ".*", 0, 0, - 60 * 24 * 30, -1, -1, true, false, true, true, true, false, true, true, false); + this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), -1, -1, true, false, true, true, true, false, true, true, false); } }