From bd931a82f70a58f17e35fa163bb4c53a9f8a04f1 Mon Sep 17 00:00:00 2001
From: apfelmaennchen
Date: Fri, 22 Aug 2008 18:05:05 +0000
Subject: [PATCH] - added dynamic filters to autoReCrawl.conf
 - Restrict to sub-path: sub
 - Restrict to start-domain: dom

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5070 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
 defaults/yacy.init                     | 10 +++++++++
 source/de/anomic/data/bookmarksDB.java | 30 ++++++++++++++++++--------
 2 files changed, 31 insertions(+), 9 deletions(-)

diff --git a/defaults/yacy.init b/defaults/yacy.init
index 4a28d0362..d16f90924 100644
--- a/defaults/yacy.init
+++ b/defaults/yacy.init
@@ -880,3 +880,13 @@ routing.deleteOldSeeds.permission__pro = true
 routing.deleteOldSeeds.time = 7
 routing.deleteOldSeeds.time__pro = 30
 
+# autoReCrawl Options
+autoReCrawl_idlesleep = 3600000
+autoReCrawl_busysleep = 3600000
+autoReCrawl_memprereq = -1
+
+
+
+
+
+
diff --git a/source/de/anomic/data/bookmarksDB.java b/source/de/anomic/data/bookmarksDB.java
index ce914ed0c..3f0ecc51f 100644
--- a/source/de/anomic/data/bookmarksDB.java
+++ b/source/de/anomic/data/bookmarksDB.java
@@ -222,16 +222,27 @@ public class bookmarksDB {
             serverLog.logInfo("BOOKMARKS", "autoReCrawl - checking schedule for: "+"["+serverDate.formatISO8601(date)+"] "+bm.getUrl());
             if (interTime >= 0 && interTime < sleepTime) {
-                try {
-                    // check if the crawl filter works correctly
-                    Pattern.compile(newcrawlingfilter);
-
-                    // set crawlingStart to BookmarkUrl
-                    String crawlingStart = bm.getUrl();
-
-                    // stack request
-                    // first delete old entry, if exists
+                try {
+                    int pos = 0;
+                    // set crawlingStart to BookmarkUrl
+                    String crawlingStart = bm.getUrl();
+                    yacyURL crawlingStartURL = new yacyURL(crawlingStart, null);
+
+                    // set the crawling filter
+                    if (newcrawlingfilter.length() < 2) newcrawlingfilter = ".*"; // avoid that all urls are filtered out if bad value was submitted
+
+                    if (crawlingStartURL != null && newcrawlingfilter.equals("dom")) {
+                        newcrawlingfilter = ".*" + crawlingStartURL.getHost() + ".*";
+                    }
+                    if (crawlingStart != null && newcrawlingfilter.equals("sub") && (pos = crawlingStart.lastIndexOf("/")) > 0) {
+                        newcrawlingfilter = crawlingStart.substring(0, pos + 1) + ".*";
+                    }
+                    sb.setConfig("crawlingFilter", newcrawlingfilter);
+
+                    // check if the crawl filter works correctly
+                    Pattern.compile(newcrawlingfilter);
+
                     String urlhash = crawlingStartURL.hash();
                     sb.webIndex.removeURL(urlhash);
                     sb.crawlQueues.noticeURL.removeByURLHash(urlhash);
@@ -250,6 +261,7 @@ public class bookmarksDB {
 
                     if (reasonString == null) {
                         serverLog.logInfo("BOOKMARKS", "autoReCrawl - adding crawl profile for: " + crawlingStart);
+                        serverLog.logInfo("BOOKMARKS", "autoReCrawl - crawl filter is set to: " + newcrawlingfilter);
                         // generate a YaCyNews if the global flag was set
                         if (crawlOrder) {
                             Map m = new HashMap(pe.map()); // must be cloned
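
Note (not part of the commit): as a quick illustration of the filter expansion this patch introduces, the sketch below reproduces the "dom"/"sub" handling in plain Java. It uses java.net.URL in place of yacyURL, and the class and method names (CrawlFilterDemo, expandFilter) are made up for the example; the ".*" fallback and the Pattern.compile() sanity check mirror the patched bookmarksDB code.

import java.net.MalformedURLException;
import java.net.URL;
import java.util.regex.Pattern;

public class CrawlFilterDemo {

    // Sketch of the patch's filter expansion; yacyURL is replaced by
    // java.net.URL here, which is an assumption of this example.
    static String expandFilter(String filter, String crawlingStart) throws MalformedURLException {
        int pos = 0;

        // avoid that all URLs are filtered out if a bad value was submitted
        if (filter == null || filter.length() < 2) filter = ".*";

        // "dom": restrict the re-crawl to the start domain
        if (filter.equals("dom")) {
            filter = ".*" + new URL(crawlingStart).getHost() + ".*";
        }
        // "sub": restrict the re-crawl to the sub-path of the bookmark URL
        if (filter.equals("sub") && (pos = crawlingStart.lastIndexOf("/")) > 0) {
            filter = crawlingStart.substring(0, pos + 1) + ".*";
        }

        // same sanity check as in bookmarksDB: the filter must compile as a regex
        Pattern.compile(filter);
        return filter;
    }

    public static void main(String[] args) throws MalformedURLException {
        String url = "http://www.example.org/wiki/Page";
        System.out.println(expandFilter("dom", url)); // .*www.example.org.*
        System.out.println(expandFilter("sub", url)); // http://www.example.org/wiki/.*
        System.out.println(expandFilter("x", url));   // .* (fallback for bad values)
    }
}

In other words, "dom" keeps the re-crawl on the bookmark's host, "sub" keeps it below the bookmark's directory, and anything shorter than two characters falls back to ".*" so a bad value cannot filter out every URL.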