From 70dd26ec95e3a7d83958547b0f481f2211350bef Mon Sep 17 00:00:00 2001
From: orbiter
Date: Thu, 19 Aug 2010 23:52:38 +0000
Subject: [PATCH] added the new crawl scheduling function to the crawl start menu:

- the scheduler extends the option for re-crawl timing. Many people
  misunderstood the re-crawl timing feature because it was just a criterion
  for the URL double-check and not a scheduler. Now the scheduler setting is
  combined with the re-crawl setting, and users can choose between no
  re-crawl, the re-crawl that was possible so far, and a scheduled re-crawl.
  The 'classic' re-crawl time is set automatically when the scheduling
  function is selected (a short sketch of this mapping is appended after the
  diff)
- removed the bookmark-based scheduler. That scheduler could not transport
  all attributes of a crawl start and therefore did not support special
  crawl starts, e.g. for forums and wikis
- since the old scheduler was not able to crawl special forums and wikis,
  the must-not-match filter was statically fixed to a list of known bad
  pages for these special use cases. Since the new scheduler can handle
  these filters, the default settings for the filters can be removed
- removed the busy thread that was used to trigger the bookmark-based
  scheduler
- removed the crontab for the bookmark-based scheduler

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7051 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
 defaults/autoReCrawl.conf                   |   8 -
 defaults/yacy.init                          |   5 -
 htroot/CrawlStart_p.html                    | 126 +++++++-----
 htroot/Crawler_p.java                       |  38 +++-
 htroot/PerformanceQueues_p.java             |  13 +-
 htroot/QuickCrawlLink_p.java                |   2 +-
 htroot/Table_API_p.java                     |   2 +-
 source/de/anomic/crawler/CrawlProfile.java  |   1 -
 .../de/anomic/crawler/CrawlSwitchboard.java |  14 +-
 source/de/anomic/data/SitemapParser.java    |   2 +-
 source/de/anomic/data/WorkTables.java       |  96 ++++++---
 source/de/anomic/data/bookmarksDB.java      | 188 +-----------------
 source/de/anomic/search/Switchboard.java    |   5 +-
 source/net/yacy/kelondro/blob/Tables.java   |  11 +
 14 files changed, 200 insertions(+), 311 deletions(-)
 delete mode 100644 defaults/autoReCrawl.conf

diff --git a/defaults/autoReCrawl.conf b/defaults/autoReCrawl.conf
deleted file mode 100644
index 217f2b1ba..000000000
--- a/defaults/autoReCrawl.conf
+++ /dev/null
@@ -1,8 +0,0 @@
-# YaCy autoReCrawl configuration for bookmark folders
-#
-# schedule|folder|filter|crawlingdepth|crawlingIfOlder|DomFilterDepth|DomMaxPages|crawlingQ|indexText|indexMedia|crawlOrder|xsstopw|storeHTCache
-3600000 /autoReCrawl/hourly .* 1 59 -1 -1 true true true true false false
-86400000 /autoReCrawl/daily .* 3 1439 -1 -1 true true true true false false
-604800000 /autoReCrawl/weekly .* 3 10079 -1 -1 true true true true false false
-2678400000 /autoReCrawl/monthly .* 4 44639 -1 -1 true true true true false false
-# eof
diff --git a/defaults/yacy.init b/defaults/yacy.init
index 54b6f5af9..671341eae 100644
--- a/defaults/yacy.init
+++ b/defaults/yacy.init
@@ -579,11 +579,6 @@ filterOutStopwordsFromTopwords=true
 90_cleanup_busysleep=300000
 90_cleanup_memprereq=0
 
-# autoReCrawl Options
-autoReCrawl_idlesleep = 3600000
-autoReCrawl_busysleep = 3600000
-autoReCrawl_memprereq = -1
-
 # additional attributes:
 # performanceIO is a percent-value.
a value of 10 means, that 10% of the busysleep time # is used to flush the RAM cache, which is the major part of the IO in YaCy diff --git a/htroot/CrawlStart_p.html b/htroot/CrawlStart_p.html index 56e036261..0ee51c9ae 100644 --- a/htroot/CrawlStart_p.html +++ b/htroot/CrawlStart_p.html @@ -54,7 +54,7 @@ : - + @@ -70,42 +70,65 @@ Other already visited URLs are sorted out as "double", if they are not allowed using the re-crawl option. - - Create Bookmark - - : - -    (works with "Starting Point: From URL" only) -

- :    -

- : - -
  - - - This option lets you create a bookmark from your crawl start URL. For automatic re-crawling you can use the following default folders:
- - Attention: recrawl settings depend on the folder. They can be adjusted in /DATA/SETTINGS/autoReCrawl.conf. - - : - This defines how often the Crawler will follow links embedded in websites.
- A minimum of 0 is recommended and means that the page you enter under "Starting Point" will be added - to the index, but no linked content is indexed. 2-4 is good for normal indexing. - Be careful with the depth. Consider a branching factor of average 20; - A prefetch-depth of 8 would index 25.600.000.000 pages, maybe this is the whole WWW. + This defines how often the Crawler will follow links (of links..) embedded in websites. + 0 means that only the page you enter under "Starting Point" will be added + to the index. 2-4 is good for normal indexing. Values over 8 are not useful, since a depth-8 crawl will + index approximately 25.600.000.000 pages, maybe this is the whole WWW. + Scheduled re-crawl + +
+
no doubles
+
run this crawl once and never load any page that is already known; only the start URL may be loaded again.
+
re-load
+
run this crawl once, but treat urls that are known since
+ + not as double and load them again. No scheduled re-crawl. +
+
scheduled
+
after starting this crawl, repeat the crawl every
+ + automatically. +
+
+ + + A web crawl performs a double-check on all links found in the internet against the internal database. If the same url is found again, + then the url is treated as double when you check the 'no doubles' option. A url may be loaded again when it has reached a specific age, + to use that check the 'once' option. When you want that this web crawl is repeated automatically, then check the 'scheduled' option. + In this case the crawl is repeated after the given time and no url from the previous crawl is omitted as double. + + + : Use filter   @@ -132,26 +155,6 @@ - Re-crawl known URLs: - - : -    - : - - - - - If you use this option, web pages that are already existent in your database are crawled and indexed again. - It depends on the age of the last crawl if this is done or not: if the last crawl is older than the given - date, the page is crawled again, otherwise it is treated as 'double' and not loaded or indexed again. - - - Auto-Dom-Filter: : @@ -167,7 +170,7 @@ The default value 0 gives no restrictions. - + Maximum Pages per Domain: : @@ -181,7 +184,7 @@ the given depth. Domains outside the given depth are then sorted-out anyway. - + : @@ -189,7 +192,7 @@ is accessed with URLs containing question marks. If you are unsure, do not check this to avoid crawl loops. - + : @@ -275,6 +278,23 @@ --> + + Create Bookmark + + : + +    (works with "Starting Point: From URL" only) +

+ :    +

+ : + +
  + + + This option lets you create a bookmark from your crawl start URL. + + diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java index 6d3d0e090..f918f354e 100644 --- a/htroot/Crawler_p.java +++ b/htroot/Crawler_p.java @@ -140,9 +140,6 @@ public class Crawler_p { try {crawlingStartURL = new DigestURI(crawlingStart, null);} catch (final MalformedURLException e1) {} crawlingStart = (crawlingStartURL == null) ? null : crawlingStartURL.toNormalform(true, true); - // store this call as api call - sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + crawlingStart); - // set new properties final boolean fullDomain = post.get("range", "wide").equals("domain"); // special property in simple crawl start final boolean subPath = post.get("range", "wide").equals("subpath"); // special property in simple crawl start @@ -167,12 +164,37 @@ public class Crawler_p { env.setConfig("crawlingDepth", Integer.toString(newcrawlingdepth)); if ((crawlOrder) && (newcrawlingdepth > 8)) newcrawlingdepth = 8; - final boolean crawlingIfOlderCheck = post.get("crawlingIfOlderCheck", "off").equals("on"); - final int crawlingIfOlderNumber = Integer.parseInt(post.get("crawlingIfOlderNumber", "-1")); - final String crawlingIfOlderUnit = post.get("crawlingIfOlderUnit","year"); - final long crawlingIfOlder = recrawlIfOlderC(crawlingIfOlderCheck, crawlingIfOlderNumber, crawlingIfOlderUnit); - env.setConfig("crawlingIfOlder", crawlingIfOlder); + // recrawl + final String recrawl = post.get("recrawl", "nodoubles"); // nodoubles, reload, scheduler + boolean crawlingIfOlderCheck = post.get("crawlingIfOlderCheck", "off").equals("on"); + int crawlingIfOlderNumber = Integer.parseInt(post.get("crawlingIfOlderNumber", "-1")); + String crawlingIfOlderUnit = post.get("crawlingIfOlderUnit","year"); // year, month, day, hour + int repeat_time = Integer.parseInt(post.get("repeat_time", "-1")); + final String repeat_unit = post.get("repeat_unit", "seldays"); // selminutes, selhours, seldays + if (recrawl.equals("scheduler")) { + // set crawlingIfOlder attributes that are appropriate for scheduled crawling + crawlingIfOlderCheck = true; + crawlingIfOlderNumber = repeat_unit.equals("selminutes") ? 1 : repeat_unit.equals("selhours") ? repeat_time / 2 : repeat_time * 12; + crawlingIfOlderUnit = "hour"; + } else if (recrawl.equals("reload")) { + repeat_time = -1; + crawlingIfOlderCheck = true; + } else if (recrawl.equals("nodoubles")) { + repeat_time = -1; + crawlingIfOlderCheck = false; + } + long crawlingIfOlder = recrawlIfOlderC(crawlingIfOlderCheck, crawlingIfOlderNumber, crawlingIfOlderUnit); + env.setConfig("crawlingIfOlder", crawlingIfOlder); + + // store this call as api call + if (repeat_time > 0) { + // store as scheduled api call + sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + crawlingStart, repeat_time, repeat_unit.substring(3)); + } else { + // store just a protocol + sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + crawlingStart); + } final boolean crawlingDomFilterCheck = post.get("crawlingDomFilterCheck", "off").equals("on"); final int crawlingDomFilterDepth = (crawlingDomFilterCheck) ? 
Integer.parseInt(post.get("crawlingDomFilterDepth", "-1")) : -1; env.setConfig("crawlingDomFilterDepth", Integer.toString(crawlingDomFilterDepth)); diff --git a/htroot/PerformanceQueues_p.java b/htroot/PerformanceQueues_p.java index 8ae423edc..8a5efe424 100644 --- a/htroot/PerformanceQueues_p.java +++ b/htroot/PerformanceQueues_p.java @@ -207,13 +207,12 @@ public class PerformanceQueues_p { busysleep = sb.getConfigLong(threadName + "_busysleep", busysleep); } if (setProfile) { - if (threadName.equals(SwitchboardConstants.PEER_PING) - || threadName.equals(SwitchboardConstants.SEED_UPLOAD) - || threadName.equals(SwitchboardConstants.CLEANUP) - || threadName.equals("autoReCrawl") - ) { /* do not change any values */ } - else if (threadName.equals(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER) - || threadName.equals(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL)) { + if (threadName.equals(SwitchboardConstants.PEER_PING) || + threadName.equals(SwitchboardConstants.SEED_UPLOAD) || + threadName.equals(SwitchboardConstants.CLEANUP)) { + /* do not change any values */ + } else if (threadName.equals(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER) || + threadName.equals(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL)) { sb.setRemotecrawlPPM(Math.max(1, (int) (sb.getConfigLong("network.unit.remotecrawl.speed", 60) / multiplier))); } else { diff --git a/htroot/QuickCrawlLink_p.java b/htroot/QuickCrawlLink_p.java index bf8de8c21..6d01dae62 100644 --- a/htroot/QuickCrawlLink_p.java +++ b/htroot/QuickCrawlLink_p.java @@ -109,7 +109,7 @@ public class QuickCrawlLink_p { // get other parameters if set final String crawlingMustMatch = post.get("mustmatch", CrawlProfile.MATCH_ALL); - final String crawlingMustNotMatch = post.get("mustnotmatch", CrawlProfile.MATCH_BAD_URL); + final String crawlingMustNotMatch = post.get("mustnotmatch", CrawlProfile.MATCH_NEVER); final int CrawlingDepth = Integer.parseInt(post.get("crawlingDepth", "0")); final boolean crawlDynamic = post.get("crawlingQ", "").equals("on"); final boolean indexText = post.get("indexText", "on").equals("on"); diff --git a/htroot/Table_API_p.java b/htroot/Table_API_p.java index 21d45b733..b8a6d3675 100644 --- a/htroot/Table_API_p.java +++ b/htroot/Table_API_p.java @@ -62,7 +62,7 @@ public class Table_API_p { if (action.equals("on")) { Tables.Row row = sb.tables.select(WorkTables.TABLE_API_NAME, pk.getBytes()); if (row != null) { - row.put(WorkTables.TABLE_API_COL_APICALL_SCHEDULE_TIME, 1); + row.put(WorkTables.TABLE_API_COL_APICALL_SCHEDULE_TIME, 7); row.put(WorkTables.TABLE_API_COL_APICALL_SCHEDULE_UNIT, "days"); WorkTables.calculateAPIScheduler(row, false); sb.tables.update(WorkTables.TABLE_API_NAME, row); diff --git a/source/de/anomic/crawler/CrawlProfile.java b/source/de/anomic/crawler/CrawlProfile.java index 6d3b84174..4fb53ccc5 100644 --- a/source/de/anomic/crawler/CrawlProfile.java +++ b/source/de/anomic/crawler/CrawlProfile.java @@ -46,7 +46,6 @@ public class CrawlProfile { public static final String MATCH_ALL = ".*"; public static final String MATCH_NEVER = ""; - public static final String MATCH_BAD_URL = ".*memberlist.*|.*previous.*|.*next.*|.*p=.*"; static ConcurrentHashMap> domsCache = new ConcurrentHashMap>(); diff --git a/source/de/anomic/crawler/CrawlSwitchboard.java b/source/de/anomic/crawler/CrawlSwitchboard.java index 4be91cdc3..313d56980 100644 --- a/source/de/anomic/crawler/CrawlSwitchboard.java +++ b/source/de/anomic/crawler/CrawlSwitchboard.java @@ -163,7 +163,7 @@ public final class CrawlSwitchboard { if 
(this.defaultProxyProfile == null) { // generate new default entry for proxy crawling - this.defaultProxyProfile = this.profilesActiveCrawls.newEntry("proxy", null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_BAD_URL, + this.defaultProxyProfile = this.profilesActiveCrawls.newEntry("proxy", null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0 /*Integer.parseInt(getConfig(PROXY_PREFETCH_DEPTH, "0"))*/, this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE), -1, -1, false, true /*getConfigBool(PROXY_INDEXING_LOCAL_TEXT, true)*/, @@ -174,33 +174,33 @@ public final class CrawlSwitchboard { } if (this.defaultRemoteProfile == null) { // generate new default entry for remote crawling - defaultRemoteProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_REMOTE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_BAD_URL, 0, + defaultRemoteProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_REMOTE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, -1, -1, -1, true, true, true, false, true, false, true, true, false, CrawlProfile.CacheStrategy.IFFRESH); } if (this.defaultTextSnippetLocalProfile == null) { // generate new default entry for snippet fetch and optional crawling - defaultTextSnippetLocalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_BAD_URL, 0, + defaultTextSnippetLocalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, -1, true, false, false, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFFRESH); } if (this.defaultTextSnippetGlobalProfile == null) { // generate new default entry for snippet fetch and optional crawling - defaultTextSnippetGlobalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_BAD_URL, 0, + defaultTextSnippetGlobalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), -1, -1, true, true, true, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST); } this.defaultTextSnippetGlobalProfile.setCacheStrategy(CacheStrategy.IFEXIST); if (this.defaultMediaSnippetLocalProfile == null) { // generate new default entry for snippet fetch and optional crawling - defaultMediaSnippetLocalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_BAD_URL, 0, + defaultMediaSnippetLocalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE), -1, -1, true, false, false, true, false, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST); } if (this.defaultMediaSnippetGlobalProfile == null) { // generate new default entry for snippet fetch and optional crawling - defaultMediaSnippetGlobalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_BAD_URL, 0, + defaultMediaSnippetGlobalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, CrawlProfile.MATCH_ALL, 
CrawlProfile.MATCH_NEVER, 0, this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), -1, -1, true, false, true, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST); } if (this.defaultSurrogateProfile == null) { // generate new default entry for surrogate parsing - defaultSurrogateProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SURROGATE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_BAD_URL, 0, + defaultSurrogateProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SURROGATE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE), -1, -1, true, true, false, false, false, false, true, true, false, CrawlProfile.CacheStrategy.NOCACHE); } } diff --git a/source/de/anomic/data/SitemapParser.java b/source/de/anomic/data/SitemapParser.java index eb331a688..01877bbdb 100644 --- a/source/de/anomic/data/SitemapParser.java +++ b/source/de/anomic/data/SitemapParser.java @@ -315,7 +315,7 @@ public class SitemapParser extends DefaultHandler { return this.sb.crawler.profilesActiveCrawls.newEntry( domainName, sitemapURL, // crawling Filter - CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_BAD_URL, + CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, // Depth 0, // force recrawling diff --git a/source/de/anomic/data/WorkTables.java b/source/de/anomic/data/WorkTables.java index c1a6df2fd..e1d8c0f74 100644 --- a/source/de/anomic/data/WorkTables.java +++ b/source/de/anomic/data/WorkTables.java @@ -77,13 +77,6 @@ public class WorkTables extends Tables { public void recordAPICall(final serverObjects post, final String servletName, final String type, final String comment) { // remove the apicall attributes from the post object String pk = post.remove(TABLE_API_COL_APICALL_PK); - String count = post.remove(TABLE_API_COL_APICALL_COUNT); - if (count == null) count = "1"; - String time = post.remove(TABLE_API_COL_APICALL_SCHEDULE_TIME); - String unit = post.remove(TABLE_API_COL_APICALL_SCHEDULE_UNIT); - if (time == null || unit == null || unit.length() == 0 || "minutes,hours,days".indexOf(unit) < 0) { - time = ""; unit = ""; - } // generate the apicall url - without the apicall attributes final String apiurl = /*"http://localhost:" + getConfig("port", "8080") +*/ "/" + servletName + "?" 
+ post.toString(); @@ -100,20 +93,7 @@ public class WorkTables extends Tables { // insert or update entry try { - if (row != null) { - // modify and update existing entry - - // modify date attributes and patch old values - row.put(TABLE_API_COL_DATE_LAST_EXEC, DateFormatter.formatShortMilliSecond(new Date()).getBytes()); - if (!row.containsKey(TABLE_API_COL_DATE_RECORDING)) row.put(TABLE_API_COL_DATE_RECORDING, row.get(TABLE_API_COL_DATE)); - row.remove(TABLE_API_COL_DATE); - - // insert APICALL attributes - row.put(TABLE_API_COL_APICALL_COUNT, count.getBytes()); - row.put(TABLE_API_COL_APICALL_SCHEDULE_TIME, time.getBytes()); - row.put(TABLE_API_COL_APICALL_SCHEDULE_UNIT, unit.getBytes()); - super.update(TABLE_API_NAME, row); - } else { + if (row == null) { // create and insert new entry Data data = new Data(); data.put(TABLE_API_COL_TYPE, type.getBytes()); @@ -124,10 +104,19 @@ public class WorkTables extends Tables { data.put(TABLE_API_COL_URL, apiurl.getBytes()); // insert APICALL attributes - data.put(TABLE_API_COL_APICALL_COUNT, count.getBytes()); - data.put(TABLE_API_COL_APICALL_SCHEDULE_TIME, time.getBytes()); - data.put(TABLE_API_COL_APICALL_SCHEDULE_UNIT, unit.getBytes()); + data.put(TABLE_API_COL_APICALL_COUNT, "1"); super.insert(TABLE_API_NAME, data); + } else { + // modify and update existing entry + + // modify date attributes and patch old values + row.put(TABLE_API_COL_DATE_LAST_EXEC, DateFormatter.formatShortMilliSecond(new Date()).getBytes()); + if (!row.containsKey(TABLE_API_COL_DATE_RECORDING)) row.put(TABLE_API_COL_DATE_RECORDING, row.get(TABLE_API_COL_DATE)); + row.remove(TABLE_API_COL_DATE); + + // insert APICALL attributes + row.put(TABLE_API_COL_APICALL_COUNT, row.get(TABLE_API_COL_APICALL_COUNT, 1) + 1); + super.update(TABLE_API_NAME, row); } } catch (IOException e) { Log.logException(e); @@ -137,6 +126,56 @@ public class WorkTables extends Tables { Log.logInfo("APICALL", apiurl); } + /** + * store a API call and set attributes to schedule a re-call of that API call according to a given frequence + * This is the same as the previous method but it also computes a re-call time and stores that additionally + * @param post the post arguments of the api call + * @param servletName the name of the servlet + * @param type name of the servlet category + * @param comment visual description of the process + * @param time the time until next scheduled execution of this api call + * @param unit the time unit for the scheduled call + */ + public void recordAPICall(final serverObjects post, final String servletName, final String type, final String comment, int time, String unit) { + if (post.containsKey(TABLE_API_COL_APICALL_PK)) { + // this api call has already been stored somewhere. + recordAPICall(post, servletName, type, comment); + return; + } + if (time < 0 || unit == null || unit.length() == 0 || "minutes,hours,days".indexOf(unit) < 0) { + time = 0; unit = ""; + } else { + if (unit.equals("minutes") && time < 10) time = 10; + } + + // generate the apicall url - without the apicall attributes + final String apiurl = /*"http://localhost:" + getConfig("port", "8080") +*/ "/" + servletName + "?" 
+ post.toString(); + + // insert entry + try { + // create and insert new entry + Data data = new Data(); + data.put(TABLE_API_COL_TYPE, type.getBytes()); + data.put(TABLE_API_COL_COMMENT, comment.getBytes()); + byte[] date = DateFormatter.formatShortMilliSecond(new Date()).getBytes(); + data.put(TABLE_API_COL_DATE_RECORDING, date); + data.put(TABLE_API_COL_DATE_LAST_EXEC, date); + data.put(TABLE_API_COL_URL, apiurl.getBytes()); + + // insert APICALL attributes + data.put(TABLE_API_COL_APICALL_COUNT, "1".getBytes()); + data.put(TABLE_API_COL_APICALL_SCHEDULE_TIME, Integer.toString(time).getBytes()); + data.put(TABLE_API_COL_APICALL_SCHEDULE_UNIT, unit.getBytes()); + calculateAPIScheduler(data, false); // set next execution time + super.insert(TABLE_API_NAME, data); + } catch (IOException e) { + Log.logException(e); + } catch (RowSpaceExceededException e) { + Log.logException(e); + } + Log.logInfo("APICALL", apiurl); + } + /** * execute an API call using a api table row which contains all essentials * to access the server also the host, port and the authentication realm must be given @@ -164,9 +203,6 @@ public class WorkTables extends Tables { if (row == null) continue; String url = "http://" + host + ":" + port + new String(row.get(WorkTables.TABLE_API_COL_URL)); url += "&" + WorkTables.TABLE_API_COL_APICALL_PK + "=" + new String(row.getPK()); - url += "&" + WorkTables.TABLE_API_COL_APICALL_COUNT + "=" + (row.get(WorkTables.TABLE_API_COL_APICALL_COUNT, 1) + 1); - url += "&" + WorkTables.TABLE_API_COL_APICALL_SCHEDULE_TIME + "=" + row.get(WorkTables.TABLE_API_COL_APICALL_SCHEDULE_TIME, ""); - url += "&" + WorkTables.TABLE_API_COL_APICALL_SCHEDULE_UNIT + "=" + row.get(WorkTables.TABLE_API_COL_APICALL_SCHEDULE_UNIT, ""); try { client.GETbytes(url); l.put(url, client.getStatusCode()); @@ -197,8 +233,9 @@ public class WorkTables extends Tables { /** * calculate the execution time in a api call table based on given scheduling time and last execution time * @param row the database row in the api table + * @param update if true then the next execution time is based on the latest computed execution time; othervise it is based on the last execution time */ - public static void calculateAPIScheduler(Tables.Row row, boolean update) { + public static void calculateAPIScheduler(Tables.Data row, boolean update) { Date date = row.containsKey(WorkTables.TABLE_API_COL_DATE) ? row.get(WorkTables.TABLE_API_COL_DATE, new Date()) : null; date = update ? 
row.get(WorkTables.TABLE_API_COL_DATE_NEXT_EXEC, date) : row.get(WorkTables.TABLE_API_COL_DATE_LAST_EXEC, date); int time = row.get(WorkTables.TABLE_API_COL_APICALL_SCHEDULE_TIME, 1); @@ -208,10 +245,11 @@ public class WorkTables extends Tables { } String unit = row.get(WorkTables.TABLE_API_COL_APICALL_SCHEDULE_UNIT, "days"); long d = date.getTime(); - if (unit.equals("minutes")) d += 60000L * time; + if (unit.equals("minutes")) d += 60000L * Math.max(10, time); if (unit.equals("hours")) d += 60000L * 60L * time; if (unit.equals("days")) d += 60000L * 60L * 24L * time; if (d < System.currentTimeMillis()) d = System.currentTimeMillis() + 600000L; + d -= d % 60000; // remove seconds row.put(WorkTables.TABLE_API_COL_DATE_NEXT_EXEC, new Date(d)); } diff --git a/source/de/anomic/data/bookmarksDB.java b/source/de/anomic/data/bookmarksDB.java index 54668dd00..385dfa927 100644 --- a/source/de/anomic/data/bookmarksDB.java +++ b/source/de/anomic/data/bookmarksDB.java @@ -23,18 +23,11 @@ package de.anomic.data; -import java.io.BufferedReader; import java.io.File; -import java.io.FileInputStream; -import java.io.FileNotFoundException; -import java.io.FileReader; -import java.io.FileWriter; import java.io.IOException; -import java.io.InputStreamReader; import java.io.Serializable; import java.net.MalformedURLException; import java.util.Comparator; -import java.util.Date; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; @@ -42,24 +35,15 @@ import java.util.Map; import java.util.Set; import java.util.TreeSet; import java.util.concurrent.ConcurrentHashMap; -import java.util.regex.Pattern; import net.yacy.kelondro.blob.MapHeap; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.index.RowSpaceExceededException; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.order.NaturalOrder; -import net.yacy.kelondro.util.DateFormatter; -import net.yacy.kelondro.workflow.BusyThread; -import net.yacy.kelondro.workflow.InstantBusyThread; - -import de.anomic.crawler.CrawlProfile; -import de.anomic.crawler.retrieval.Request; -import de.anomic.search.Segments; -import de.anomic.search.Switchboard; -import de.anomic.yacy.yacyNewsPool; public class bookmarksDB { + // ------------------------------------ // Declaration of Class-Attributes // ------------------------------------ @@ -67,7 +51,6 @@ public class bookmarksDB { //final static int SORT_ALPHA = 1; private final static int SORT_SIZE = 2; private final static int SHOW_ALL = -1; - private final static String SLEEP_TIME = "3600000"; // default sleepTime: check for recrawls every hour // bookmarks private MapHeap bookmarks; @@ -75,9 +58,6 @@ public class bookmarksDB { // tags private ConcurrentHashMap tags; - // autoReCrawl - private final BusyThread autoReCrawl; - private BookmarkDate dates; // ------------------------------------ @@ -120,15 +100,6 @@ public class bookmarksDB { //this.datesTable = new MapView(BLOBTree.toHeap(datesFile, true, true, 20, 256, '_', NaturalOrder.naturalOrder, datesFileNew), 500, '_'); this.dates = new BookmarkDate(datesFile); if (!datesExisted) this.dates.init(new bookmarkIterator(true)); - - // autoReCrawl - final Switchboard sb = Switchboard.getSwitchboard(); - this.autoReCrawl = new InstantBusyThread(this, "autoReCrawl", null, null, Long.MIN_VALUE, Long.MAX_VALUE, Long.MIN_VALUE, Long.MAX_VALUE); - final long sleepTime = Long.parseLong(sb.getConfig("autoReCrawl_idlesleep" , SLEEP_TIME)); - sb.deployThread("autoReCrawl", "autoReCrawl Scheduler", "simple scheduler for automatic 
re-crawls of bookmarked urls", null, autoReCrawl, 120000, - sleepTime, sleepTime, Long.parseLong(sb.getConfig("autoReCrawl_memprereq" , "-1")) - ); - Log.logInfo("BOOKMARKS", "autoReCrawl - serverBusyThread initialized checking every "+(sleepTime/1000/60)+" minutes for recrawls"); } // ----------------------------------------------------- @@ -141,163 +112,6 @@ public class bookmarksDB { dates.close(); } - // ----------------------------------------------------- - // bookmarksDB's functions for autoReCrawl - // ----------------------------------------------------- - - public boolean autoReCrawl() { - - // read crontab - final File file = new File (Switchboard.getSwitchboard().getRootPath(),"DATA/SETTINGS/autoReCrawl.conf"); - String s; - try { - final BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(file))); - Log.logInfo("BOOKMARKS", "autoReCrawl - reading schedules from " + file); - while( null != (s = in.readLine()) ) { - if (s.length() > 0 && s.charAt(0) != '#') { - final String parser[] = s.split("\t"); - if (parser.length == 13) { - folderReCrawl(Long.parseLong(parser[0]), parser[1], parser[2], Integer.parseInt(parser[3]), Long.parseLong(parser[4]), - Integer.parseInt(parser[5]), Integer.parseInt(parser[6]), Boolean.parseBoolean(parser[7]), - Boolean.parseBoolean(parser[8]), Boolean.parseBoolean(parser[9]), - Boolean.parseBoolean(parser[10]), Boolean.parseBoolean(parser[11]), - Boolean.parseBoolean(parser[12]), CrawlProfile.CacheStrategy.IFFRESH - ); - } - if (parser.length == 14) { - folderReCrawl(Long.parseLong(parser[0]), parser[1], parser[2], Integer.parseInt(parser[3]), Long.parseLong(parser[4]), - Integer.parseInt(parser[5]), Integer.parseInt(parser[6]), Boolean.parseBoolean(parser[7]), - Boolean.parseBoolean(parser[8]), Boolean.parseBoolean(parser[9]), - Boolean.parseBoolean(parser[10]), Boolean.parseBoolean(parser[11]), - Boolean.parseBoolean(parser[12]), CrawlProfile.CacheStrategy.decode(Integer.parseInt(parser[13])) - ); - } - } - } - in.close(); - } catch( FileNotFoundException ex ) { - try { - Log.logInfo("BOOKMARKS", "autoReCrawl - creating new autoReCrawl.conf"); - final File inputFile = new File(Switchboard.getSwitchboard().getRootPath(),"defaults/autoReCrawl.conf"); - final File outputFile = new File(Switchboard.getSwitchboard().getRootPath(),"DATA/SETTINGS/autoReCrawl.conf"); - final FileReader i = new FileReader(inputFile); - final FileWriter o = new FileWriter(outputFile); - int c; - while ((c = i.read()) != -1) { - o.write(c); - } - i.close(); - o.close(); - autoReCrawl(); - return true; - } catch( FileNotFoundException e ) { - Log.logSevere("BOOKMARKS", "autoReCrawl - file not found error: defaults/autoReCrawl.conf", e); - return false; - } catch (IOException e) { - Log.logSevere("BOOKMARKS", "autoReCrawl - IOException: defaults/autoReCrawl.conf", e); - return false; - } - } catch( Exception ex ) { - Log.logSevere("BOOKMARKS", "autoReCrawl - error reading " + file, ex); - return false; - } - return true; - } - - public void folderReCrawl(long schedule, String folder, String crawlingfilter, int newcrawlingdepth, long crawlingIfOlder, - int crawlingDomFilterDepth, int crawlingDomMaxPages, boolean crawlingQ, boolean indexText, boolean indexMedia, - boolean crawlOrder, boolean xsstopw, boolean storeHTCache, CrawlProfile.CacheStrategy cacheStrategy) { - - final Switchboard sb = Switchboard.getSwitchboard(); - final Iterator bit = getBookmarksIterator(folder, true); - Log.logInfo("BOOKMARKS", "autoReCrawl - processing: "+folder); - - 
final boolean xdstopw = xsstopw; - final boolean xpstopw = xsstopw; - - while(bit.hasNext()) { - - final Bookmark bm = getBookmark(bit.next()); - final long sleepTime = Long.parseLong(sb.getConfig("autoReCrawl_idlesleep" , SLEEP_TIME)); - final long interTime = (System.currentTimeMillis()-bm.getTimeStamp())%schedule; - - final Date date = new Date(bm.getTimeStamp()); - Log.logInfo("BOOKMARKS", "autoReCrawl - checking schedule for: "+"["+DateFormatter.formatISO8601(date)+"] "+bm.getUrl()); - - if (interTime >= 0 && interTime < sleepTime) { - try { - int pos = 0; - // set crawlingStart to BookmarkUrl - final String crawlingStart = bm.getUrl(); - String newcrawlingMustMatch = crawlingfilter; - - final DigestURI crawlingStartURL = new DigestURI(crawlingStart, null); - - // set the crawling filter - if (newcrawlingMustMatch.length() < 2) newcrawlingMustMatch = ".*"; // avoid that all urls are filtered out if bad value was submitted - - if (crawlingStartURL!= null && newcrawlingMustMatch.equals("dom")) { - newcrawlingMustMatch = ".*" + crawlingStartURL.getHost() + ".*"; - } - if (crawlingStart!= null && newcrawlingMustMatch.equals("sub") && (pos = crawlingStart.lastIndexOf("/")) > 0) { - newcrawlingMustMatch = crawlingStart.substring(0, pos + 1) + ".*"; - } - - // check if the crawl filter works correctly - Pattern.compile(newcrawlingMustMatch); - - final byte[] urlhash = crawlingStartURL.hash(); - - sb.indexSegments.urlMetadata(Segments.Process.LOCALCRAWLING).remove(urlhash); - sb.crawlQueues.noticeURL.removeByURLHash(urlhash); - sb.crawlQueues.errorURL.remove(urlhash); - - // stack url - sb.crawler.profilesPassiveCrawls.removeEntry(crawlingStartURL.hash()); // if there is an old entry, delete it - final CrawlProfile.entry pe = sb.crawler.profilesActiveCrawls.newEntry( - folder+"/"+crawlingStartURL, crawlingStartURL, - newcrawlingMustMatch, - CrawlProfile.MATCH_BAD_URL, - newcrawlingdepth, - sb.crawler.profilesActiveCrawls.getRecrawlDate(crawlingIfOlder), crawlingDomFilterDepth, crawlingDomMaxPages, - crawlingQ, - indexText, indexMedia, - storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw, cacheStrategy); - sb.crawlStacker.enqueueEntry(new Request( - sb.peers.mySeed().hash.getBytes(), - crawlingStartURL, - null, - "CRAWLING-ROOT", - new Date(), - pe.handle(), - 0, - 0, - 0 - )); - Log.logInfo("BOOKMARKS", "autoReCrawl - adding crawl profile for: " + crawlingStart); - // serverLog.logInfo("BOOKMARKS", "autoReCrawl - crawl filter is set to: " + newcrawlingfilter); - // generate a YaCyNews if the global flag was set - if (crawlOrder) { - Map m = new HashMap(pe.map()); // must be cloned - m.remove("specificDepth"); - m.remove("indexText"); - m.remove("indexMedia"); - m.remove("remoteIndexing"); - m.remove("xsstopw"); - m.remove("xpstopw"); - m.remove("xdstopw"); - m.remove("storeTXCache"); - m.remove("storeHTCache"); - m.remove("generalFilter"); - m.remove("specificFilter"); - m.put("intention", "Automatic ReCrawl!"); - sb.peers.newsPool.publishMyNews(sb.peers.mySeed(), yacyNewsPool.CATEGORY_CRAWL_START, m); - } - } catch (MalformedURLException e1) {} - } // if - } // while(bit.hasNext()) - } // } autoReCrawl() - // ----------------------------------------------------------- // bookmarksDB's functions for bookmarksTable / bookmarkCache // ----------------------------------------------------------- diff --git a/source/de/anomic/search/Switchboard.java b/source/de/anomic/search/Switchboard.java index da84b7ba5..7cf670836 100644 --- a/source/de/anomic/search/Switchboard.java +++ 
b/source/de/anomic/search/Switchboard.java @@ -137,7 +137,6 @@ import de.anomic.data.userDB; import de.anomic.data.wiki.wikiBoard; import de.anomic.data.wiki.wikiCode; import de.anomic.data.wiki.wikiParser; -//import de.anomic.http.client.Client; import de.anomic.http.client.Cache; import de.anomic.http.server.HTTPDemon; import de.anomic.http.server.HeaderFramework; @@ -211,7 +210,7 @@ public final class Switchboard extends serverSwitch { public boolean rankingOn; public CRDistribution rankingOwnDistribution; public CRDistribution rankingOtherDistribution; - public Map outgoingCookies, incomingCookies; + public Map outgoingCookies, incomingCookies; public volatile long proxyLastAccess, localSearchLastAccess, remoteSearchLastAccess; public yacyCore yc; public ResourceObserver observer; @@ -608,7 +607,7 @@ public final class Switchboard extends serverSwitch { SwitchboardConstants.CLEANUP_METHOD_JOBCOUNT, SwitchboardConstants.CLEANUP_METHOD_FREEMEM, 60000, Long.MAX_VALUE, 10000, Long.MAX_VALUE), - 600000); // all 5 Minutes, wait 10 minutes until first run + 60000); // all 5 Minutes, wait 1 minute until first run deployThread(SwitchboardConstants.SURROGATES, "Surrogates", "A thread that polls the SURROGATES path and puts all Documents in one surroagte file into the indexing queue.", null, new InstantBusyThread( this, diff --git a/source/net/yacy/kelondro/blob/Tables.java b/source/net/yacy/kelondro/blob/Tables.java index 821e89113..b3d243b2d 100644 --- a/source/net/yacy/kelondro/blob/Tables.java +++ b/source/net/yacy/kelondro/blob/Tables.java @@ -433,6 +433,17 @@ public class Tables { return dflt; } } + + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append('{'); + for (Map.Entry entry: this.entrySet()) { + sb.append(entry.getKey()).append('=').append(new String(entry.getValue())).append(", "); + } + if (sb.length() > 1) sb.setLength(sb.length() - 2); + sb.append('}'); + return sb.toString(); + } } public class Row extends Data {
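Illustration (not part of the patch): the commit message states that the 'classic' re-crawl time is set automatically when the scheduled re-crawl is selected. The minimal Java sketch below mirrors the mapping added in the Crawler_p.java hunk above; the class and method names are invented for this example, while the form unit values ("selminutes", "selhours", "seldays") and the arithmetic are taken from the patch.

public class ScheduledRecrawlSketch {

    // Mirrors the "scheduler" branch: the re-crawl age threshold is roughly half
    // of the repeat interval, expressed in hours, so that a repeated crawl does
    // not drop every already-known URL as a double.
    static int ifOlderHours(int repeatTime, String repeatUnit) {
        if ("selminutes".equals(repeatUnit)) return 1;
        if ("selhours".equals(repeatUnit))   return repeatTime / 2;
        return repeatTime * 12; // "seldays": repeat every N days -> re-load URLs older than N*12 hours
    }

    public static void main(String[] args) {
        System.out.println(ifOlderHours(2, "seldays"));     // 24: two-day schedule, 24-hour threshold
        System.out.println(ifOlderHours(6, "selhours"));    // 3
        System.out.println(ifOlderHours(30, "selminutes")); // 1
    }
}

For example, a crawl repeated every 2 days gets a crawlingIfOlder threshold of 24 hours, so the next scheduled run re-loads the known pages instead of treating them all as doubles.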
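The scheduling itself is driven by WorkTables.calculateAPIScheduler (patched above), which derives the next execution date from the recorded schedule time and unit. The sketch below reproduces that computation as a stand-alone method under the assumptions visible in the hunk: minute schedules are clamped to at least 10 minutes, an overdue date is pushed 10 minutes into the future, and seconds are truncated so scheduled calls align on full minutes. The class and method names are again invented for illustration.

import java.util.Date;

public class ApiSchedulerSketch {

    // base is the last execution date (or, on update, the previously planned one).
    static Date nextExecution(Date base, int time, String unit) {
        long d = base.getTime();
        if ("minutes".equals(unit)) d += 60000L * Math.max(10, time);
        if ("hours".equals(unit))   d += 60000L * 60L * time;
        if ("days".equals(unit))    d += 60000L * 60L * 24L * time;
        if (d < System.currentTimeMillis()) d = System.currentTimeMillis() + 600000L; // overdue: retry in 10 minutes
        d -= d % 60000L; // strip the seconds
        return new Date(d);
    }

    public static void main(String[] args) {
        // a crawl start recorded with repeat_time=7 and unit "days" is planned one week ahead
        System.out.println(nextExecution(new Date(), 7, "days"));
    }
}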