diff --git a/defaults/autoReCrawl.conf b/defaults/autoReCrawl.conf deleted file mode 100644 index 217f2b1ba..000000000 --- a/defaults/autoReCrawl.conf +++ /dev/null @@ -1,8 +0,0 @@ -# YaCy autoReCrawl configuration for bookmark folders -# -# schedule|folder|filter|crawlingdepth|crawlingIfOlder|DomFilterDepth|DomMaxPages|crawlingQ|indexText|indexMedia|crawlOrder|xsstopw|storeHTCache -3600000 /autoReCrawl/hourly .* 1 59 -1 -1 true true true true false false -86400000 /autoReCrawl/daily .* 3 1439 -1 -1 true true true true false false -604800000 /autoReCrawl/weekly .* 3 10079 -1 -1 true true true true false false -2678400000 /autoReCrawl/monthly .* 4 44639 -1 -1 true true true true false false -# eof diff --git a/defaults/yacy.init b/defaults/yacy.init index 54b6f5af9..671341eae 100644 --- a/defaults/yacy.init +++ b/defaults/yacy.init @@ -579,11 +579,6 @@ filterOutStopwordsFromTopwords=true 90_cleanup_busysleep=300000 90_cleanup_memprereq=0 -# autoReCrawl Options -autoReCrawl_idlesleep = 3600000 -autoReCrawl_busysleep = 3600000 -autoReCrawl_memprereq = -1 - # additional attributes: # performanceIO is a percent-value. a value of 10 means, that 10% of the busysleep time # is used to flush the RAM cache, which is the major part of the IO in YaCy diff --git a/htroot/CrawlStart_p.html b/htroot/CrawlStart_p.html index 56e036261..0ee51c9ae 100644 --- a/htroot/CrawlStart_p.html +++ b/htroot/CrawlStart_p.html @@ -54,7 +54,7 @@ : - + @@ -70,42 +70,65 @@ Other already visited URLs are sorted out as "double", if they are not allowed using the re-crawl option. - - Create Bookmark - - : - -    (works with "Starting Point: From URL" only) -

- :    -

- : - -
  - - - This option lets you create a bookmark from your crawl start URL. For automatic re-crawling you can use the following default folders:
- - Attention: recrawl settings depend on the folder. They can be adjusted in /DATA/SETTINGS/autoReCrawl.conf. - - : - This defines how often the Crawler will follow links embedded in websites.
- A minimum of 0 is recommended and means that the page you enter under "Starting Point" will be added - to the index, but no linked content is indexed. 2-4 is good for normal indexing. - Be careful with the depth. Consider a branching factor of average 20; - A prefetch-depth of 8 would index 25.600.000.000 pages, maybe this is the whole WWW. + This defines how often the Crawler will follow links (of links..) embedded in websites. + 0 means that only the page you enter under "Starting Point" will be added + to the index. 2-4 is good for normal indexing. Values over 8 are not useful, since a depth-8 crawl will + index approximately 25.600.000.000 pages, maybe this is the whole WWW. + Scheduled re-crawl + +
+
no doubles
+
run this crawl once and never load any page that is already known; only the start URL may be loaded again.
+
re-load
+
run this crawl once, but treat urls that are known since
+ + not as double and load them again. No scheduled re-crawl. +
+
scheduled
+
after starting this crawl, repeat the crawl every
+ + automatically. +
+
+ + + A web crawl performs a double-check on all links found in the internet against the internal database. If the same url is found again, + then the url is treated as double when you check the 'no doubles' option. A url may be loaded again when it has reached a specific age, + to use that check the 'once' option. When you want that this web crawl is repeated automatically, then check the 'scheduled' option. + In this case the crawl is repeated after the given time and no url from the previous crawl is omitted as double. + + + : Use filter   @@ -132,26 +155,6 @@ - Re-crawl known URLs: - - : -    - : - - - - - If you use this option, web pages that are already existent in your database are crawled and indexed again. - It depends on the age of the last crawl if this is done or not: if the last crawl is older than the given - date, the page is crawled again, otherwise it is treated as 'double' and not loaded or indexed again. - - - Auto-Dom-Filter: : @@ -167,7 +170,7 @@ The default value 0 gives no restrictions. - + Maximum Pages per Domain: : @@ -181,7 +184,7 @@ the given depth. Domains outside the given depth are then sorted-out anyway. - + : @@ -189,7 +192,7 @@ is accessed with URLs containing question marks. If you are unsure, do not check this to avoid crawl loops. - + : @@ -275,6 +278,23 @@ --> + + Create Bookmark + + : + +    (works with "Starting Point: From URL" only) +

+ :    +

+ : + +
  + + + This option lets you create a bookmark from your crawl start URL. + + diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java index 6d3d0e090..f918f354e 100644 --- a/htroot/Crawler_p.java +++ b/htroot/Crawler_p.java @@ -140,9 +140,6 @@ public class Crawler_p { try {crawlingStartURL = new DigestURI(crawlingStart, null);} catch (final MalformedURLException e1) {} crawlingStart = (crawlingStartURL == null) ? null : crawlingStartURL.toNormalform(true, true); - // store this call as api call - sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + crawlingStart); - // set new properties final boolean fullDomain = post.get("range", "wide").equals("domain"); // special property in simple crawl start final boolean subPath = post.get("range", "wide").equals("subpath"); // special property in simple crawl start @@ -167,12 +164,37 @@ public class Crawler_p { env.setConfig("crawlingDepth", Integer.toString(newcrawlingdepth)); if ((crawlOrder) && (newcrawlingdepth > 8)) newcrawlingdepth = 8; - final boolean crawlingIfOlderCheck = post.get("crawlingIfOlderCheck", "off").equals("on"); - final int crawlingIfOlderNumber = Integer.parseInt(post.get("crawlingIfOlderNumber", "-1")); - final String crawlingIfOlderUnit = post.get("crawlingIfOlderUnit","year"); - final long crawlingIfOlder = recrawlIfOlderC(crawlingIfOlderCheck, crawlingIfOlderNumber, crawlingIfOlderUnit); - env.setConfig("crawlingIfOlder", crawlingIfOlder); + // recrawl + final String recrawl = post.get("recrawl", "nodoubles"); // nodoubles, reload, scheduler + boolean crawlingIfOlderCheck = post.get("crawlingIfOlderCheck", "off").equals("on"); + int crawlingIfOlderNumber = Integer.parseInt(post.get("crawlingIfOlderNumber", "-1")); + String crawlingIfOlderUnit = post.get("crawlingIfOlderUnit","year"); // year, month, day, hour + int repeat_time = Integer.parseInt(post.get("repeat_time", "-1")); + final String repeat_unit = post.get("repeat_unit", "seldays"); // 
selminutes, selhours, seldays + if (recrawl.equals("scheduler")) { + // set crawlingIfOlder attributes that are appropriate for scheduled crawling + crawlingIfOlderCheck = true; + crawlingIfOlderNumber = repeat_unit.equals("selminutes") ? 1 : repeat_unit.equals("selhours") ? repeat_time / 2 : repeat_time * 12; + crawlingIfOlderUnit = "hour"; + } else if (recrawl.equals("reload")) { + repeat_time = -1; + crawlingIfOlderCheck = true; + } else if (recrawl.equals("nodoubles")) { + repeat_time = -1; + crawlingIfOlderCheck = false; + } + long crawlingIfOlder = recrawlIfOlderC(crawlingIfOlderCheck, crawlingIfOlderNumber, crawlingIfOlderUnit); + env.setConfig("crawlingIfOlder", crawlingIfOlder); + + // store this call as api call + if (repeat_time > 0) { + // store as scheduled api call + sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + crawlingStart, repeat_time, repeat_unit.substring(3)); + } else { + // store just a protocol + sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + crawlingStart); + } final boolean crawlingDomFilterCheck = post.get("crawlingDomFilterCheck", "off").equals("on"); final int crawlingDomFilterDepth = (crawlingDomFilterCheck) ? 
Integer.parseInt(post.get("crawlingDomFilterDepth", "-1")) : -1; env.setConfig("crawlingDomFilterDepth", Integer.toString(crawlingDomFilterDepth)); diff --git a/htroot/PerformanceQueues_p.java b/htroot/PerformanceQueues_p.java index 8ae423edc..8a5efe424 100644 --- a/htroot/PerformanceQueues_p.java +++ b/htroot/PerformanceQueues_p.java @@ -207,13 +207,12 @@ public class PerformanceQueues_p { busysleep = sb.getConfigLong(threadName + "_busysleep", busysleep); } if (setProfile) { - if (threadName.equals(SwitchboardConstants.PEER_PING) - || threadName.equals(SwitchboardConstants.SEED_UPLOAD) - || threadName.equals(SwitchboardConstants.CLEANUP) - || threadName.equals("autoReCrawl") - ) { /* do not change any values */ } - else if (threadName.equals(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER) - || threadName.equals(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL)) { + if (threadName.equals(SwitchboardConstants.PEER_PING) || + threadName.equals(SwitchboardConstants.SEED_UPLOAD) || + threadName.equals(SwitchboardConstants.CLEANUP)) { + /* do not change any values */ + } else if (threadName.equals(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER) || + threadName.equals(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL)) { sb.setRemotecrawlPPM(Math.max(1, (int) (sb.getConfigLong("network.unit.remotecrawl.speed", 60) / multiplier))); } else { diff --git a/htroot/QuickCrawlLink_p.java b/htroot/QuickCrawlLink_p.java index bf8de8c21..6d01dae62 100644 --- a/htroot/QuickCrawlLink_p.java +++ b/htroot/QuickCrawlLink_p.java @@ -109,7 +109,7 @@ public class QuickCrawlLink_p { // get other parameters if set final String crawlingMustMatch = post.get("mustmatch", CrawlProfile.MATCH_ALL); - final String crawlingMustNotMatch = post.get("mustnotmatch", CrawlProfile.MATCH_BAD_URL); + final String crawlingMustNotMatch = post.get("mustnotmatch", CrawlProfile.MATCH_NEVER); final int CrawlingDepth = Integer.parseInt(post.get("crawlingDepth", "0")); final boolean crawlDynamic = 
post.get("crawlingQ", "").equals("on"); final boolean indexText = post.get("indexText", "on").equals("on"); diff --git a/htroot/Table_API_p.java b/htroot/Table_API_p.java index 21d45b733..b8a6d3675 100644 --- a/htroot/Table_API_p.java +++ b/htroot/Table_API_p.java @@ -62,7 +62,7 @@ public class Table_API_p { if (action.equals("on")) { Tables.Row row = sb.tables.select(WorkTables.TABLE_API_NAME, pk.getBytes()); if (row != null) { - row.put(WorkTables.TABLE_API_COL_APICALL_SCHEDULE_TIME, 1); + row.put(WorkTables.TABLE_API_COL_APICALL_SCHEDULE_TIME, 7); row.put(WorkTables.TABLE_API_COL_APICALL_SCHEDULE_UNIT, "days"); WorkTables.calculateAPIScheduler(row, false); sb.tables.update(WorkTables.TABLE_API_NAME, row); diff --git a/source/de/anomic/crawler/CrawlProfile.java b/source/de/anomic/crawler/CrawlProfile.java index 6d3b84174..4fb53ccc5 100644 --- a/source/de/anomic/crawler/CrawlProfile.java +++ b/source/de/anomic/crawler/CrawlProfile.java @@ -46,7 +46,6 @@ public class CrawlProfile { public static final String MATCH_ALL = ".*"; public static final String MATCH_NEVER = ""; - public static final String MATCH_BAD_URL = ".*memberlist.*|.*previous.*|.*next.*|.*p=.*"; static ConcurrentHashMap> domsCache = new ConcurrentHashMap>(); diff --git a/source/de/anomic/crawler/CrawlSwitchboard.java b/source/de/anomic/crawler/CrawlSwitchboard.java index 4be91cdc3..313d56980 100644 --- a/source/de/anomic/crawler/CrawlSwitchboard.java +++ b/source/de/anomic/crawler/CrawlSwitchboard.java @@ -163,7 +163,7 @@ public final class CrawlSwitchboard { if (this.defaultProxyProfile == null) { // generate new default entry for proxy crawling - this.defaultProxyProfile = this.profilesActiveCrawls.newEntry("proxy", null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_BAD_URL, + this.defaultProxyProfile = this.profilesActiveCrawls.newEntry("proxy", null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0 /*Integer.parseInt(getConfig(PROXY_PREFETCH_DEPTH, "0"))*/, 
this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE), -1, -1, false, true /*getConfigBool(PROXY_INDEXING_LOCAL_TEXT, true)*/, @@ -174,33 +174,33 @@ public final class CrawlSwitchboard { } if (this.defaultRemoteProfile == null) { // generate new default entry for remote crawling - defaultRemoteProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_REMOTE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_BAD_URL, 0, + defaultRemoteProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_REMOTE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, -1, -1, -1, true, true, true, false, true, false, true, true, false, CrawlProfile.CacheStrategy.IFFRESH); } if (this.defaultTextSnippetLocalProfile == null) { // generate new default entry for snippet fetch and optional crawling - defaultTextSnippetLocalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_BAD_URL, 0, + defaultTextSnippetLocalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, -1, true, false, false, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFFRESH); } if (this.defaultTextSnippetGlobalProfile == null) { // generate new default entry for snippet fetch and optional crawling - defaultTextSnippetGlobalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_BAD_URL, 0, + defaultTextSnippetGlobalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), -1, -1, true, true, true, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST); } 
this.defaultTextSnippetGlobalProfile.setCacheStrategy(CacheStrategy.IFEXIST); if (this.defaultMediaSnippetLocalProfile == null) { // generate new default entry for snippet fetch and optional crawling - defaultMediaSnippetLocalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_BAD_URL, 0, + defaultMediaSnippetLocalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE), -1, -1, true, false, false, true, false, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST); } if (this.defaultMediaSnippetGlobalProfile == null) { // generate new default entry for snippet fetch and optional crawling - defaultMediaSnippetGlobalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_BAD_URL, 0, + defaultMediaSnippetGlobalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), -1, -1, true, false, true, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST); } if (this.defaultSurrogateProfile == null) { // generate new default entry for surrogate parsing - defaultSurrogateProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SURROGATE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_BAD_URL, 0, + defaultSurrogateProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SURROGATE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE), -1, -1, true, true, false, false, false, false, true, true, false, CrawlProfile.CacheStrategy.NOCACHE); } } diff --git 
a/source/de/anomic/data/SitemapParser.java b/source/de/anomic/data/SitemapParser.java index eb331a688..01877bbdb 100644 --- a/source/de/anomic/data/SitemapParser.java +++ b/source/de/anomic/data/SitemapParser.java @@ -315,7 +315,7 @@ public class SitemapParser extends DefaultHandler { return this.sb.crawler.profilesActiveCrawls.newEntry( domainName, sitemapURL, // crawling Filter - CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_BAD_URL, + CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, // Depth 0, // force recrawling diff --git a/source/de/anomic/data/WorkTables.java b/source/de/anomic/data/WorkTables.java index c1a6df2fd..e1d8c0f74 100644 --- a/source/de/anomic/data/WorkTables.java +++ b/source/de/anomic/data/WorkTables.java @@ -77,13 +77,6 @@ public class WorkTables extends Tables { public void recordAPICall(final serverObjects post, final String servletName, final String type, final String comment) { // remove the apicall attributes from the post object String pk = post.remove(TABLE_API_COL_APICALL_PK); - String count = post.remove(TABLE_API_COL_APICALL_COUNT); - if (count == null) count = "1"; - String time = post.remove(TABLE_API_COL_APICALL_SCHEDULE_TIME); - String unit = post.remove(TABLE_API_COL_APICALL_SCHEDULE_UNIT); - if (time == null || unit == null || unit.length() == 0 || "minutes,hours,days".indexOf(unit) < 0) { - time = ""; unit = ""; - } // generate the apicall url - without the apicall attributes final String apiurl = /*"http://localhost:" + getConfig("port", "8080") +*/ "/" + servletName + "?" 
+ post.toString(); @@ -100,20 +93,7 @@ public class WorkTables extends Tables { // insert or update entry try { - if (row != null) { - // modify and update existing entry - - // modify date attributes and patch old values - row.put(TABLE_API_COL_DATE_LAST_EXEC, DateFormatter.formatShortMilliSecond(new Date()).getBytes()); - if (!row.containsKey(TABLE_API_COL_DATE_RECORDING)) row.put(TABLE_API_COL_DATE_RECORDING, row.get(TABLE_API_COL_DATE)); - row.remove(TABLE_API_COL_DATE); - - // insert APICALL attributes - row.put(TABLE_API_COL_APICALL_COUNT, count.getBytes()); - row.put(TABLE_API_COL_APICALL_SCHEDULE_TIME, time.getBytes()); - row.put(TABLE_API_COL_APICALL_SCHEDULE_UNIT, unit.getBytes()); - super.update(TABLE_API_NAME, row); - } else { + if (row == null) { // create and insert new entry Data data = new Data(); data.put(TABLE_API_COL_TYPE, type.getBytes()); @@ -124,10 +104,19 @@ public class WorkTables extends Tables { data.put(TABLE_API_COL_URL, apiurl.getBytes()); // insert APICALL attributes - data.put(TABLE_API_COL_APICALL_COUNT, count.getBytes()); - data.put(TABLE_API_COL_APICALL_SCHEDULE_TIME, time.getBytes()); - data.put(TABLE_API_COL_APICALL_SCHEDULE_UNIT, unit.getBytes()); + data.put(TABLE_API_COL_APICALL_COUNT, "1"); super.insert(TABLE_API_NAME, data); + } else { + // modify and update existing entry + + // modify date attributes and patch old values + row.put(TABLE_API_COL_DATE_LAST_EXEC, DateFormatter.formatShortMilliSecond(new Date()).getBytes()); + if (!row.containsKey(TABLE_API_COL_DATE_RECORDING)) row.put(TABLE_API_COL_DATE_RECORDING, row.get(TABLE_API_COL_DATE)); + row.remove(TABLE_API_COL_DATE); + + // insert APICALL attributes + row.put(TABLE_API_COL_APICALL_COUNT, row.get(TABLE_API_COL_APICALL_COUNT, 1) + 1); + super.update(TABLE_API_NAME, row); } } catch (IOException e) { Log.logException(e); @@ -137,6 +126,56 @@ public class WorkTables extends Tables { Log.logInfo("APICALL", apiurl); } + /** + * store a API call and set attributes to 
schedule a re-call of that API call according to a given frequence + * This is the same as the previous method but it also computes a re-call time and stores that additionally + * @param post the post arguments of the api call + * @param servletName the name of the servlet + * @param type name of the servlet category + * @param comment visual description of the process + * @param time the time until next scheduled execution of this api call + * @param unit the time unit for the scheduled call + */ + public void recordAPICall(final serverObjects post, final String servletName, final String type, final String comment, int time, String unit) { + if (post.containsKey(TABLE_API_COL_APICALL_PK)) { + // this api call has already been stored somewhere. + recordAPICall(post, servletName, type, comment); + return; + } + if (time < 0 || unit == null || unit.length() == 0 || "minutes,hours,days".indexOf(unit) < 0) { + time = 0; unit = ""; + } else { + if (unit.equals("minutes") && time < 10) time = 10; + } + + // generate the apicall url - without the apicall attributes + final String apiurl = /*"http://localhost:" + getConfig("port", "8080") +*/ "/" + servletName + "?" 
+ post.toString(); + + // insert entry + try { + // create and insert new entry + Data data = new Data(); + data.put(TABLE_API_COL_TYPE, type.getBytes()); + data.put(TABLE_API_COL_COMMENT, comment.getBytes()); + byte[] date = DateFormatter.formatShortMilliSecond(new Date()).getBytes(); + data.put(TABLE_API_COL_DATE_RECORDING, date); + data.put(TABLE_API_COL_DATE_LAST_EXEC, date); + data.put(TABLE_API_COL_URL, apiurl.getBytes()); + + // insert APICALL attributes + data.put(TABLE_API_COL_APICALL_COUNT, "1".getBytes()); + data.put(TABLE_API_COL_APICALL_SCHEDULE_TIME, Integer.toString(time).getBytes()); + data.put(TABLE_API_COL_APICALL_SCHEDULE_UNIT, unit.getBytes()); + calculateAPIScheduler(data, false); // set next execution time + super.insert(TABLE_API_NAME, data); + } catch (IOException e) { + Log.logException(e); + } catch (RowSpaceExceededException e) { + Log.logException(e); + } + Log.logInfo("APICALL", apiurl); + } + /** * execute an API call using a api table row which contains all essentials * to access the server also the host, port and the authentication realm must be given @@ -164,9 +203,6 @@ public class WorkTables extends Tables { if (row == null) continue; String url = "http://" + host + ":" + port + new String(row.get(WorkTables.TABLE_API_COL_URL)); url += "&" + WorkTables.TABLE_API_COL_APICALL_PK + "=" + new String(row.getPK()); - url += "&" + WorkTables.TABLE_API_COL_APICALL_COUNT + "=" + (row.get(WorkTables.TABLE_API_COL_APICALL_COUNT, 1) + 1); - url += "&" + WorkTables.TABLE_API_COL_APICALL_SCHEDULE_TIME + "=" + row.get(WorkTables.TABLE_API_COL_APICALL_SCHEDULE_TIME, ""); - url += "&" + WorkTables.TABLE_API_COL_APICALL_SCHEDULE_UNIT + "=" + row.get(WorkTables.TABLE_API_COL_APICALL_SCHEDULE_UNIT, ""); try { client.GETbytes(url); l.put(url, client.getStatusCode()); @@ -197,8 +233,9 @@ public class WorkTables extends Tables { /** * calculate the execution time in a api call table based on given scheduling time and last execution time * @param row the 
database row in the api table + * @param update if true then the next execution time is based on the latest computed execution time; othervise it is based on the last execution time */ - public static void calculateAPIScheduler(Tables.Row row, boolean update) { + public static void calculateAPIScheduler(Tables.Data row, boolean update) { Date date = row.containsKey(WorkTables.TABLE_API_COL_DATE) ? row.get(WorkTables.TABLE_API_COL_DATE, new Date()) : null; date = update ? row.get(WorkTables.TABLE_API_COL_DATE_NEXT_EXEC, date) : row.get(WorkTables.TABLE_API_COL_DATE_LAST_EXEC, date); int time = row.get(WorkTables.TABLE_API_COL_APICALL_SCHEDULE_TIME, 1); @@ -208,10 +245,11 @@ public class WorkTables extends Tables { } String unit = row.get(WorkTables.TABLE_API_COL_APICALL_SCHEDULE_UNIT, "days"); long d = date.getTime(); - if (unit.equals("minutes")) d += 60000L * time; + if (unit.equals("minutes")) d += 60000L * Math.max(10, time); if (unit.equals("hours")) d += 60000L * 60L * time; if (unit.equals("days")) d += 60000L * 60L * 24L * time; if (d < System.currentTimeMillis()) d = System.currentTimeMillis() + 600000L; + d -= d % 60000; // remove seconds row.put(WorkTables.TABLE_API_COL_DATE_NEXT_EXEC, new Date(d)); } diff --git a/source/de/anomic/data/bookmarksDB.java b/source/de/anomic/data/bookmarksDB.java index 54668dd00..385dfa927 100644 --- a/source/de/anomic/data/bookmarksDB.java +++ b/source/de/anomic/data/bookmarksDB.java @@ -23,18 +23,11 @@ package de.anomic.data; -import java.io.BufferedReader; import java.io.File; -import java.io.FileInputStream; -import java.io.FileNotFoundException; -import java.io.FileReader; -import java.io.FileWriter; import java.io.IOException; -import java.io.InputStreamReader; import java.io.Serializable; import java.net.MalformedURLException; import java.util.Comparator; -import java.util.Date; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; @@ -42,24 +35,15 @@ import java.util.Map; import java.util.Set; 
import java.util.TreeSet; import java.util.concurrent.ConcurrentHashMap; -import java.util.regex.Pattern; import net.yacy.kelondro.blob.MapHeap; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.index.RowSpaceExceededException; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.order.NaturalOrder; -import net.yacy.kelondro.util.DateFormatter; -import net.yacy.kelondro.workflow.BusyThread; -import net.yacy.kelondro.workflow.InstantBusyThread; - -import de.anomic.crawler.CrawlProfile; -import de.anomic.crawler.retrieval.Request; -import de.anomic.search.Segments; -import de.anomic.search.Switchboard; -import de.anomic.yacy.yacyNewsPool; public class bookmarksDB { + // ------------------------------------ // Declaration of Class-Attributes // ------------------------------------ @@ -67,7 +51,6 @@ public class bookmarksDB { //final static int SORT_ALPHA = 1; private final static int SORT_SIZE = 2; private final static int SHOW_ALL = -1; - private final static String SLEEP_TIME = "3600000"; // default sleepTime: check for recrawls every hour // bookmarks private MapHeap bookmarks; @@ -75,9 +58,6 @@ public class bookmarksDB { // tags private ConcurrentHashMap tags; - // autoReCrawl - private final BusyThread autoReCrawl; - private BookmarkDate dates; // ------------------------------------ @@ -120,15 +100,6 @@ public class bookmarksDB { //this.datesTable = new MapView(BLOBTree.toHeap(datesFile, true, true, 20, 256, '_', NaturalOrder.naturalOrder, datesFileNew), 500, '_'); this.dates = new BookmarkDate(datesFile); if (!datesExisted) this.dates.init(new bookmarkIterator(true)); - - // autoReCrawl - final Switchboard sb = Switchboard.getSwitchboard(); - this.autoReCrawl = new InstantBusyThread(this, "autoReCrawl", null, null, Long.MIN_VALUE, Long.MAX_VALUE, Long.MIN_VALUE, Long.MAX_VALUE); - final long sleepTime = Long.parseLong(sb.getConfig("autoReCrawl_idlesleep" , SLEEP_TIME)); - sb.deployThread("autoReCrawl", "autoReCrawl Scheduler", 
"simple scheduler for automatic re-crawls of bookmarked urls", null, autoReCrawl, 120000, - sleepTime, sleepTime, Long.parseLong(sb.getConfig("autoReCrawl_memprereq" , "-1")) - ); - Log.logInfo("BOOKMARKS", "autoReCrawl - serverBusyThread initialized checking every "+(sleepTime/1000/60)+" minutes for recrawls"); } // ----------------------------------------------------- @@ -141,163 +112,6 @@ public class bookmarksDB { dates.close(); } - // ----------------------------------------------------- - // bookmarksDB's functions for autoReCrawl - // ----------------------------------------------------- - - public boolean autoReCrawl() { - - // read crontab - final File file = new File (Switchboard.getSwitchboard().getRootPath(),"DATA/SETTINGS/autoReCrawl.conf"); - String s; - try { - final BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(file))); - Log.logInfo("BOOKMARKS", "autoReCrawl - reading schedules from " + file); - while( null != (s = in.readLine()) ) { - if (s.length() > 0 && s.charAt(0) != '#') { - final String parser[] = s.split("\t"); - if (parser.length == 13) { - folderReCrawl(Long.parseLong(parser[0]), parser[1], parser[2], Integer.parseInt(parser[3]), Long.parseLong(parser[4]), - Integer.parseInt(parser[5]), Integer.parseInt(parser[6]), Boolean.parseBoolean(parser[7]), - Boolean.parseBoolean(parser[8]), Boolean.parseBoolean(parser[9]), - Boolean.parseBoolean(parser[10]), Boolean.parseBoolean(parser[11]), - Boolean.parseBoolean(parser[12]), CrawlProfile.CacheStrategy.IFFRESH - ); - } - if (parser.length == 14) { - folderReCrawl(Long.parseLong(parser[0]), parser[1], parser[2], Integer.parseInt(parser[3]), Long.parseLong(parser[4]), - Integer.parseInt(parser[5]), Integer.parseInt(parser[6]), Boolean.parseBoolean(parser[7]), - Boolean.parseBoolean(parser[8]), Boolean.parseBoolean(parser[9]), - Boolean.parseBoolean(parser[10]), Boolean.parseBoolean(parser[11]), - Boolean.parseBoolean(parser[12]), 
CrawlProfile.CacheStrategy.decode(Integer.parseInt(parser[13])) - ); - } - } - } - in.close(); - } catch( FileNotFoundException ex ) { - try { - Log.logInfo("BOOKMARKS", "autoReCrawl - creating new autoReCrawl.conf"); - final File inputFile = new File(Switchboard.getSwitchboard().getRootPath(),"defaults/autoReCrawl.conf"); - final File outputFile = new File(Switchboard.getSwitchboard().getRootPath(),"DATA/SETTINGS/autoReCrawl.conf"); - final FileReader i = new FileReader(inputFile); - final FileWriter o = new FileWriter(outputFile); - int c; - while ((c = i.read()) != -1) { - o.write(c); - } - i.close(); - o.close(); - autoReCrawl(); - return true; - } catch( FileNotFoundException e ) { - Log.logSevere("BOOKMARKS", "autoReCrawl - file not found error: defaults/autoReCrawl.conf", e); - return false; - } catch (IOException e) { - Log.logSevere("BOOKMARKS", "autoReCrawl - IOException: defaults/autoReCrawl.conf", e); - return false; - } - } catch( Exception ex ) { - Log.logSevere("BOOKMARKS", "autoReCrawl - error reading " + file, ex); - return false; - } - return true; - } - - public void folderReCrawl(long schedule, String folder, String crawlingfilter, int newcrawlingdepth, long crawlingIfOlder, - int crawlingDomFilterDepth, int crawlingDomMaxPages, boolean crawlingQ, boolean indexText, boolean indexMedia, - boolean crawlOrder, boolean xsstopw, boolean storeHTCache, CrawlProfile.CacheStrategy cacheStrategy) { - - final Switchboard sb = Switchboard.getSwitchboard(); - final Iterator bit = getBookmarksIterator(folder, true); - Log.logInfo("BOOKMARKS", "autoReCrawl - processing: "+folder); - - final boolean xdstopw = xsstopw; - final boolean xpstopw = xsstopw; - - while(bit.hasNext()) { - - final Bookmark bm = getBookmark(bit.next()); - final long sleepTime = Long.parseLong(sb.getConfig("autoReCrawl_idlesleep" , SLEEP_TIME)); - final long interTime = (System.currentTimeMillis()-bm.getTimeStamp())%schedule; - - final Date date = new Date(bm.getTimeStamp()); - 
Log.logInfo("BOOKMARKS", "autoReCrawl - checking schedule for: "+"["+DateFormatter.formatISO8601(date)+"] "+bm.getUrl()); - - if (interTime >= 0 && interTime < sleepTime) { - try { - int pos = 0; - // set crawlingStart to BookmarkUrl - final String crawlingStart = bm.getUrl(); - String newcrawlingMustMatch = crawlingfilter; - - final DigestURI crawlingStartURL = new DigestURI(crawlingStart, null); - - // set the crawling filter - if (newcrawlingMustMatch.length() < 2) newcrawlingMustMatch = ".*"; // avoid that all urls are filtered out if bad value was submitted - - if (crawlingStartURL!= null && newcrawlingMustMatch.equals("dom")) { - newcrawlingMustMatch = ".*" + crawlingStartURL.getHost() + ".*"; - } - if (crawlingStart!= null && newcrawlingMustMatch.equals("sub") && (pos = crawlingStart.lastIndexOf("/")) > 0) { - newcrawlingMustMatch = crawlingStart.substring(0, pos + 1) + ".*"; - } - - // check if the crawl filter works correctly - Pattern.compile(newcrawlingMustMatch); - - final byte[] urlhash = crawlingStartURL.hash(); - - sb.indexSegments.urlMetadata(Segments.Process.LOCALCRAWLING).remove(urlhash); - sb.crawlQueues.noticeURL.removeByURLHash(urlhash); - sb.crawlQueues.errorURL.remove(urlhash); - - // stack url - sb.crawler.profilesPassiveCrawls.removeEntry(crawlingStartURL.hash()); // if there is an old entry, delete it - final CrawlProfile.entry pe = sb.crawler.profilesActiveCrawls.newEntry( - folder+"/"+crawlingStartURL, crawlingStartURL, - newcrawlingMustMatch, - CrawlProfile.MATCH_BAD_URL, - newcrawlingdepth, - sb.crawler.profilesActiveCrawls.getRecrawlDate(crawlingIfOlder), crawlingDomFilterDepth, crawlingDomMaxPages, - crawlingQ, - indexText, indexMedia, - storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw, cacheStrategy); - sb.crawlStacker.enqueueEntry(new Request( - sb.peers.mySeed().hash.getBytes(), - crawlingStartURL, - null, - "CRAWLING-ROOT", - new Date(), - pe.handle(), - 0, - 0, - 0 - )); - Log.logInfo("BOOKMARKS", "autoReCrawl - adding 
crawl profile for: " + crawlingStart); - // serverLog.logInfo("BOOKMARKS", "autoReCrawl - crawl filter is set to: " + newcrawlingfilter); - // generate a YaCyNews if the global flag was set - if (crawlOrder) { - Map m = new HashMap(pe.map()); // must be cloned - m.remove("specificDepth"); - m.remove("indexText"); - m.remove("indexMedia"); - m.remove("remoteIndexing"); - m.remove("xsstopw"); - m.remove("xpstopw"); - m.remove("xdstopw"); - m.remove("storeTXCache"); - m.remove("storeHTCache"); - m.remove("generalFilter"); - m.remove("specificFilter"); - m.put("intention", "Automatic ReCrawl!"); - sb.peers.newsPool.publishMyNews(sb.peers.mySeed(), yacyNewsPool.CATEGORY_CRAWL_START, m); - } - } catch (MalformedURLException e1) {} - } // if - } // while(bit.hasNext()) - } // } autoReCrawl() - // ----------------------------------------------------------- // bookmarksDB's functions for bookmarksTable / bookmarkCache // ----------------------------------------------------------- diff --git a/source/de/anomic/search/Switchboard.java b/source/de/anomic/search/Switchboard.java index da84b7ba5..7cf670836 100644 --- a/source/de/anomic/search/Switchboard.java +++ b/source/de/anomic/search/Switchboard.java @@ -137,7 +137,6 @@ import de.anomic.data.userDB; import de.anomic.data.wiki.wikiBoard; import de.anomic.data.wiki.wikiCode; import de.anomic.data.wiki.wikiParser; -//import de.anomic.http.client.Client; import de.anomic.http.client.Cache; import de.anomic.http.server.HTTPDemon; import de.anomic.http.server.HeaderFramework; @@ -211,7 +210,7 @@ public final class Switchboard extends serverSwitch { public boolean rankingOn; public CRDistribution rankingOwnDistribution; public CRDistribution rankingOtherDistribution; - public Map outgoingCookies, incomingCookies; + public Map outgoingCookies, incomingCookies; public volatile long proxyLastAccess, localSearchLastAccess, remoteSearchLastAccess; public yacyCore yc; public ResourceObserver observer; @@ -608,7 +607,7 @@ public final 
class Switchboard extends serverSwitch { SwitchboardConstants.CLEANUP_METHOD_JOBCOUNT, SwitchboardConstants.CLEANUP_METHOD_FREEMEM, 60000, Long.MAX_VALUE, 10000, Long.MAX_VALUE), - 600000); // all 5 Minutes, wait 10 minutes until first run + 60000); // all 5 Minutes, wait 1 minute until first run deployThread(SwitchboardConstants.SURROGATES, "Surrogates", "A thread that polls the SURROGATES path and puts all Documents in one surroagte file into the indexing queue.", null, new InstantBusyThread( this, diff --git a/source/net/yacy/kelondro/blob/Tables.java b/source/net/yacy/kelondro/blob/Tables.java index 821e89113..b3d243b2d 100644 --- a/source/net/yacy/kelondro/blob/Tables.java +++ b/source/net/yacy/kelondro/blob/Tables.java @@ -433,6 +433,17 @@ public class Tables { return dflt; } } + + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append('{'); + for (Map.Entry entry: this.entrySet()) { + sb.append(entry.getKey()).append('=').append(new String(entry.getValue())).append(", "); + } + if (sb.length() > 1) sb.setLength(sb.length() - 2); + sb.append('}'); + return sb.toString(); + } } public class Row extends Data {