diff --git a/defaults/autoReCrawl.conf b/defaults/autoReCrawl.conf
deleted file mode 100644
index 217f2b1ba..000000000
--- a/defaults/autoReCrawl.conf
+++ /dev/null
@@ -1,8 +0,0 @@
-# YaCy autoReCrawl configuration for bookmark folders
-#
-# schedule|folder|filter|crawlingdepth|crawlingIfOlder|DomFilterDepth|DomMaxPages|crawlingQ|indexText|indexMedia|crawlOrder|xsstopw|storeHTCache
-3600000 /autoReCrawl/hourly .* 1 59 -1 -1 true true true true false false
-86400000 /autoReCrawl/daily .* 3 1439 -1 -1 true true true true false false
-604800000 /autoReCrawl/weekly .* 3 10079 -1 -1 true true true true false false
-2678400000 /autoReCrawl/monthly .* 4 44639 -1 -1 true true true true false false
-# eof
diff --git a/defaults/yacy.init b/defaults/yacy.init
index 54b6f5af9..671341eae 100644
--- a/defaults/yacy.init
+++ b/defaults/yacy.init
@@ -579,11 +579,6 @@ filterOutStopwordsFromTopwords=true
90_cleanup_busysleep=300000
90_cleanup_memprereq=0
-# autoReCrawl Options
-autoReCrawl_idlesleep = 3600000
-autoReCrawl_busysleep = 3600000
-autoReCrawl_memprereq = -1
-
# additional attributes:
# performanceIO is a percent-value. a value of 10 means, that 10% of the busysleep time
# is used to flush the RAM cache, which is the major part of the IO in YaCy
diff --git a/htroot/CrawlStart_p.html b/htroot/CrawlStart_p.html
index 56e036261..0ee51c9ae 100644
--- a/htroot/CrawlStart_p.html
+++ b/htroot/CrawlStart_p.html
@@ -54,7 +54,7 @@
:
-
+
@@ -70,42 +70,65 @@
Other already visited URLs are sorted out as "double", if they are not allowed using the re-crawl option.
-
-
Create Bookmark
-
- :
-
- (works with "Starting Point: From URL" only)
-
- :
-
- :
-
-
-
-
- This option lets you create a bookmark from your crawl start URL. For automatic re-crawling you can use the following default folders:
-
-
/autoReCrawl/hourly
-
/autoReCrawl/daily
-
/autoReCrawl/weekly
-
/autoReCrawl/monthly
-
- Attention: recrawl settings depend on the folder. They can be adjusted in /DATA/SETTINGS/autoReCrawl.conf.
-
-
:
- This defines how often the Crawler will follow links embedded in websites.
- A minimum of 0 is recommended and means that the page you enter under "Starting Point" will be added
- to the index, but no linked content is indexed. 2-4 is good for normal indexing.
- Be careful with the depth. Consider a branching factor of average 20;
- A prefetch-depth of 8 would index 25.600.000.000 pages, maybe this is the whole WWW.
+ This defines how deep the Crawler will follow links (and links of links ...) embedded in websites.
+ 0 means that only the page you enter under "Starting Point" will be added
+ to the index. 2-4 is good for normal indexing. Values over 8 are not useful, since a depth-8 crawl will
+ index approximately 25,600,000,000 pages, which might already be the whole WWW.
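+ (This assumes an average of about 20 links per page: 20^8 = 25,600,000,000.)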
+
Scheduled re-crawl
+
+
+
no doubles
+
+ run this crawl once and never load any page that is already known; only the start-url may be loaded again.
+
re-load
+
run this crawl once, but treat urls that are known since
+
+ not as double and load them again. No scheduled re-crawl.
+
+
scheduled
+
after starting this crawl, repeat the crawl every
+
+ automatically.
+
+
+
+
+ A web crawl performs a double-check on all links found on the internet against the internal database. If the same url is found again,
+ the url is treated as a double when the 'no doubles' option is checked. A url that has reached a given age may be loaded again;
+ to use that, check the 're-load' option. If you want this web crawl to be repeated automatically, check the 'scheduled' option.
+ In this case the crawl is repeated after the given time, and no url from the previous crawl is omitted as a double.
+
+
+
:
Use filter
@@ -132,26 +155,6 @@
-
Re-crawl known URLs:
-
- :
-
- :
-
-
-
-
- If you use this option, web pages that are already existent in your database are crawled and indexed again.
- It depends on the age of the last crawl if this is done or not: if the last crawl is older than the given
- date, the page is crawled again, otherwise it is treated as 'double' and not loaded or indexed again.
-
-
-
Auto-Dom-Filter:
:
@@ -167,7 +170,7 @@
The default value 0 gives no restrictions.
-
+
Maximum Pages per Domain:
:
@@ -181,7 +184,7 @@
the given depth. Domains outside the given depth are then sorted-out anyway.
-
+
:
@@ -189,7 +192,7 @@
is accessed with URLs containing question marks. If you are unsure, do not check this to avoid crawl loops.
-
+
:
@@ -275,6 +278,23 @@
-->
+
+
Create Bookmark
+
+ :
+
+ (works with "Starting Point: From URL" only)
+
+ :
+
+ :
+
+
+
+
+ This option lets you create a bookmark from your crawl start URL.
+
+
diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java
index 6d3d0e090..f918f354e 100644
--- a/htroot/Crawler_p.java
+++ b/htroot/Crawler_p.java
@@ -140,9 +140,6 @@ public class Crawler_p {
try {crawlingStartURL = new DigestURI(crawlingStart, null);} catch (final MalformedURLException e1) {}
crawlingStart = (crawlingStartURL == null) ? null : crawlingStartURL.toNormalform(true, true);
- // store this call as api call
- sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + crawlingStart);
-
// set new properties
final boolean fullDomain = post.get("range", "wide").equals("domain"); // special property in simple crawl start
final boolean subPath = post.get("range", "wide").equals("subpath"); // special property in simple crawl start
@@ -167,12 +164,37 @@ public class Crawler_p {
env.setConfig("crawlingDepth", Integer.toString(newcrawlingdepth));
if ((crawlOrder) && (newcrawlingdepth > 8)) newcrawlingdepth = 8;
- final boolean crawlingIfOlderCheck = post.get("crawlingIfOlderCheck", "off").equals("on");
- final int crawlingIfOlderNumber = Integer.parseInt(post.get("crawlingIfOlderNumber", "-1"));
- final String crawlingIfOlderUnit = post.get("crawlingIfOlderUnit","year");
- final long crawlingIfOlder = recrawlIfOlderC(crawlingIfOlderCheck, crawlingIfOlderNumber, crawlingIfOlderUnit);
- env.setConfig("crawlingIfOlder", crawlingIfOlder);
+ // recrawl
+ final String recrawl = post.get("recrawl", "nodoubles"); // nodoubles, reload, scheduler
+ boolean crawlingIfOlderCheck = post.get("crawlingIfOlderCheck", "off").equals("on");
+ int crawlingIfOlderNumber = Integer.parseInt(post.get("crawlingIfOlderNumber", "-1"));
+ String crawlingIfOlderUnit = post.get("crawlingIfOlderUnit","year"); // year, month, day, hour
+ int repeat_time = Integer.parseInt(post.get("repeat_time", "-1"));
+ final String repeat_unit = post.get("repeat_unit", "seldays"); // selminutes, selhours, seldays
+ if (recrawl.equals("scheduler")) {
+ // set crawlingIfOlder attributes that are appropriate for scheduled crawling
+ crawlingIfOlderCheck = true;
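+ // re-load pages that are older than roughly half of the repeat interval, but at least one hour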
+ crawlingIfOlderNumber = repeat_unit.equals("selminutes") ? 1 : repeat_unit.equals("selhours") ? repeat_time / 2 : repeat_time * 12;
+ crawlingIfOlderUnit = "hour";
+ } else if (recrawl.equals("reload")) {
+ repeat_time = -1;
+ crawlingIfOlderCheck = true;
+ } else if (recrawl.equals("nodoubles")) {
+ repeat_time = -1;
+ crawlingIfOlderCheck = false;
+ }
+ long crawlingIfOlder = recrawlIfOlderC(crawlingIfOlderCheck, crawlingIfOlderNumber, crawlingIfOlderUnit);
+ env.setConfig("crawlingIfOlder", crawlingIfOlder);
+
+ // store this call as api call
+ if (repeat_time > 0) {
+ // store as scheduled api call
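+ // repeat_unit.substring(3) strips the 'sel' prefix: 'selminutes'/'selhours'/'seldays' -> 'minutes'/'hours'/'days'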
+ sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + crawlingStart, repeat_time, repeat_unit.substring(3));
+ } else {
+ // store just a protocol
+ sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + crawlingStart);
+ }
final boolean crawlingDomFilterCheck = post.get("crawlingDomFilterCheck", "off").equals("on");
final int crawlingDomFilterDepth = (crawlingDomFilterCheck) ? Integer.parseInt(post.get("crawlingDomFilterDepth", "-1")) : -1;
env.setConfig("crawlingDomFilterDepth", Integer.toString(crawlingDomFilterDepth));
diff --git a/htroot/PerformanceQueues_p.java b/htroot/PerformanceQueues_p.java
index 8ae423edc..8a5efe424 100644
--- a/htroot/PerformanceQueues_p.java
+++ b/htroot/PerformanceQueues_p.java
@@ -207,13 +207,12 @@ public class PerformanceQueues_p {
busysleep = sb.getConfigLong(threadName + "_busysleep", busysleep);
}
if (setProfile) {
- if (threadName.equals(SwitchboardConstants.PEER_PING)
- || threadName.equals(SwitchboardConstants.SEED_UPLOAD)
- || threadName.equals(SwitchboardConstants.CLEANUP)
- || threadName.equals("autoReCrawl")
- ) { /* do not change any values */ }
- else if (threadName.equals(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER)
- || threadName.equals(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL)) {
+ if (threadName.equals(SwitchboardConstants.PEER_PING) ||
+ threadName.equals(SwitchboardConstants.SEED_UPLOAD) ||
+ threadName.equals(SwitchboardConstants.CLEANUP)) {
+ /* do not change any values */
+ } else if (threadName.equals(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER) ||
+ threadName.equals(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL)) {
sb.setRemotecrawlPPM(Math.max(1, (int) (sb.getConfigLong("network.unit.remotecrawl.speed", 60) / multiplier)));
}
else {
diff --git a/htroot/QuickCrawlLink_p.java b/htroot/QuickCrawlLink_p.java
index bf8de8c21..6d01dae62 100644
--- a/htroot/QuickCrawlLink_p.java
+++ b/htroot/QuickCrawlLink_p.java
@@ -109,7 +109,7 @@ public class QuickCrawlLink_p {
// get other parameters if set
final String crawlingMustMatch = post.get("mustmatch", CrawlProfile.MATCH_ALL);
- final String crawlingMustNotMatch = post.get("mustnotmatch", CrawlProfile.MATCH_BAD_URL);
+ final String crawlingMustNotMatch = post.get("mustnotmatch", CrawlProfile.MATCH_NEVER);
final int CrawlingDepth = Integer.parseInt(post.get("crawlingDepth", "0"));
final boolean crawlDynamic = post.get("crawlingQ", "").equals("on");
final boolean indexText = post.get("indexText", "on").equals("on");
diff --git a/htroot/Table_API_p.java b/htroot/Table_API_p.java
index 21d45b733..b8a6d3675 100644
--- a/htroot/Table_API_p.java
+++ b/htroot/Table_API_p.java
@@ -62,7 +62,7 @@ public class Table_API_p {
if (action.equals("on")) {
Tables.Row row = sb.tables.select(WorkTables.TABLE_API_NAME, pk.getBytes());
if (row != null) {
- row.put(WorkTables.TABLE_API_COL_APICALL_SCHEDULE_TIME, 1);
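+ // when scheduling is switched on from the table, default to one run every 7 days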
+ row.put(WorkTables.TABLE_API_COL_APICALL_SCHEDULE_TIME, 7);
row.put(WorkTables.TABLE_API_COL_APICALL_SCHEDULE_UNIT, "days");
WorkTables.calculateAPIScheduler(row, false);
sb.tables.update(WorkTables.TABLE_API_NAME, row);
diff --git a/source/de/anomic/crawler/CrawlProfile.java b/source/de/anomic/crawler/CrawlProfile.java
index 6d3b84174..4fb53ccc5 100644
--- a/source/de/anomic/crawler/CrawlProfile.java
+++ b/source/de/anomic/crawler/CrawlProfile.java
@@ -46,7 +46,6 @@ public class CrawlProfile {
public static final String MATCH_ALL = ".*";
public static final String MATCH_NEVER = "";
- public static final String MATCH_BAD_URL = ".*memberlist.*|.*previous.*|.*next.*|.*p=.*";
static ConcurrentHashMap> domsCache = new ConcurrentHashMap>();
diff --git a/source/de/anomic/crawler/CrawlSwitchboard.java b/source/de/anomic/crawler/CrawlSwitchboard.java
index 4be91cdc3..313d56980 100644
--- a/source/de/anomic/crawler/CrawlSwitchboard.java
+++ b/source/de/anomic/crawler/CrawlSwitchboard.java
@@ -163,7 +163,7 @@ public final class CrawlSwitchboard {
if (this.defaultProxyProfile == null) {
// generate new default entry for proxy crawling
- this.defaultProxyProfile = this.profilesActiveCrawls.newEntry("proxy", null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_BAD_URL,
+ this.defaultProxyProfile = this.profilesActiveCrawls.newEntry("proxy", null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER,
0 /*Integer.parseInt(getConfig(PROXY_PREFETCH_DEPTH, "0"))*/,
this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE), -1, -1, false,
true /*getConfigBool(PROXY_INDEXING_LOCAL_TEXT, true)*/,
@@ -174,33 +174,33 @@ public final class CrawlSwitchboard {
}
if (this.defaultRemoteProfile == null) {
// generate new default entry for remote crawling
- defaultRemoteProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_REMOTE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_BAD_URL, 0,
+ defaultRemoteProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_REMOTE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
-1, -1, -1, true, true, true, false, true, false, true, true, false, CrawlProfile.CacheStrategy.IFFRESH);
}
if (this.defaultTextSnippetLocalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
- defaultTextSnippetLocalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_BAD_URL, 0,
+ defaultTextSnippetLocalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, -1, true, false, false, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFFRESH);
}
if (this.defaultTextSnippetGlobalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
- defaultTextSnippetGlobalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_BAD_URL, 0,
+ defaultTextSnippetGlobalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), -1, -1, true, true, true, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST);
}
this.defaultTextSnippetGlobalProfile.setCacheStrategy(CacheStrategy.IFEXIST);
if (this.defaultMediaSnippetLocalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
- defaultMediaSnippetLocalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_BAD_URL, 0,
+ defaultMediaSnippetLocalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE), -1, -1, true, false, false, true, false, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST);
}
if (this.defaultMediaSnippetGlobalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
- defaultMediaSnippetGlobalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_BAD_URL, 0,
+ defaultMediaSnippetGlobalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), -1, -1, true, false, true, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST);
}
if (this.defaultSurrogateProfile == null) {
// generate new default entry for surrogate parsing
- defaultSurrogateProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SURROGATE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_BAD_URL, 0,
+ defaultSurrogateProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SURROGATE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE), -1, -1, true, true, false, false, false, false, true, true, false, CrawlProfile.CacheStrategy.NOCACHE);
}
}
diff --git a/source/de/anomic/data/SitemapParser.java b/source/de/anomic/data/SitemapParser.java
index eb331a688..01877bbdb 100644
--- a/source/de/anomic/data/SitemapParser.java
+++ b/source/de/anomic/data/SitemapParser.java
@@ -315,7 +315,7 @@ public class SitemapParser extends DefaultHandler {
return this.sb.crawler.profilesActiveCrawls.newEntry(
domainName, sitemapURL,
// crawling Filter
- CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_BAD_URL,
+ CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER,
// Depth
0,
// force recrawling
diff --git a/source/de/anomic/data/WorkTables.java b/source/de/anomic/data/WorkTables.java
index c1a6df2fd..e1d8c0f74 100644
--- a/source/de/anomic/data/WorkTables.java
+++ b/source/de/anomic/data/WorkTables.java
@@ -77,13 +77,6 @@ public class WorkTables extends Tables {
public void recordAPICall(final serverObjects post, final String servletName, final String type, final String comment) {
// remove the apicall attributes from the post object
String pk = post.remove(TABLE_API_COL_APICALL_PK);
- String count = post.remove(TABLE_API_COL_APICALL_COUNT);
- if (count == null) count = "1";
- String time = post.remove(TABLE_API_COL_APICALL_SCHEDULE_TIME);
- String unit = post.remove(TABLE_API_COL_APICALL_SCHEDULE_UNIT);
- if (time == null || unit == null || unit.length() == 0 || "minutes,hours,days".indexOf(unit) < 0) {
- time = ""; unit = "";
- }
// generate the apicall url - without the apicall attributes
final String apiurl = /*"http://localhost:" + getConfig("port", "8080") +*/ "/" + servletName + "?" + post.toString();
@@ -100,20 +93,7 @@ public class WorkTables extends Tables {
// insert or update entry
try {
- if (row != null) {
- // modify and update existing entry
-
- // modify date attributes and patch old values
- row.put(TABLE_API_COL_DATE_LAST_EXEC, DateFormatter.formatShortMilliSecond(new Date()).getBytes());
- if (!row.containsKey(TABLE_API_COL_DATE_RECORDING)) row.put(TABLE_API_COL_DATE_RECORDING, row.get(TABLE_API_COL_DATE));
- row.remove(TABLE_API_COL_DATE);
-
- // insert APICALL attributes
- row.put(TABLE_API_COL_APICALL_COUNT, count.getBytes());
- row.put(TABLE_API_COL_APICALL_SCHEDULE_TIME, time.getBytes());
- row.put(TABLE_API_COL_APICALL_SCHEDULE_UNIT, unit.getBytes());
- super.update(TABLE_API_NAME, row);
- } else {
+ if (row == null) {
// create and insert new entry
Data data = new Data();
data.put(TABLE_API_COL_TYPE, type.getBytes());
@@ -124,10 +104,19 @@ public class WorkTables extends Tables {
data.put(TABLE_API_COL_URL, apiurl.getBytes());
// insert APICALL attributes
- data.put(TABLE_API_COL_APICALL_COUNT, count.getBytes());
- data.put(TABLE_API_COL_APICALL_SCHEDULE_TIME, time.getBytes());
- data.put(TABLE_API_COL_APICALL_SCHEDULE_UNIT, unit.getBytes());
+ data.put(TABLE_API_COL_APICALL_COUNT, "1");
super.insert(TABLE_API_NAME, data);
+ } else {
+ // modify and update existing entry
+
+ // modify date attributes and patch old values
+ row.put(TABLE_API_COL_DATE_LAST_EXEC, DateFormatter.formatShortMilliSecond(new Date()).getBytes());
+ if (!row.containsKey(TABLE_API_COL_DATE_RECORDING)) row.put(TABLE_API_COL_DATE_RECORDING, row.get(TABLE_API_COL_DATE));
+ row.remove(TABLE_API_COL_DATE);
+
+ // insert APICALL attributes
+ row.put(TABLE_API_COL_APICALL_COUNT, row.get(TABLE_API_COL_APICALL_COUNT, 1) + 1);
+ super.update(TABLE_API_NAME, row);
}
} catch (IOException e) {
Log.logException(e);
@@ -137,6 +126,56 @@ public class WorkTables extends Tables {
Log.logInfo("APICALL", apiurl);
}
+ /**
+ * store an API call and set attributes to schedule a re-call of that API call according to a given frequency.
+ * This is the same as the previous method, but it additionally computes a re-call time and stores it.
+ * @param post the post arguments of the api call
+ * @param servletName the name of the servlet
+ * @param type name of the servlet category
+ * @param comment visual description of the process
+ * @param time the time until next scheduled execution of this api call
+ * @param unit the time unit for the scheduled call
+ */
+ public void recordAPICall(final serverObjects post, final String servletName, final String type, final String comment, int time, String unit) {
+ if (post.containsKey(TABLE_API_COL_APICALL_PK)) {
+ // this api call has already been stored somewhere.
+ recordAPICall(post, servletName, type, comment);
+ return;
+ }
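+ // sanitize the schedule: unknown units or negative times disable scheduling; minute-based schedules are raised to at least 10 minutes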
+ if (time < 0 || unit == null || unit.length() == 0 || "minutes,hours,days".indexOf(unit) < 0) {
+ time = 0; unit = "";
+ } else {
+ if (unit.equals("minutes") && time < 10) time = 10;
+ }
+
+ // generate the apicall url - without the apicall attributes
+ final String apiurl = /*"http://localhost:" + getConfig("port", "8080") +*/ "/" + servletName + "?" + post.toString();
+
+ // insert entry
+ try {
+ // create and insert new entry
+ Data data = new Data();
+ data.put(TABLE_API_COL_TYPE, type.getBytes());
+ data.put(TABLE_API_COL_COMMENT, comment.getBytes());
+ byte[] date = DateFormatter.formatShortMilliSecond(new Date()).getBytes();
+ data.put(TABLE_API_COL_DATE_RECORDING, date);
+ data.put(TABLE_API_COL_DATE_LAST_EXEC, date);
+ data.put(TABLE_API_COL_URL, apiurl.getBytes());
+
+ // insert APICALL attributes
+ data.put(TABLE_API_COL_APICALL_COUNT, "1".getBytes());
+ data.put(TABLE_API_COL_APICALL_SCHEDULE_TIME, Integer.toString(time).getBytes());
+ data.put(TABLE_API_COL_APICALL_SCHEDULE_UNIT, unit.getBytes());
+ calculateAPIScheduler(data, false); // set next execution time
+ super.insert(TABLE_API_NAME, data);
+ } catch (IOException e) {
+ Log.logException(e);
+ } catch (RowSpaceExceededException e) {
+ Log.logException(e);
+ }
+ Log.logInfo("APICALL", apiurl);
+ }
+
/**
* execute an API call using a api table row which contains all essentials
* to access the server also the host, port and the authentication realm must be given
@@ -164,9 +203,6 @@ public class WorkTables extends Tables {
if (row == null) continue;
String url = "http://" + host + ":" + port + new String(row.get(WorkTables.TABLE_API_COL_URL));
url += "&" + WorkTables.TABLE_API_COL_APICALL_PK + "=" + new String(row.getPK());
- url += "&" + WorkTables.TABLE_API_COL_APICALL_COUNT + "=" + (row.get(WorkTables.TABLE_API_COL_APICALL_COUNT, 1) + 1);
- url += "&" + WorkTables.TABLE_API_COL_APICALL_SCHEDULE_TIME + "=" + row.get(WorkTables.TABLE_API_COL_APICALL_SCHEDULE_TIME, "");
- url += "&" + WorkTables.TABLE_API_COL_APICALL_SCHEDULE_UNIT + "=" + row.get(WorkTables.TABLE_API_COL_APICALL_SCHEDULE_UNIT, "");
try {
client.GETbytes(url);
l.put(url, client.getStatusCode());
@@ -197,8 +233,9 @@ public class WorkTables extends Tables {
/**
* calculate the execution time in a api call table based on given scheduling time and last execution time
* @param row the database row in the api table
+ * @param update if true then the next execution time is based on the latest computed execution time; otherwise it is based on the last execution time
*/
- public static void calculateAPIScheduler(Tables.Row row, boolean update) {
+ public static void calculateAPIScheduler(Tables.Data row, boolean update) {
Date date = row.containsKey(WorkTables.TABLE_API_COL_DATE) ? row.get(WorkTables.TABLE_API_COL_DATE, new Date()) : null;
date = update ? row.get(WorkTables.TABLE_API_COL_DATE_NEXT_EXEC, date) : row.get(WorkTables.TABLE_API_COL_DATE_LAST_EXEC, date);
int time = row.get(WorkTables.TABLE_API_COL_APICALL_SCHEDULE_TIME, 1);
@@ -208,10 +245,11 @@ public class WorkTables extends Tables {
}
String unit = row.get(WorkTables.TABLE_API_COL_APICALL_SCHEDULE_UNIT, "days");
long d = date.getTime();
- if (unit.equals("minutes")) d += 60000L * time;
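+ // minute-based schedules are never executed more often than every 10 minutes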
+ if (unit.equals("minutes")) d += 60000L * Math.max(10, time);
if (unit.equals("hours")) d += 60000L * 60L * time;
if (unit.equals("days")) d += 60000L * 60L * 24L * time;
if (d < System.currentTimeMillis()) d = System.currentTimeMillis() + 600000L;
+ d -= d % 60000; // remove seconds
row.put(WorkTables.TABLE_API_COL_DATE_NEXT_EXEC, new Date(d));
}
diff --git a/source/de/anomic/data/bookmarksDB.java b/source/de/anomic/data/bookmarksDB.java
index 54668dd00..385dfa927 100644
--- a/source/de/anomic/data/bookmarksDB.java
+++ b/source/de/anomic/data/bookmarksDB.java
@@ -23,18 +23,11 @@
package de.anomic.data;
-import java.io.BufferedReader;
import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileNotFoundException;
-import java.io.FileReader;
-import java.io.FileWriter;
import java.io.IOException;
-import java.io.InputStreamReader;
import java.io.Serializable;
import java.net.MalformedURLException;
import java.util.Comparator;
-import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
@@ -42,24 +35,15 @@ import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.concurrent.ConcurrentHashMap;
-import java.util.regex.Pattern;
import net.yacy.kelondro.blob.MapHeap;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.NaturalOrder;
-import net.yacy.kelondro.util.DateFormatter;
-import net.yacy.kelondro.workflow.BusyThread;
-import net.yacy.kelondro.workflow.InstantBusyThread;
-
-import de.anomic.crawler.CrawlProfile;
-import de.anomic.crawler.retrieval.Request;
-import de.anomic.search.Segments;
-import de.anomic.search.Switchboard;
-import de.anomic.yacy.yacyNewsPool;
public class bookmarksDB {
+
// ------------------------------------
// Declaration of Class-Attributes
// ------------------------------------
@@ -67,7 +51,6 @@ public class bookmarksDB {
//final static int SORT_ALPHA = 1;
private final static int SORT_SIZE = 2;
private final static int SHOW_ALL = -1;
- private final static String SLEEP_TIME = "3600000"; // default sleepTime: check for recrawls every hour
// bookmarks
private MapHeap bookmarks;
@@ -75,9 +58,6 @@ public class bookmarksDB {
// tags
private ConcurrentHashMap tags;
- // autoReCrawl
- private final BusyThread autoReCrawl;
-
private BookmarkDate dates;
// ------------------------------------
@@ -120,15 +100,6 @@ public class bookmarksDB {
//this.datesTable = new MapView(BLOBTree.toHeap(datesFile, true, true, 20, 256, '_', NaturalOrder.naturalOrder, datesFileNew), 500, '_');
this.dates = new BookmarkDate(datesFile);
if (!datesExisted) this.dates.init(new bookmarkIterator(true));
-
- // autoReCrawl
- final Switchboard sb = Switchboard.getSwitchboard();
- this.autoReCrawl = new InstantBusyThread(this, "autoReCrawl", null, null, Long.MIN_VALUE, Long.MAX_VALUE, Long.MIN_VALUE, Long.MAX_VALUE);
- final long sleepTime = Long.parseLong(sb.getConfig("autoReCrawl_idlesleep" , SLEEP_TIME));
- sb.deployThread("autoReCrawl", "autoReCrawl Scheduler", "simple scheduler for automatic re-crawls of bookmarked urls", null, autoReCrawl, 120000,
- sleepTime, sleepTime, Long.parseLong(sb.getConfig("autoReCrawl_memprereq" , "-1"))
- );
- Log.logInfo("BOOKMARKS", "autoReCrawl - serverBusyThread initialized checking every "+(sleepTime/1000/60)+" minutes for recrawls");
}
// -----------------------------------------------------
@@ -141,163 +112,6 @@ public class bookmarksDB {
dates.close();
}
- // -----------------------------------------------------
- // bookmarksDB's functions for autoReCrawl
- // -----------------------------------------------------
-
- public boolean autoReCrawl() {
-
- // read crontab
- final File file = new File (Switchboard.getSwitchboard().getRootPath(),"DATA/SETTINGS/autoReCrawl.conf");
- String s;
- try {
- final BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(file)));
- Log.logInfo("BOOKMARKS", "autoReCrawl - reading schedules from " + file);
- while( null != (s = in.readLine()) ) {
- if (s.length() > 0 && s.charAt(0) != '#') {
- final String parser[] = s.split("\t");
- if (parser.length == 13) {
- folderReCrawl(Long.parseLong(parser[0]), parser[1], parser[2], Integer.parseInt(parser[3]), Long.parseLong(parser[4]),
- Integer.parseInt(parser[5]), Integer.parseInt(parser[6]), Boolean.parseBoolean(parser[7]),
- Boolean.parseBoolean(parser[8]), Boolean.parseBoolean(parser[9]),
- Boolean.parseBoolean(parser[10]), Boolean.parseBoolean(parser[11]),
- Boolean.parseBoolean(parser[12]), CrawlProfile.CacheStrategy.IFFRESH
- );
- }
- if (parser.length == 14) {
- folderReCrawl(Long.parseLong(parser[0]), parser[1], parser[2], Integer.parseInt(parser[3]), Long.parseLong(parser[4]),
- Integer.parseInt(parser[5]), Integer.parseInt(parser[6]), Boolean.parseBoolean(parser[7]),
- Boolean.parseBoolean(parser[8]), Boolean.parseBoolean(parser[9]),
- Boolean.parseBoolean(parser[10]), Boolean.parseBoolean(parser[11]),
- Boolean.parseBoolean(parser[12]), CrawlProfile.CacheStrategy.decode(Integer.parseInt(parser[13]))
- );
- }
- }
- }
- in.close();
- } catch( FileNotFoundException ex ) {
- try {
- Log.logInfo("BOOKMARKS", "autoReCrawl - creating new autoReCrawl.conf");
- final File inputFile = new File(Switchboard.getSwitchboard().getRootPath(),"defaults/autoReCrawl.conf");
- final File outputFile = new File(Switchboard.getSwitchboard().getRootPath(),"DATA/SETTINGS/autoReCrawl.conf");
- final FileReader i = new FileReader(inputFile);
- final FileWriter o = new FileWriter(outputFile);
- int c;
- while ((c = i.read()) != -1) {
- o.write(c);
- }
- i.close();
- o.close();
- autoReCrawl();
- return true;
- } catch( FileNotFoundException e ) {
- Log.logSevere("BOOKMARKS", "autoReCrawl - file not found error: defaults/autoReCrawl.conf", e);
- return false;
- } catch (IOException e) {
- Log.logSevere("BOOKMARKS", "autoReCrawl - IOException: defaults/autoReCrawl.conf", e);
- return false;
- }
- } catch( Exception ex ) {
- Log.logSevere("BOOKMARKS", "autoReCrawl - error reading " + file, ex);
- return false;
- }
- return true;
- }
-
- public void folderReCrawl(long schedule, String folder, String crawlingfilter, int newcrawlingdepth, long crawlingIfOlder,
- int crawlingDomFilterDepth, int crawlingDomMaxPages, boolean crawlingQ, boolean indexText, boolean indexMedia,
- boolean crawlOrder, boolean xsstopw, boolean storeHTCache, CrawlProfile.CacheStrategy cacheStrategy) {
-
- final Switchboard sb = Switchboard.getSwitchboard();
- final Iterator bit = getBookmarksIterator(folder, true);
- Log.logInfo("BOOKMARKS", "autoReCrawl - processing: "+folder);
-
- final boolean xdstopw = xsstopw;
- final boolean xpstopw = xsstopw;
-
- while(bit.hasNext()) {
-
- final Bookmark bm = getBookmark(bit.next());
- final long sleepTime = Long.parseLong(sb.getConfig("autoReCrawl_idlesleep" , SLEEP_TIME));
- final long interTime = (System.currentTimeMillis()-bm.getTimeStamp())%schedule;
-
- final Date date = new Date(bm.getTimeStamp());
- Log.logInfo("BOOKMARKS", "autoReCrawl - checking schedule for: "+"["+DateFormatter.formatISO8601(date)+"] "+bm.getUrl());
-
- if (interTime >= 0 && interTime < sleepTime) {
- try {
- int pos = 0;
- // set crawlingStart to BookmarkUrl
- final String crawlingStart = bm.getUrl();
- String newcrawlingMustMatch = crawlingfilter;
-
- final DigestURI crawlingStartURL = new DigestURI(crawlingStart, null);
-
- // set the crawling filter
- if (newcrawlingMustMatch.length() < 2) newcrawlingMustMatch = ".*"; // avoid that all urls are filtered out if bad value was submitted
-
- if (crawlingStartURL!= null && newcrawlingMustMatch.equals("dom")) {
- newcrawlingMustMatch = ".*" + crawlingStartURL.getHost() + ".*";
- }
- if (crawlingStart!= null && newcrawlingMustMatch.equals("sub") && (pos = crawlingStart.lastIndexOf("/")) > 0) {
- newcrawlingMustMatch = crawlingStart.substring(0, pos + 1) + ".*";
- }
-
- // check if the crawl filter works correctly
- Pattern.compile(newcrawlingMustMatch);
-
- final byte[] urlhash = crawlingStartURL.hash();
-
- sb.indexSegments.urlMetadata(Segments.Process.LOCALCRAWLING).remove(urlhash);
- sb.crawlQueues.noticeURL.removeByURLHash(urlhash);
- sb.crawlQueues.errorURL.remove(urlhash);
-
- // stack url
- sb.crawler.profilesPassiveCrawls.removeEntry(crawlingStartURL.hash()); // if there is an old entry, delete it
- final CrawlProfile.entry pe = sb.crawler.profilesActiveCrawls.newEntry(
- folder+"/"+crawlingStartURL, crawlingStartURL,
- newcrawlingMustMatch,
- CrawlProfile.MATCH_BAD_URL,
- newcrawlingdepth,
- sb.crawler.profilesActiveCrawls.getRecrawlDate(crawlingIfOlder), crawlingDomFilterDepth, crawlingDomMaxPages,
- crawlingQ,
- indexText, indexMedia,
- storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw, cacheStrategy);
- sb.crawlStacker.enqueueEntry(new Request(
- sb.peers.mySeed().hash.getBytes(),
- crawlingStartURL,
- null,
- "CRAWLING-ROOT",
- new Date(),
- pe.handle(),
- 0,
- 0,
- 0
- ));
- Log.logInfo("BOOKMARKS", "autoReCrawl - adding crawl profile for: " + crawlingStart);
- // serverLog.logInfo("BOOKMARKS", "autoReCrawl - crawl filter is set to: " + newcrawlingfilter);
- // generate a YaCyNews if the global flag was set
- if (crawlOrder) {
- Map m = new HashMap(pe.map()); // must be cloned
- m.remove("specificDepth");
- m.remove("indexText");
- m.remove("indexMedia");
- m.remove("remoteIndexing");
- m.remove("xsstopw");
- m.remove("xpstopw");
- m.remove("xdstopw");
- m.remove("storeTXCache");
- m.remove("storeHTCache");
- m.remove("generalFilter");
- m.remove("specificFilter");
- m.put("intention", "Automatic ReCrawl!");
- sb.peers.newsPool.publishMyNews(sb.peers.mySeed(), yacyNewsPool.CATEGORY_CRAWL_START, m);
- }
- } catch (MalformedURLException e1) {}
- } // if
- } // while(bit.hasNext())
- } // } autoReCrawl()
-
// -----------------------------------------------------------
// bookmarksDB's functions for bookmarksTable / bookmarkCache
// -----------------------------------------------------------
diff --git a/source/de/anomic/search/Switchboard.java b/source/de/anomic/search/Switchboard.java
index da84b7ba5..7cf670836 100644
--- a/source/de/anomic/search/Switchboard.java
+++ b/source/de/anomic/search/Switchboard.java
@@ -137,7 +137,6 @@ import de.anomic.data.userDB;
import de.anomic.data.wiki.wikiBoard;
import de.anomic.data.wiki.wikiCode;
import de.anomic.data.wiki.wikiParser;
-//import de.anomic.http.client.Client;
import de.anomic.http.client.Cache;
import de.anomic.http.server.HTTPDemon;
import de.anomic.http.server.HeaderFramework;
@@ -211,7 +210,7 @@ public final class Switchboard extends serverSwitch {
public boolean rankingOn;
public CRDistribution rankingOwnDistribution;
public CRDistribution rankingOtherDistribution;
- public Map outgoingCookies, incomingCookies;
+ public Map outgoingCookies, incomingCookies;
public volatile long proxyLastAccess, localSearchLastAccess, remoteSearchLastAccess;
public yacyCore yc;
public ResourceObserver observer;
@@ -608,7 +607,7 @@ public final class Switchboard extends serverSwitch {
SwitchboardConstants.CLEANUP_METHOD_JOBCOUNT,
SwitchboardConstants.CLEANUP_METHOD_FREEMEM,
60000, Long.MAX_VALUE, 10000, Long.MAX_VALUE),
- 600000); // all 5 Minutes, wait 10 minutes until first run
+ 60000); // every 5 minutes, wait 1 minute until first run
deployThread(SwitchboardConstants.SURROGATES, "Surrogates", "A thread that polls the SURROGATES path and puts all Documents in one surroagte file into the indexing queue.", null,
new InstantBusyThread(
this,
diff --git a/source/net/yacy/kelondro/blob/Tables.java b/source/net/yacy/kelondro/blob/Tables.java
index 821e89113..b3d243b2d 100644
--- a/source/net/yacy/kelondro/blob/Tables.java
+++ b/source/net/yacy/kelondro/blob/Tables.java
@@ -433,6 +433,17 @@ public class Tables {
return dflt;
}
}
+
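+ /** render the content of this data row as {key=value, key=value, ...} */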
+ public String toString() {
+ StringBuilder sb = new StringBuilder();
+ sb.append('{');
+ for (Map.Entry<String, byte[]> entry: this.entrySet()) {
+ sb.append(entry.getKey()).append('=').append(new String(entry.getValue())).append(", ");
+ }
+ if (sb.length() > 1) sb.setLength(sb.length() - 2);
+ sb.append('}');
+ return sb.toString();
+ }
}
public class Row extends Data {