-
- #(addedUrls)#::Added #[added]# URLs!#(/addedUrls)#
-
- #%env/templates/footer.template%#
-
-
\ No newline at end of file
diff --git a/htroot/CrawlURLFetchStack_p.java b/htroot/CrawlURLFetchStack_p.java
deleted file mode 100644
index 40e868b4c..000000000
--- a/htroot/CrawlURLFetchStack_p.java
+++ /dev/null
@@ -1,299 +0,0 @@
-// CrawlURLFetchStack_p.java
-// -------------------------------------
-// part of YACY
-//
-// (C) 2007 by Franz Brausze
-//
-// last change: $LastChangedDate: $ by $LastChangedBy: $
-//
-// This program is free software; you can redistribute it and/or modify
-// it under the terms of the GNU General Public License as published by
-// the Free Software Foundation; either version 2 of the License, or
-// (at your option) any later version.
-//
-// This program is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU General Public License
-// along with this program; if not, write to the Free Software
-// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-//
-// Using this software in any meaning (reading, learning, copying, compiling,
-// running) means that you agree that the Author(s) is (are) not responsible
-// for cost, loss of data or any harm that may be caused directly or indirectly
-// by usage of this software or this documentation. The usage of this software
-// is on your own risk. The installation and usage (starting/running) of this
-// software may allow other people or application to access your computer and
-// any attached devices and is highly dependent on the configuration of the
-// software which must be done by the user of the software; the author(s) is
-// (are) also not responsible for proper configuration and usage of the
-// software, even if provoked by documentation provided together with
-// the software.
-//
-// Any changes to this file according to the GPL as documented in the file
-// gpl.txt aside this file in the shipment you received can be done to the
-// lines that follows this copyright notice here, but changes must not be
-// done inside the copyright notice above. A re-distribution must contain
-// the intact and unchanged copyright notice.
-// Contributions and changes to the program code must be marked as such.
-
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.io.OutputStream;
-import java.io.PrintWriter;
-import java.io.Writer;
-import java.net.MalformedURLException;
-import java.util.HashMap;
-import java.util.Iterator;
-
-import de.anomic.data.URLFetcherStack;
-import de.anomic.htmlFilter.htmlFilterContentScraper;
-import de.anomic.htmlFilter.htmlFilterWriter;
-import de.anomic.http.httpHeader;
-import de.anomic.plasma.plasmaCrawlEntry;
-import de.anomic.plasma.plasmaCrawlNURL;
-import de.anomic.plasma.plasmaSwitchboard;
-import de.anomic.plasma.urlPattern.plasmaURLPattern;
-import de.anomic.server.serverFileUtils;
-import de.anomic.server.serverObjects;
-import de.anomic.server.serverSwitch;
-import de.anomic.server.logging.serverLog;
-import de.anomic.yacy.yacyURL;
-
-public class CrawlURLFetchStack_p {
-
- public static final HashMap /* of PeerName, sent URLs */ fetchMap = new HashMap();
- private static URLFetcherStack stack = null;
- public static int maxURLsPerFetch = 50;
-
- public static URLFetcherStack getURLFetcherStack(serverSwitch env) {
- if (stack == null) try {
- stack = new URLFetcherStack(env.getConfigPath(plasmaSwitchboard.DBPATH, plasmaSwitchboard.DBPATH_DEFAULT));
- } catch (IOException e) {
- serverLog.logSevere("URLFETCHER", "Couldn't initialize URL stack: " + e.getMessage());
- }
- return stack;
- }
-
- public static final String STREAM_CMD_ADDURLS_ = "ADD URLS: ";
- public static final String STREAM_CMD_ADDURLSBLCHK_ = "ADD URLS CHECK BLACKLIST: ";
- public static final String STREAM_CMD_END = "END";
- public static final String STREAM_RESP_OK_ADDURLS_ = "FAILED URLS: ";
- public static final String STREAM_RESP_OK = "OK";
- public static final String STREAM_RESP_FAILED = "FAILED";
-
- public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) {
- final serverObjects prop = new serverObjects();
- plasmaSwitchboard sb = (plasmaSwitchboard)env;
-
- if (((String)header.get(httpHeader.CONNECTION_PROP_PATH)).endsWith(".stream")) {
- /* =================================================================
- * .stream request
- * ================================================================= */
- InputStream in = (InputStream)header.get(httpHeader.CONNECTION_PROP_INPUTSTREAM);
- OutputStream out = (OutputStream)header.get(httpHeader.CONNECTION_PROP_OUTPUTSTREAM);
- BufferedReader inrb = new BufferedReader(new InputStreamReader(in));
- PrintWriter outw = new PrintWriter(out);
-
- String line;
- int addurls = 0, cururl = 0;
- boolean[] status = new boolean[0];
- boolean blchk = false;
- URLFetcherStack stack = getURLFetcherStack(env);
- try {
- while ((line = inrb.readLine()) != null) {
- // commands
- if (line.startsWith(STREAM_CMD_ADDURLS_)) {
- try {
- addurls = Integer.parseInt(line.substring(STREAM_CMD_ADDURLS_.length()));
- status = new boolean[addurls];
- cururl = 0;
- blchk = false;
- outw.println(STREAM_RESP_OK);
- } catch (NumberFormatException e) {
- outw.println(STREAM_RESP_FAILED);
- }
- } else if (line.startsWith(STREAM_CMD_ADDURLSBLCHK_)) {
- try {
- addurls = Integer.parseInt(line.substring(STREAM_CMD_ADDURLSBLCHK_.length()));
- status = new boolean[addurls];
- cururl = 0;
- blchk = true;
- outw.println(STREAM_RESP_OK);
- } catch (NumberFormatException e) {
- outw.println(STREAM_RESP_FAILED);
- }
- } else if (line.equals(STREAM_CMD_END)) {
- break;
- } else {
- if (cururl < addurls) // add url
- status[cururl++] = addURL(line, blchk, stack);
-
- if (cururl > 0 && cururl == addurls ) {
- // done with parsing the passed URL count, now some status output: i.e. 'FAILED URLS: 5 of 8'
- outw.print(STREAM_RESP_OK_ADDURLS_);
- StringBuffer stat = new StringBuffer();
-                            for (int i = 0; i < status.length; i++)
[...]
-                    if (count > 0) {
- maxURLsPerFetch = count;
- prop.put("set", "1");
- prop.put("set_value", maxURLsPerFetch);
- } else {
- prop.put("set", "2");
- prop.put("set_value", count);
- }
- }
- else if (post.containsKey("shiftlcq")) {
- final int count = Math.min(post.getInt("shiftloc", 0), sb.crawlQueues.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE));
- final int failed = shiftFromNotice(sb.crawlQueues.noticeURL, plasmaCrawlNURL.STACK_TYPE_CORE, getURLFetcherStack(env), count);
- prop.put("shiftloc", "1");
- prop.put("shiftloc_value", count - failed);
- prop.put("shiftloc_failed", failed);
- }
- else if (post.containsKey("shiftrcq")) {
- final int count = post.getInt("shiftrem", 0);
- final int failed = shiftFromNotice(sb.crawlQueues.noticeURL, plasmaCrawlNURL.STACK_TYPE_LIMIT, getURLFetcherStack(env), count);
- prop.put("shiftrem", "1");
- prop.put("shiftrem_value", count - failed);
- prop.put("shiftrem_failed", failed);
- }
- else if (post.containsKey("subupload")) {
- if (post.get("upload", "").length() == 0) {
- prop.put("uploadError", "1");
- } else {
- final File file = new File(post.get("upload", ""));
- final String content = new String((byte[])post.get("upload$file"));
-
- final String type = post.get("uploadType", "");
- final boolean blCheck = post.containsKey("blacklistCheck");
- if (type.equals("plain")) {
- prop.put("upload_added", addURLs(content.split("\n"), blCheck, getURLFetcherStack(env)));
- prop.put("upload_failed", "0");
- prop.put("upload", "1");
- } else if (type.equals("html")) {
- try {
- final htmlFilterContentScraper scraper = new htmlFilterContentScraper(new yacyURL(file));
- final Writer writer = new htmlFilterWriter(null, null, scraper, null, false);
- serverFileUtils.write(content, writer);
- writer.close();
-
- final Iterator it = ((HashMap)scraper.getAnchors()).keySet().iterator();
- int added = 0, failed = 0;
- yacyURL url;
- while (it.hasNext()) try {
- url = new yacyURL((String) it.next(), null);
- if (blCheck && plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_CRAWLER, url)) {
- failed++;
- continue;
- }
- getURLFetcherStack(env).push(url);
- added++;
- } catch (MalformedURLException e) { failed++; }
- prop.put("upload", "1");
- prop.put("upload_added", added);
- prop.put("upload_failed", failed);
- } catch (Exception e) {
- e.printStackTrace();
- prop.put("upload", "2");
- prop.putHTML("upload_error", e.getMessage());
- }
- }
- }
- }
- }
-
- putFetched(prop);
- prop.put("urlCount", getURLFetcherStack(env).size());
- prop.put("totalFetched", getURLFetcherStack(env).getPopped());
- prop.put("totalAdded", getURLFetcherStack(env).getPushed());
- prop.put("maxSize", maxURLsPerFetch);
- prop.put("locurls", sb.crawlQueues.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE));
- prop.put("remurls", sb.crawlQueues.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT));
- prop.put("locurlsVal", Math.min(sb.crawlQueues.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE), 500));
- prop.put("remurlsVal", Math.min(sb.crawlQueues.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT), 500));
- return prop;
- }
-
- private static void putFetched(serverObjects prop) {
- Iterator it = fetchMap.keySet().iterator();
- int count = 0;
- while (it.hasNext()) {
- String key = (String)it.next();
- prop.putHTML("peers_" + count + "_peer", key);
- prop.put("peers_" + count + "_amount", ((Integer)fetchMap.get(key)).intValue());
- count++;
- }
- prop.put("peers", count);
- }
-
- private static int addURLs(String[] urls, boolean blCheck, URLFetcherStack stack) {
- int count = -1;
-        for (int i = 0; i < urls.length; i++)
-            if (addURL(urls[i], blCheck, stack)) count++;
-        return count + 1;
-    }
[...]
diff --git a/htroot/CrawlURLFetch_p.html b/htroot/CrawlURLFetch_p.html
deleted file mode 100644
[...]
-<html>
-  <head>
-    <title>YaCy '#[clientname]#': URL Fetcher Management</title>
-    #%env/templates/metas.template%#
-  </head>
-  <body>
-    #%env/templates/header.template%#
-    #%env/templates/submenuCrawlURLFetch.template%#
-    <h2>URL-Fetcher</h2>
-
-
- #(threadError)#::
- Error on stopping thread, it isn't alive anymore::
- Error on restarting thread, it isn't alive anymore#(/threadError)#
-
- #(runs)#::
-
- #(/runs)#
- #%env/templates/footer.template%#
-
-
\ No newline at end of file
diff --git a/htroot/CrawlURLFetch_p.java b/htroot/CrawlURLFetch_p.java
deleted file mode 100644
index 5e19f2d02..000000000
--- a/htroot/CrawlURLFetch_p.java
+++ /dev/null
@@ -1,543 +0,0 @@
-// CrawlURLFetch_p.java
-// -------------------------------------
-// part of YACY
-//
-// (C) 2007 by Franz Brausze
-//
-// last change: $LastChangedDate: $ by $LastChangedBy: $
-//
-// This program is free software; you can redistribute it and/or modify
-// it under the terms of the GNU General Public License as published by
-// the Free Software Foundation; either version 2 of the License, or
-// (at your option) any later version.
-//
-// This program is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU General Public License
-// along with this program; if not, write to the Free Software
-// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-//
-// Using this software in any meaning (reading, learning, copying, compiling,
-// running) means that you agree that the Author(s) is (are) not responsible
-// for cost, loss of data or any harm that may be caused directly or indirectly
-// by usage of this software or this documentation. The usage of this software
-// is on your own risk. The installation and usage (starting/running) of this
-// software may allow other people or application to access your computer and
-// any attached devices and is highly dependent on the configuration of the
-// software which must be done by the user of the software; the author(s) is
-// (are) also not responsible for proper configuration and usage of the
-// software, even if provoked by documentation provided together with
-// the software.
-//
-// Any changes to this file according to the GPL as documented in the file
-// gpl.txt aside this file in the shipment you received can be done to the
-// lines that follows this copyright notice here, but changes must not be
-// done inside the copyright notice above. A re-distribution must contain
-// the intact and unchanged copyright notice.
-// Contributions and changes to the program code must be marked as such.
-
-import java.io.IOException;
-import java.net.MalformedURLException;
-import java.util.ArrayList;
-import java.util.Date;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.Random;
-import java.util.TreeMap;
-
-import de.anomic.plasma.plasmaCrawlProfile;
-import de.anomic.plasma.plasmaCrawlZURL;
-import de.anomic.plasma.plasmaSwitchboard;
-import de.anomic.server.serverByteBuffer;
-import de.anomic.server.serverSwitch;
-import de.anomic.http.httpHeader;
-import de.anomic.http.httpRemoteProxyConfig;
-import de.anomic.http.httpc;
-import de.anomic.server.serverObjects;
-import de.anomic.server.logging.serverLog;
-import de.anomic.yacy.yacyCore;
-import de.anomic.yacy.yacySeed;
-import de.anomic.yacy.yacyURL;
-import de.anomic.yacy.yacyVersion;
-
-public class CrawlURLFetch_p {
-
- private static final long ERR_DATE = 1;
- private static final long ERR_HOST_MALFORMED_URL = 1;
- private static final long ERR_PEER_GENERAL_CONN = 1;
- private static final long ERR_PEER_OFFLINE = 2;
- private static final long ERR_THREAD_STOP = 1;
- private static final long ERR_THREAD_RESUME = 2;
-
- private static final long STAT_THREAD_ALIVE = 0;
- private static final long STAT_THREAD_STOPPED = 1;
- private static final long STAT_THREAD_PAUSED = 2;
-
- private static URLFetcher fetcher = null;
- private static plasmaCrawlProfile.entry profile = null;
- private static ArrayList savedURLs = new ArrayList();
-
- public static plasmaCrawlProfile.entry getCrawlProfile(serverSwitch env) {
- if (profile == null) {
- profile = ((plasmaSwitchboard)env).profilesActiveCrawls.newEntry(
- "URLFetcher", // Name
- null, // URL
- ".*", ".*", // General / specific filter
- 0, 0, // General / specific depth
- -1, -1, -1, // Recrawl / Dom-filter depth / Dom-max-pages
- true, // Crawl query
- true, true, // Index text / media
- false, true, // Store in HT- / TX-Cache
- false, // Remote indexing
- true, false, false); // Exclude static / dynamic / parent stopwords
- }
- return profile;
- }
-
- public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) {
- serverObjects prop = new serverObjects();
- prop.put("host", "");
-
- // List previously saved URLs for easy selection
- listURLs(prop);
-
- // List known hosts
- listPeers(prop,
- post != null && post.containsKey("checkPeerURLCount"),
- ((plasmaSwitchboard)env).remoteProxyConfig);
-
- if (post != null) {
- if (post.containsKey("start")) {
- long frequency = URLFetcher.DELAY_ONCE;
- if (post.containsKey("reg")) {
- if (post.get("reg", "").equals("self_det")) {
- frequency = URLFetcher.DELAY_SELF_DET;
- } else if (post.get("reg", "").equals("delay")) {
- frequency = getDate(post.get("frequency", ""), post.get("freq_type", ""));
- if (frequency == -1)
- prop.put("freqError", ERR_DATE);
- }
- }
-
- int count = 50;
- if (post.get("amount", "").matches("\\d+")) {
- count = Integer.parseInt(post.get("amount", ""));
- if (count > 999) count = 999;
- }
-
- if (fetcher != null) fetcher.interrupt();
- fetcher = null;
- if (post.get("source", "").equals("peer") &&
- post.get("peerhash", "").equals("random")) {
- fetcher = new URLFetcher(
- env,
- getCrawlProfile(env),
- count,
- frequency);
- } else {
- yacyURL url = null;
- if (post.get("source", "").equals("url")) {
- try {
- url = new yacyURL(post.get("host", null), null);
- if (!savedURLs.contains(url.toNormalform(true, true)))
- savedURLs.add(url.toNormalform(true, true));
- prop.put("host", post.get("host", url.toString()));
- } catch (MalformedURLException e) {
- prop.put("host", post.get("host", ""));
- prop.put("hostError", ERR_HOST_MALFORMED_URL);
- }
- } else if (post.get("source", "").equals("savedURL")) {
- try {
- url = new yacyURL(post.get("saved", ""), null);
- } catch (MalformedURLException e) {
- /* should never appear, except for invalid input, see above */
- }
- } else if (post.get("source", "").equals("peer")) {
- yacySeed ys = null;
- ys = yacyCore.seedDB.get(post.get("peerhash", null));
- if (ys != null) {
- if ((url = URLFetcher.getListServletURL(
- ys.getPublicAddress(),
- URLFetcher.MODE_LIST,
- count,
- yacyCore.seedDB.mySeed().hash)) == null) {
- prop.put("peerError", ERR_PEER_GENERAL_CONN);
- prop.put("peerError_hash", post.get("peerhash", ""));
- prop.put("peerError_name", ys.getName());
- }
- } else {
- prop.put("peerError", ERR_PEER_OFFLINE);
- prop.put("peerError_hash", post.get("peerhash", ""));
- }
- }
-
- if (url != null) {
- fetcher = new URLFetcher(
- env,
- getCrawlProfile(env),
- url,
- count,
- frequency);
- }
- }
- if (fetcher != null) fetcher.start();
- }
- else if (post.containsKey("stop")) {
- if (fetcher != null) {
- fetcher.interrupt();
- } else {
- prop.put("threadError", ERR_THREAD_STOP);
- }
- }
- else if (post.containsKey("restart")) {
- if (fetcher != null) {
- fetcher.interrupt();
- if (fetcher.url == null) {
- fetcher = new URLFetcher(
- env,
- getCrawlProfile(env),
- fetcher.count,
- fetcher.delay);
- } else {
- fetcher = new URLFetcher(
- env,
- getCrawlProfile(env),
- fetcher.url,
- fetcher.count,
- fetcher.delay);
- }
- fetcher.start();
- } else {
- prop.put("threadError", ERR_THREAD_RESUME);
- }
- }
- else if (post.containsKey("resetDelay")) {
- final long frequency = getDate(post.get("newDelay", ""), "minutes");
- if (frequency == -1) {
- prop.put("freqError", ERR_DATE);
- } else {
- fetcher.delay = frequency;
- }
- }
- prop.put("LOCATION", "/CrawlURLFetch_p.html");
- }
-
- if (fetcher != null) {
- prop.put("runs", "1");
- prop.put("runs_status",
- ((fetcher.paused && fetcher.isAlive()) ? STAT_THREAD_PAUSED :
- (fetcher.isAlive()) ? STAT_THREAD_ALIVE : STAT_THREAD_STOPPED));
- prop.putNum("runs_totalRuns", URLFetcher.totalRuns);
- prop.putNum("runs_totalFetchedURLs", URLFetcher.totalFetchedURLs);
- prop.putNum("runs_totalFailedURLs", URLFetcher.totalFailed);
- prop.putNum("runs_lastRun", fetcher.lastRun);
- prop.putNum("runs_lastFetchedURLs", fetcher.lastFetchedURLs);
- prop.put("runs_lastServerResponse", (fetcher.lastServerResponse == null)
- ? "" : fetcher.lastServerResponse);
- prop.putNum("runs_curDelay", (int)(fetcher.delay / 60000));
-
- Iterator it = fetcher.failed.keySet().iterator();
- int i = 0;
- Object key;
- while (it.hasNext()) {
- key = it.next();
- prop.put("runs_error_" + i + "_reason", fetcher.failed.get(key));
- prop.put("runs_error_" + i + "_url", (String)key);
- i++;
- }
- prop.put("runs_error", i);
- }
-
- return prop;
- }
-
- private static int listURLs(serverObjects prop) {
- if (savedURLs.size() == 0) return 0;
- prop.put("saved", "1");
-        for (int i = 0; i < savedURLs.size(); i++)
[...]
-    }
-
-    private static int listPeers(serverObjects prop, boolean checkURLCount, httpRemoteProxyConfig theRemoteProxyConfig) {
-        int peerCount = 0;
-        TreeMap hostList = new TreeMap();
-        String peername;
-        if (yacyCore.seedDB != null && yacyCore.seedDB.sizeConnected() > 0) {
- final Iterator e = yacyCore.seedDB.seedsConnected(true, false, null, yacyVersion.YACY_PROVIDES_CRAWLS_VIA_LIST_HTML);
- int dbsize;
- while (e.hasNext()) {
- yacySeed seed = (yacySeed) e.next();
- if (seed != null && !seed.hash.equals(yacyCore.seedDB.mySeed().hash)) {
- peername = seed.get(yacySeed.NAME, "nameless");
- if (checkURLCount && (dbsize = getURLs2Fetch(seed, theRemoteProxyConfig)) > 0) {
- hostList.put(peername + " (" + dbsize + ")", seed.hash);
- } else {
- hostList.put(peername, seed.hash);
- }
- }
- }
- }
-
- if (hostList.size() > 0) {
- while (!hostList.isEmpty() && (peername = (String) hostList.firstKey()) != null) {
- final String hash = (String) hostList.get(peername);
- prop.put("peersKnown_peers_" + peerCount + "_hash", hash);
- prop.put("peersKnown_peers_" + peerCount + "_name", peername);
- hostList.remove(peername);
- peerCount++;
- }
- prop.put("peersKnown_peers", peerCount);
- prop.put("peersKnown", "1");
- } else {
- prop.put("peersKnown", "0");
- }
- return peerCount;
- }
-
- private static int getURLs2Fetch(yacySeed seed, httpRemoteProxyConfig theRemoteProxyConfig) {
- try {
- String answer = new String(httpc.wget(
- URLFetcher.getListServletURL(seed.getPublicAddress(), URLFetcher.MODE_COUNT, 0, null),
- seed.getIP(),
- 5000,
- null, null,
- theRemoteProxyConfig,
- null,
- null));
- if (answer.matches("\\d+"))
- return Integer.parseInt(answer);
- else {
- serverLog.logFine("URLFETCHER", "Retrieved invalid answer from " + seed.getName() + ": '" + answer + "'");
- return -1;
- }
- } catch (MalformedURLException e) {
- /* should not happen */
- return -3;
- } catch (IOException e) {
- return -2;
- }
- }
-
- private static long getDate(String count, String type) {
- long r = 0;
- if (count != null && count.matches("\\d+")) r = Long.parseLong(count);
- if (r < 1) return -1;
-
- r *= 60000;
- if (type.equals("days")) return r * 60 * 24;
- else if (type.equals("hours")) return r * 60;
- else if (type.equals("minutes")) return r;
- else return -1;
- }
-
- public static class URLFetcher extends Thread {
-
- public static final long DELAY_ONCE = -1;
- public static final long DELAY_SELF_DET = 0;
-
- public static final int MODE_LIST = 0;
- public static final int MODE_COUNT = 1;
-
- public static int totalRuns = 0;
- public static int totalFetchedURLs = 0;
- public static int totalFailed = 0;
-
- public final HashMap failed = new HashMap();
-
- public int lastFetchedURLs = 0;
- public long lastRun = 0;
- public String lastServerResponse = null;
- public int lastFailed = 0;
-
- public final yacyURL url;
- public final int count;
- public long delay;
- public final plasmaSwitchboard sb;
- public final plasmaCrawlProfile.entry profile;
-
- public boolean paused = false;
-
- public static yacyURL getListServletURL(String host, int mode, int count, String peerHash) {
- String r = "http://" + host + "/yacy/list.html?list=queueUrls&display=";
-
- switch (mode) {
- case MODE_LIST: r += "list"; break;
- case MODE_COUNT: r += "count"; break;
- }
-
- if (count > 0) r += "&count=" + count;
-
- if (peerHash != null && peerHash.length() > 0) {
- r += "&iam=" + peerHash;
- } else if (mode == MODE_LIST) {
- r += "&iam=" + yacyCore.seedDB.mySeed().hash;
- }
-
- try {
- return new yacyURL(r, null);
- } catch (MalformedURLException e) {
- return null;
- }
- }
-
- public URLFetcher(
- serverSwitch env,
- plasmaCrawlProfile.entry profile,
- yacyURL url,
- int count,
- long delayMs) {
- if (env == null || profile == null || url == null)
- throw new NullPointerException("env, profile or url must not be null");
- this.sb = (plasmaSwitchboard)env;
- this.profile = profile;
- this.url = url;
- this.count = count;
- this.delay = delayMs;
- this.setName("URLFetcher");
- }
-
- public URLFetcher(
- serverSwitch env,
- plasmaCrawlProfile.entry profile,
- int count,
- long delayMs) {
- if (env == null || profile == null)
- throw new NullPointerException("env or profile must not be null");
- this.sb = (plasmaSwitchboard)env;
- this.profile = profile;
- this.url = null;
- this.count = count;
- this.delay = delayMs;
- this.setName("URLFetcher");
- }
-
- public void run() {
- this.paused = false;
- long start;
- yacyURL url;
- while (!isInterrupted()) {
- try {
- start = System.currentTimeMillis();
- url = getDLURL();
- if (url == null) {
- serverLog.logSevere(this.getName(), "canceled because no valid URL for the URL-list could be determinded");
- return;
- }
- totalFetchedURLs += stackURLs(getURLs(url));
- this.lastRun = System.currentTimeMillis() - start;
- totalRuns++;
- serverLog.logInfo(this.getName(), "Loaded " + this.lastFetchedURLs + " URLs from " + url + " in " + this.lastRun + " ms into stackcrawler.");
- if (this.delay < 0 || isInterrupted()) {
- return;
- } else synchronized (this) {
- if (this.delay == 0) {
- this.paused = true;
- while (this.paused) this.wait();
- } else {
- this.paused = true;
- this.wait(this.delay);
- }
- }
- this.paused = false;
- } catch (InterruptedException e) { return; }
- }
- }
-
- private yacyURL getDLURL() {
- if (this.url != null) return this.url;
-
- // choose random seed
- yacySeed ys = null;
- Iterator e = yacyCore.seedDB.seedsConnected(true, false, null, yacyVersion.YACY_PROVIDES_CRAWLS_VIA_LIST_HTML);
- int num = new Random().nextInt(yacyCore.seedDB.sizeConnected()) + 1;
- Object o;
-            for (int i = 0; i < num && e.hasNext(); i++) {
[...]
\ No newline at end of file
diff --git a/htroot/js/WatchCrawler.js b/htroot/js/WatchCrawler.js
index 0bc7002d0..96f9d4691 100644
--- a/htroot/js/WatchCrawler.js
+++ b/htroot/js/WatchCrawler.js
@@ -150,9 +150,16 @@ function handleQueues(){
updateTable(localcrawlerqueue, "local crawler");
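+      // wire up the new limit crawler queue exactly like the local crawler queue above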
+ limitcrawlerqueue=getFirstChild(xml, "limitcrawlerqueue");
+ updateTable(limitcrawlerqueue, "limitCrawlerTable");
+ limitcrawlerqueue_size=getValue(getFirstChild(limitcrawlerqueue, "size"));
+ limitcrawlerqueue_state=getValue(getFirstChild(limitcrawlerqueue, "state"));
+ document.getElementById("limitcrawlerqueuesize").firstChild.nodeValue=limitcrawlerqueue_size;
+ putQueueState("limitcrawler", limitcrawlerqueue_state);
+ updateTable(limitcrawlerqueue, "limit crawler");
+
remotecrawlerqueue=getFirstChild(xml, "remotecrawlerqueue");
updateTable(remotecrawlerqueue, "remoteCrawlerTable");
-
remotecrawlerqueue_size=getValue(getFirstChild(remotecrawlerqueue, "size"));
remotecrawlerqueue_state=getValue(getFirstChild(remotecrawlerqueue, "state"));
document.getElementById("remotecrawlerqueuesize").firstChild.nodeValue=remotecrawlerqueue_size;
diff --git a/htroot/rct_p.html b/htroot/rct_p.html
new file mode 100644
index 000000000..af4a2dda4
--- /dev/null
+++ b/htroot/rct_p.html
@@ -0,0 +1,30 @@
+<html>
+  <head>
+    <title>YaCy '#[clientname]#': Index Control</title>
+    #%env/templates/metas.template%#
+  </head>
+  <body>
+    #%env/templates/header.template%#
+    <h2>remote crawl fetch test</h2>
[...]
+    #%env/templates/footer.template%#
+  </body>
+</html>
\ No newline at end of file
diff --git a/htroot/rct_p.java b/htroot/rct_p.java
new file mode 100644
index 000000000..c2c33a2f7
--- /dev/null
+++ b/htroot/rct_p.java
@@ -0,0 +1,124 @@
+// rct_p.java
+// -----------------------
+// (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
+// first published 28.11.2007 on http://yacy.net
+//
+// This is a part of YaCy, a peer-to-peer based web search engine
+//
+// $LastChangedDate: 2007-11-14 01:15:28 +0000 (Mi, 14 Nov 2007) $
+// $LastChangedRevision: 4216 $
+// $LastChangedBy: orbiter $
+//
+// LICENSE
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+import java.net.MalformedURLException;
+import java.text.ParseException;
+import java.util.Date;
+import java.util.Iterator;
+
+import de.anomic.http.httpHeader;
+import de.anomic.plasma.plasmaSwitchboard;
+import de.anomic.server.serverDate;
+import de.anomic.server.serverObjects;
+import de.anomic.server.serverSwitch;
+import de.anomic.xml.rssReader;
+import de.anomic.yacy.yacyClient;
+import de.anomic.yacy.yacyCore;
+import de.anomic.yacy.yacySeed;
+import de.anomic.yacy.yacyURL;
+
+public class rct_p {
+
+ public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) {
+ // return variable that accumulates replacements
+ plasmaSwitchboard sb = (plasmaSwitchboard) env;
+ serverObjects prop = new serverObjects();
+
+ if (post != null) {
+ if (post.containsKey("retrieve")) {
+ String peerhash = post.get("peer", null);
+ yacySeed seed = (peerhash == null) ? null : yacyCore.seedDB.getConnected(peerhash);
+ rssReader reader = (seed == null) ? null : yacyClient.queryRemoteCrawlURLs(seed, 10);
+ if (reader != null) {
+ rssReader.Item item;
+ for (int i = 0; i < reader.items(); i++) {
+ item = reader.getItem(i);
+ //System.out.println("URL=" + item.getLink() + ", desc=" + item.getDescription() + ", pubDate=" + item.getPubDate());
+
+ // put url on remote crawl stack
+ yacyURL url;
+ try {
+ url = new yacyURL(item.getLink(), null);
+ } catch (MalformedURLException e) {
+ url = null;
+ }
+ Date loaddate;
+ try {
+ loaddate = serverDate.parseShortSecondTime(item.getPubDate());
+ } catch (ParseException e) {
+ loaddate = new Date();
+ }
+ yacyURL referrer = null; // referrer needed!
+ if (sb.acceptURL(url)) {
+ // stack url
+ sb.getLog().logFinest("crawlOrder: stack: url='" + url + "'");
+ String reasonString = sb.crawlStacker.stackCrawl(url, referrer, peerhash, "REMOTE-CRAWLING", loaddate, 0, sb.defaultRemoteProfile);
+
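+                    // stackCrawl returns null when the URL is accepted, otherwise a reason string for the rejection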
+ if (reasonString == null) {
+ // done
+ env.getLog().logInfo("crawlOrder: added remote crawl url: " + url.toNormalform(true, false));
+ } else if (reasonString.startsWith("double")) {
+                        // case where we already have the url loaded;
+ env.getLog().logInfo("crawlOrder: ignored double remote crawl url: " + url.toNormalform(true, false));
+ } else {
+ env.getLog().logInfo("crawlOrder: ignored [" + reasonString + "] remote crawl url: " + url.toNormalform(true, false));
+ }
+ } else {
+ env.getLog().logWarning("crawlOrder: Received URL outside of our domain: " + url.toNormalform(true, false));
+ }
+ }
+ }
+ }
+ }
+
+ listHosts(prop);
+
+ // return rewrite properties
+ return prop;
+ }
+
+ private static void listHosts(serverObjects prop) {
+ // list known hosts
+ yacySeed seed;
+ int hc = 0;
+ if (yacyCore.seedDB != null && yacyCore.seedDB.sizeConnected() > 0) {
+ Iterator e = yacyCore.dhtAgent.getProvidesRemoteCrawlURLs();
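+            // iterate over the seeds of peers that announce remote crawl URLs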
+ while (e.hasNext()) {
+ seed = (yacySeed) e.next();
+ if (seed != null) {
+ prop.put("hosts_" + hc + "_hosthash", seed.hash);
+ prop.putHTML("hosts_" + hc + "_hostname", seed.hash + " " + seed.get(yacySeed.NAME, "nameless") + " (" + seed.getLong(yacySeed.RCOUNT, 0) + ")");
+ hc++;
+ }
+ }
+ prop.put("hosts", hc);
+ } else {
+ prop.put("hosts", "0");
+ }
+ }
+
+}
diff --git a/htroot/xml/queues_p.java b/htroot/xml/queues_p.java
index 82105b2ac..15e8e8321 100644
--- a/htroot/xml/queues_p.java
+++ b/htroot/xml/queues_p.java
@@ -163,11 +163,15 @@ public class queues_p {
prop.put("localCrawlState", sb.crawlJobIsPaused(plasmaSwitchboard.CRAWLJOB_LOCAL_CRAWL) ? STATE_PAUSED : STATE_RUNNING);
int stackSize = sb.crawlQueues.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE);
addNTable(prop, "list-local", sb.crawlQueues.noticeURL.top(plasmaCrawlNURL.STACK_TYPE_CORE, Math.min(10, stackSize)));
-
-
+
//global crawl queue
- prop.putNum("remoteCrawlSize", Integer.toString(sb.getThread(plasmaSwitchboard.CRAWLJOB_GLOBAL_CRAWL_TRIGGER).getJobCount()));
- prop.put("remoteCrawlState", sb.crawlJobIsPaused(plasmaSwitchboard.CRAWLJOB_GLOBAL_CRAWL_TRIGGER) ? STATE_PAUSED : STATE_RUNNING);
+ prop.putNum("limitCrawlSize", Integer.toString(sb.crawlQueues.limitCrawlJobSize()));
+ prop.put("limitCrawlState", STATE_RUNNING);
+ stackSize = sb.crawlQueues.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT);
+
+        //remote triggered crawl queue
+ prop.putNum("remoteCrawlSize", Integer.toString(sb.getThread(plasmaSwitchboard.CRAWLJOB_REMOTE_TRIGGERED_CRAWL).getJobCount()));
+ prop.put("remoteCrawlState", sb.crawlJobIsPaused(plasmaSwitchboard.CRAWLJOB_REMOTE_TRIGGERED_CRAWL) ? STATE_PAUSED : STATE_RUNNING);
stackSize = sb.crawlQueues.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT);
if (stackSize == 0) {
diff --git a/htroot/xml/queues_p.xml b/htroot/xml/queues_p.xml
index ed82dd877..23cfea8a0 100644
--- a/htroot/xml/queues_p.xml
+++ b/htroot/xml/queues_p.xml
@@ -49,6 +49,22 @@
#{/list-local}#
 </localcrawlerqueue>
+<limitcrawlerqueue>
+<size>#[limitCrawlSize]#</size>
+<state>#[limitCrawlState]#</state>
+#{list-limit}#
+<entry>
+<profile>#[profile]#</profile>
+<initiator>#[initiator]#</initiator>
+<depth>#[depth]#</depth>
+<modified>#[modified]#</modified>
+<anchor>#[anchor]#</anchor>
+<url>#[url]#</url>
+<hash>#[hash]#</hash>
+<inProcess>#(inProcess)#false::true#(/inProcess)#</inProcess>
+</entry>
+#{/list-limit}#
+</limitcrawlerqueue>
 <remotecrawlerqueue>
 <size>#[remoteCrawlSize]#</size>
 <state>#[remoteCrawlState]#</state>
diff --git a/htroot/yacy/crawlOrder.java b/htroot/yacy/crawlOrder.java
index 5016d0694..33b20be67 100644
--- a/htroot/yacy/crawlOrder.java
+++ b/htroot/yacy/crawlOrder.java
@@ -129,7 +129,7 @@ public final class crawlOrder {
delay = "3600"; // may request one hour later again
} else try {
yacySeed requester = yacyCore.seedDB.getConnected(iam);
- int queuesize = switchboard.crawlQueues.coreCrawlJobSize() + switchboard.crawlQueues.limitCrawlTriggerJobSize() + switchboard.crawlQueues.remoteTriggeredCrawlJobSize() + switchboard.queueSize();
+ int queuesize = switchboard.crawlQueues.coreCrawlJobSize() + switchboard.crawlQueues.limitCrawlJobSize() + switchboard.crawlQueues.remoteTriggeredCrawlJobSize() + switchboard.queueSize();
if (requester == null) {
response = "denied";
reason = "unknown-client";
@@ -190,7 +190,7 @@ public final class crawlOrder {
env.getLog().logWarning("crawlOrder: Received not normalized Referer URL " + refv.get(0) + " of URL " + urlv.get(0));
}
- if (!switchboard.acceptURL(new yacyURL(newURL, null))) {
+ if (!switchboard.acceptURL(url)) {
env.getLog().logWarning("crawlOrder: Received URL outside of our domain: " + newURL);
return null;
}
diff --git a/htroot/yacy/list.html b/htroot/yacy/list.html
deleted file mode 100644
index 285c7277d..000000000
--- a/htroot/yacy/list.html
+++ /dev/null
@@ -1 +0,0 @@
-#[list]#
diff --git a/htroot/yacy/list.java b/htroot/yacy/list.java
deleted file mode 100644
index 91cccd224..000000000
--- a/htroot/yacy/list.java
+++ /dev/null
@@ -1,152 +0,0 @@
-// list.java
-// -----------------------
-// part of YaCy
-// (C) by Michael Peter Christen; mc@anomic.de
-// first published on http://www.anomic.de
-// Frankfurt, Germany, 2004
-//
-// This File is contributed by Alexander Schier
-//
-// $LastChangedDate$
-// $LastChangedRevision$
-// $LastChangedBy$
-//
-// This program is free software; you can redistribute it and/or modify
-// it under the terms of the GNU General Public License as published by
-// the Free Software Foundation; either version 2 of the License, or
-// (at your option) any later version.
-//
-// This program is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU General Public License
-// along with this program; if not, write to the Free Software
-// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-//
-// Using this software in any meaning (reading, learning, copying, compiling,
-// running) means that you agree that the Author(s) is (are) not responsible
-// for cost, loss of data or any harm that may be caused directly or indirectly
-// by usage of this software or this documentation. The usage of this software
-// is on your own risk. The installation and usage (starting/running) of this
-// software may allow other people or application to access your computer and
-// any attached devices and is highly dependent on the configuration of the
-// software which must be done by the user of the software; the author(s) is
-// (are) also not responsible for proper configuration and usage of the
-// software, even if provoked by documentation provided together with
-// the software.
-//
-// Any changes to this file according to the GPL as documented in the file
-// gpl.txt aside this file in the shipment you received can be done to the
-// lines that follows this copyright notice here, but changes must not be
-// done inside the copyright notice above. A re-distribution must contain
-// the intact and unchanged copyright notice.
-// Contributions and changes to the program code must be marked as such.
-
-// You must compile this file with
-// javac -classpath .:../../classes list.java
-// if the shell's current path is HTROOT
-
-// contains contributions by [FB] to support listing URLs for URL Fetcher
-
-import java.io.File;
-
-import de.anomic.data.URLFetcherStack;
-import de.anomic.data.htmlTools;
-import de.anomic.data.listManager;
-import de.anomic.http.httpHeader;
-import de.anomic.plasma.plasmaSwitchboard;
-import de.anomic.server.serverCore;
-import de.anomic.server.serverObjects;
-import de.anomic.server.serverSwitch;
-import de.anomic.server.logging.serverLog;
-import de.anomic.yacy.yacyCore;
-import de.anomic.yacy.yacyNetwork;
-import de.anomic.yacy.yacySeed;
-import de.anomic.yacy.yacyURL;
-
-public final class list {
-
- public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) {
- if (post == null || env == null)
- throw new NullPointerException("post: " + post + ", sb: " + env);
- plasmaSwitchboard sb = (plasmaSwitchboard) env;
-
- // return variable that accumulates replacements
- final serverObjects prop = new serverObjects();
- if ((post == null) || (env == null)) return prop;
- if (!yacyNetwork.authentifyRequest(post, env)) return prop;
-
- final String col = post.get("col", "");
- final File listsPath = env.getConfigPath(plasmaSwitchboard.LISTS_PATH, plasmaSwitchboard.LISTS_PATH_DEFAULT);
-
- String otherPeerName = null;
- if (post.containsKey("iam")) {
- yacySeed bla = yacyCore.seedDB.get(post.get("iam", ""));
- if (bla != null) otherPeerName = bla.getName();
- }
- if (otherPeerName == null) otherPeerName = (String)header.get(httpHeader.CONNECTION_PROP_CLIENTIP);
-
- if ((sb.isRobinsonMode()) && (!sb.isInMyCluster(otherPeerName))) {
- // if we are a robinson cluster, answer only if this client is known by our network definition
- return null;
- }
-
- if (col.equals("black")) {
- final StringBuffer out = new StringBuffer();
-
- final String filenames=env.getConfig("BlackLists.Shared", "");
- final String[] filenamesarray = filenames.split(",");
-
- if(filenamesarray.length > 0){
- for(int i = 0;i < filenamesarray.length; i++){
- String filename = filenamesarray[i];
- File fileObj = new File(listsPath,filename);
- out.append(listManager.getListString(fileObj, false))
- .append(serverCore.crlfString);
- }
- } // if filenamesarray.length > 0
-
- prop.put("list",out.toString());
- }
- // start contrib by [FB]
- else if (col.length() == 0 && post.get("list", "").equals("queueUrls")) {
- final URLFetcherStack db = CrawlURLFetchStack_p.getURLFetcherStack(env);
- final String display = post.get("display", "list");
- if (display.equals("list")) {
- // list urls from remote crawler queue for other peers
- final int count = Math.min(post.getInt("count", 50), CrawlURLFetchStack_p.maxURLsPerFetch);
-
- if (count > 0 && db.size() > 0) {
- final StringBuffer b = new StringBuffer();
-
- yacyURL url;
- int cnt = 0;
-                for (int i = 0; i < count; i++) {
[...]
+/* example response (per item: URL, description, load date, URL hash):
+
+   c_32kgI-4HTE  3226  20071128030353  ok
+
+   http://publish.vx.roo.com/australian/ithomepagemini/                        sub  20071126173629  mlD2rBhnfuoY
+   http://www.news.com.au/story/0%2C23599%2C22835669-2%2C00.html                    20071128014306  qT1GjNRe_5SQ
+   http://www.news.com.au/perthnow/story/0%2C21598%2C22835663-2761%2C00.html   Driver injured: Willagee crash witnesses sought  20071128014306  yGMa4uRe_5SQ
+   http://www.news.com.au/travel/story/0%2C26058%2C22835185-5014090%2C00.html       20071128014306  qfob36Re_5SQ
+   http://www.news.com.au/story/0%2C23599%2C22835311-421%2C00.html                  20071128014306  YBLVBNRe_5SQ
+   http://www.thirdwayblog.com/wp-content/uploads/                             sub  20071128010343  9rnz2MUqGq6Z
+   http://www.parliament.gr/kouselas/koino_dra/koino_docs/                     sub  20071128010343  hSTvg-u6LxcB
+   http://upload.wikimedia.org/wikipedia/el/f/f1/                              sub  20071128010343  F-3WVJBs-F4R
+   http://www.logiprint.nl/nl/Briefpapier_drukken_Eindhoven.html               Briefpapier drukken Eindhoven  20071011104246  bmBv8j07Ta7B
+*/
\ No newline at end of file
diff --git a/htroot/yacy/urls.xml b/htroot/yacy/urls.xml
index c94fb4958..e4adbf389 100644
--- a/htroot/yacy/urls.xml
+++ b/htroot/yacy/urls.xml
@@ -24,7 +24,7 @@
 <description>#[description]#</description>
 <author>#[author]#</author>
 <pubDate>#[pubDate]#</pubDate>
-<guid>#[guid]#</guid>
+<guid isPermaLink="false">#[guid]#</guid>
#{/item}#
diff --git a/source/de/anomic/plasma/crawler/plasmaCrawlQueues.java b/source/de/anomic/plasma/crawler/plasmaCrawlQueues.java
index acc3128e0..4719fd882 100644
--- a/source/de/anomic/plasma/crawler/plasmaCrawlQueues.java
+++ b/source/de/anomic/plasma/crawler/plasmaCrawlQueues.java
@@ -28,12 +28,14 @@ package de.anomic.plasma.crawler;
import java.io.File;
import java.io.IOException;
+import java.net.MalformedURLException;
+import java.text.ParseException;
+import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import de.anomic.data.robotsParser;
-import de.anomic.index.indexURLEntry;
import de.anomic.plasma.plasmaCrawlEntry;
import de.anomic.plasma.plasmaCrawlNURL;
import de.anomic.plasma.plasmaCrawlProfile;
@@ -41,8 +43,9 @@ import de.anomic.plasma.plasmaCrawlZURL;
import de.anomic.plasma.plasmaHTCache;
import de.anomic.plasma.plasmaParser;
import de.anomic.plasma.plasmaSwitchboard;
+import de.anomic.server.serverDate;
import de.anomic.server.logging.serverLog;
-import de.anomic.tools.crypt;
+import de.anomic.xml.rssReader;
import de.anomic.yacy.yacyClient;
import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacySeed;
@@ -54,6 +57,7 @@ public class plasmaCrawlQueues {
private serverLog log;
private HashMap workers; // mapping from url hash to Worker thread object
private plasmaProtocolLoader loader;
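+    // seed hashes of peers that announce remote crawl URLs; refilled from the DHT agent when empty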
+ private ArrayList remoteCrawlProviderHashes;
public plasmaCrawlNURL noticeURL;
public plasmaCrawlZURL errorURL, delegatedURL;
@@ -63,6 +67,7 @@ public class plasmaCrawlQueues {
this.log = new serverLog("CRAWLER");
this.workers = new HashMap();
this.loader = new plasmaProtocolLoader(sb, log);
+ this.remoteCrawlProviderHashes = new ArrayList();
// start crawling management
log.logConfig("Starting Crawling Management");
@@ -108,6 +113,9 @@ public class plasmaCrawlQueues {
Iterator i = workers.values().iterator();
while (i.hasNext()) ((Thread) i.next()).interrupt();
// TODO: wait some more time until all threads are finished
+ noticeURL.close();
+ errorURL.close();
+ delegatedURL.close();
}
public plasmaCrawlEntry[] activeWorker() {
@@ -131,18 +139,32 @@ public class plasmaCrawlQueues {
}
public boolean coreCrawlJob() {
+
+ boolean robinsonPrivateCase = ((sb.isRobinsonMode()) &&
+ (!sb.getConfig(plasmaSwitchboard.CLUSTER_MODE, "").equals(plasmaSwitchboard.CLUSTER_MODE_PUBLIC_CLUSTER)) &&
+ (!sb.getConfig(plasmaSwitchboard.CLUSTER_MODE, "").equals(plasmaSwitchboard.CLUSTER_MODE_PRIVATE_CLUSTER)));
+
+ if ((robinsonPrivateCase) || ((coreCrawlJobSize() <= 20) && (limitCrawlJobSize() > 0))) {
+ // move some tasks to the core crawl job so we have something to do
+ int toshift = Math.min(10, limitCrawlJobSize()); // this cannot be a big number because the balancer makes a forced waiting if it cannot balance
+ for (int i = 0; i < toshift; i++) {
+ noticeURL.shift(plasmaCrawlNURL.STACK_TYPE_LIMIT, plasmaCrawlNURL.STACK_TYPE_CORE);
+ }
+ log.logInfo("shifted " + toshift + " jobs from global crawl to local crawl (coreCrawlJobSize()=" + coreCrawlJobSize() +
+ ", limitCrawlJobSize()=" + limitCrawlJobSize() + ", cluster.mode=" + sb.getConfig(plasmaSwitchboard.CLUSTER_MODE, "") +
+ ", robinsonMode=" + ((sb.isRobinsonMode()) ? "on" : "off"));
+ }
+
if (noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) == 0) {
//log.logDebug("CoreCrawl: queue is empty");
return false;
}
if (sb.sbQueue.size() >= (int) sb.getConfigLong(plasmaSwitchboard.INDEXER_SLOTS, 30)) {
- log.logFine("CoreCrawl: too many processes in indexing queue, dismissed (" +
- "sbQueueSize=" + sb.sbQueue.size() + ")");
+ log.logFine("CoreCrawl: too many processes in indexing queue, dismissed (" + "sbQueueSize=" + sb.sbQueue.size() + ")");
return false;
}
if (this.size() >= sb.getConfigLong(plasmaSwitchboard.CRAWLER_THREADS_ACTIVE_MAX, 10)) {
- log.logFine("CoreCrawl: too many processes in loader queue, dismissed (" +
- "cacheLoader=" + this.size() + ")");
+ log.logFine("CoreCrawl: too many processes in loader queue, dismissed (" + "cacheLoader=" + this.size() + ")");
return false;
}
if (sb.onlineCaution()) {
@@ -203,107 +225,84 @@ public class plasmaCrawlQueues {
return true;
}
-
- public int limitCrawlTriggerJobSize() {
- return noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT);
- }
-
- public boolean limitCrawlTriggerJob() {
- if (noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) == 0) {
- //log.logDebug("LimitCrawl: queue is empty");
- return false;
- }
- boolean robinsonPrivateCase = ((sb.isRobinsonMode()) &&
- (!sb.getConfig(plasmaSwitchboard.CLUSTER_MODE, "").equals(plasmaSwitchboard.CLUSTER_MODE_PUBLIC_CLUSTER)) &&
- (!sb.getConfig(plasmaSwitchboard.CLUSTER_MODE, "").equals(plasmaSwitchboard.CLUSTER_MODE_PRIVATE_CLUSTER)));
+ public boolean remoteCrawlLoaderJob() {
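+        // overall flow: refill the provider list from the DHT when both the list and the
+        // remote-triggered crawl queue are empty, pick one provider, fetch its remote-crawl
+        // RSS feed, and stack every acceptable URL via the crawl stacker
+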
+ // check if we are allowed to crawl urls provided by other peers
+ if (!yacyCore.seedDB.mySeed().getFlagAcceptRemoteCrawl()) return false;
- if ((robinsonPrivateCase) || ((coreCrawlJobSize() <= 20) && (limitCrawlTriggerJobSize() > 10))) {
- // it is not efficient if the core crawl job is empty and we have too much to do
- // move some tasks to the core crawl job
- int toshift = 10; // this cannot be a big number because the balancer makes a forced waiting if it cannot balance
- if (toshift > limitCrawlTriggerJobSize()) toshift = limitCrawlTriggerJobSize();
- for (int i = 0; i < toshift; i++) {
- noticeURL.shift(plasmaCrawlNURL.STACK_TYPE_LIMIT, plasmaCrawlNURL.STACK_TYPE_CORE);
- }
- log.logInfo("shifted " + toshift + " jobs from global crawl to local crawl (coreCrawlJobSize()=" + coreCrawlJobSize() + ", limitCrawlTriggerJobSize()=" + limitCrawlTriggerJobSize() + ", cluster.mode=" + sb.getConfig(plasmaSwitchboard.CLUSTER_MODE, "") + ", robinsonMode=" + ((sb.isRobinsonMode()) ? "on" : "off"));
- if (robinsonPrivateCase) return false;
- }
+ // check if we are a senior peer
+ if (!yacyCore.seedDB.mySeed().isActive()) return false;
- // check local indexing queues
- // in case the placing of remote crawl fails, there must be space in the local queue to work off the remote crawl
- if (sb.sbQueue.size() >= (int) sb.getConfigLong(plasmaSwitchboard.INDEXER_SLOTS, 30) * 2) {
- log.logFine("LimitCrawl: too many processes in indexing queue, dismissed (" +
- "sbQueueSize=" + sb.sbQueue.size() + ")");
- return false;
- }
- if (this.size() >= sb.getConfigLong(plasmaSwitchboard.CRAWLER_THREADS_ACTIVE_MAX, 10)) {
- log.logFine("LimitCrawl: too many processes in loader queue, dismissed (" +
- "cacheLoader=" + this.size() + ")");
- return false;
- }
- if (sb.onlineCaution()) {
- log.logFine("LimitCrawl: online caution, omitting processing");
- return false;
- }
-
- // if crawling was paused we have to wait until we were notified to continue
- Object[] status = (Object[]) sb.crawlJobsStatus.get(plasmaSwitchboard.CRAWLJOB_GLOBAL_CRAWL_TRIGGER);
- synchronized(status[plasmaSwitchboard.CRAWLJOB_SYNC]) {
- if (((Boolean)status[plasmaSwitchboard.CRAWLJOB_STATUS]).booleanValue()) {
- try {
- status[plasmaSwitchboard.CRAWLJOB_SYNC].wait();
+        // check if we have an entry in the provider list, otherwise fill the list
+ yacySeed seed;
+ if ((remoteCrawlProviderHashes.size() == 0) && (remoteTriggeredCrawlJobSize() == 0)) {
+ if (yacyCore.seedDB != null && yacyCore.seedDB.sizeConnected() > 0) {
+ Iterator e = yacyCore.dhtAgent.getProvidesRemoteCrawlURLs();
+ while (e.hasNext()) {
+ seed = (yacySeed) e.next();
+ if (seed != null) {
+ remoteCrawlProviderHashes.add(seed.hash);
+
+ }
}
- catch (InterruptedException e){ return false;}
}
}
+ if (remoteCrawlProviderHashes.size() == 0) return false;
- // start a global crawl, if possible
- String stats = "REMOTECRAWLTRIGGER[" + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) + ", "
- + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]";
- try {
- plasmaCrawlEntry urlEntry = noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_LIMIT, true);
- String profileHandle = urlEntry.profileHandle();
- // System.out.println("DEBUG plasmaSwitchboard.processCrawling:
- // profileHandle = " + profileHandle + ", urlEntry.url = " + urlEntry.url());
- plasmaCrawlProfile.entry profile = sb.profilesActiveCrawls.getEntry(profileHandle);
- if (profile == null) {
- log.logWarning(stats + ": LOST PROFILE HANDLE '" + urlEntry.profileHandle() + "' for URL " + urlEntry.url());
- return true;
- }
+ // take one entry from the provider list and load the entries from the remote peer
+ seed = null;
+ String hash = null;
+ while ((seed == null) && (remoteCrawlProviderHashes.size() > 0)) {
+ hash = (String) remoteCrawlProviderHashes.remove(remoteCrawlProviderHashes.size() - 1);
+ seed = yacyCore.seedDB.get(hash);
+ }
+ if (seed == null) return false;
+
+ // we know a peer which should provide remote crawl entries. load them now.
+ rssReader reader = (seed == null) ? null : yacyClient.queryRemoteCrawlURLs(seed, 10);
+ if (reader == null) return true;
+ // parse the rss
+ rssReader.Item item;
+ for (int i = 0; i < reader.items(); i++) {
+ item = reader.getItem(i);
+ //System.out.println("URL=" + item.getLink() + ", desc=" + item.getDescription() + ", pubDate=" + item.getPubDate());
- // check if the protocol is supported
- yacyURL url = urlEntry.url();
- String urlProtocol = url.getProtocol();
- if (!this.sb.crawlQueues.isSupportedProtocol(urlProtocol)) {
- this.log.logSevere("Unsupported protocol in URL '" + url.toString());
- return true;
+ // put url on remote crawl stack
+ yacyURL url;
+ try {
+ url = new yacyURL(item.getLink(), null);
+ } catch (MalformedURLException e) {
+ url = null;
}
-
- log.logFine("plasmaSwitchboard.limitCrawlTriggerJob: url=" + urlEntry.url() + ", initiator=" + urlEntry.initiator() + ", crawlOrder=" + ((profile.remoteIndexing()) ? "true" : "false") + ", depth=" + urlEntry.depth() + ", crawlDepth=" + profile.generalDepth() + ", filter="
- + profile.generalFilter() + ", permission=" + ((yacyCore.seedDB == null) ? "undefined" : (((yacyCore.seedDB.mySeed().isSenior()) || (yacyCore.seedDB.mySeed().isPrincipal())) ? "true" : "false")));
+ Date loaddate;
+ try {
+ loaddate = serverDate.parseShortSecondTime(item.getPubDate());
+ } catch (ParseException e) {
+ loaddate = new Date();
+ }
+ yacyURL referrer = null; // referrer needed!
+ if (sb.acceptURL(url)) {
+ // stack url
+ sb.getLog().logFinest("crawlOrder: stack: url='" + url + "'");
+ String reasonString = sb.crawlStacker.stackCrawl(url, referrer, hash, "REMOTE-CRAWLING", loaddate, 0, sb.defaultRemoteProfile);
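+                // a null reason string means the crawl stacker accepted the URL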
- boolean tryRemote = ((noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) != 0) || (sb.sbQueue.size() != 0)) &&
- (profile.remoteIndexing()) &&
- (urlEntry.initiator() != null) &&
- // (!(urlEntry.initiator().equals(indexURL.dummyHash))) &&
- ((yacyCore.seedDB.mySeed().isSenior()) || (yacyCore.seedDB.mySeed().isPrincipal()));
- if (tryRemote) {
- // checking robots.txt for http(s) resources
- if ((urlProtocol.equals("http") || urlProtocol.equals("https")) && robotsParser.isDisallowed(url)) {
- this.log.logFine("Crawling of URL '" + url.toString() + "' disallowed by robots.txt.");
- return true;
+ if (reasonString == null) {
+ // done
+ log.logInfo("crawlOrder: added remote crawl url: " + url.toNormalform(true, false));
+ } else if (reasonString.startsWith("double")) {
+                    // case where we already have the url loaded;
+ log.logInfo("crawlOrder: ignored double remote crawl url: " + url.toNormalform(true, false));
+ } else {
+ log.logInfo("crawlOrder: ignored [" + reasonString + "] remote crawl url: " + url.toNormalform(true, false));
}
- boolean success = processRemoteCrawlTrigger(urlEntry);
- if (success) return true;
+ } else {
+ log.logWarning("crawlOrder: Received URL outside of our domain: " + url.toNormalform(true, false));
}
-
- processLocalCrawling(urlEntry, stats); // emergency case, work off the crawl locally
- return true;
- } catch (IOException e) {
- log.logSevere(stats + ": CANNOT FETCH ENTRY: " + e.getMessage(), e);
- if (e.getMessage().indexOf("hash is null") > 0) noticeURL.clear(plasmaCrawlNURL.STACK_TYPE_LIMIT);
- return true; // if we return a false here we will block everything
}
+ return true;
+ }
+
+ public int limitCrawlJobSize() {
+ return noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT);
}
public int remoteTriggeredCrawlJobSize() {
@@ -399,108 +398,6 @@ public class plasmaCrawlQueues {
return;
}
- private boolean processRemoteCrawlTrigger(plasmaCrawlEntry urlEntry) {
- // if this returns true, then the urlEntry is considered as stored somewhere and the case is finished
- // if this returns false, the urlEntry will be enqueued to the local crawl again
-
- // wrong access
- if (urlEntry == null) {
- log.logInfo("REMOTECRAWLTRIGGER[" + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]: urlEntry=null");
- return true; // superfluous request; true correct in this context because the urlEntry shall not be tracked any more
- }
-
- // check url
- if (urlEntry.url() == null) {
- log.logFine("ERROR: plasmaSwitchboard.processRemoteCrawlTrigger - url is null. name=" + urlEntry.name());
- return true; // same case as above: no more consideration
- }
-
- // are we qualified for a remote crawl?
- if ((yacyCore.seedDB.mySeed() == null) || (yacyCore.seedDB.mySeed().isJunior())) {
- log.logFine("plasmaSwitchboard.processRemoteCrawlTrigger: no permission");
- return false; // no, we must crawl this page ourselves
- }
-
- // check if peer for remote crawl is available
- yacySeed remoteSeed = ((sb.isPublicRobinson()) && (sb.getConfig("cluster.mode", "").equals("publiccluster"))) ?
- yacyCore.dhtAgent.getPublicClusterCrawlSeed(urlEntry.url().hash(), sb.clusterhashes) :
- yacyCore.dhtAgent.getGlobalCrawlSeed(urlEntry.url().hash());
- if (remoteSeed == null) {
- log.logFine("plasmaSwitchboard.processRemoteCrawlTrigger: no remote crawl seed available");
- return false;
- }
-
- // do the request
- HashMap page = yacyClient.crawlOrder(remoteSeed, urlEntry.url(), sb.getURL(urlEntry.referrerhash()), 6000);
- if (page == null) {
- log.logSevere(plasmaSwitchboard.STR_REMOTECRAWLTRIGGER + remoteSeed.getName() + " FAILED. URL CANNOT BE RETRIEVED from referrer hash: " + urlEntry.referrerhash());
- return false;
- }
-
-        // check if we got contact to the peer and the peer responded
- if ((page == null) || (page.get("delay") == null)) {
- log.logInfo("CRAWL: REMOTE CRAWL TO PEER " + remoteSeed.getName() + " FAILED. CAUSE: unknown (URL=" + urlEntry.url().toString() + "). Removed peer.");
- yacyCore.peerActions.peerDeparture(remoteSeed, "remote crawl to peer failed; peer answered unappropriate");
- return false; // no response from peer, we will crawl this ourself
- }
-
- String response = (String) page.get("response");
- log.logFine("plasmaSwitchboard.processRemoteCrawlTrigger: remoteSeed="
- + remoteSeed.getName() + ", url=" + urlEntry.url().toString()
- + ", response=" + page.toString()); // DEBUG
-
- // we received an answer and we are told to wait a specific time until we shall ask again for another crawl
- int newdelay = Integer.parseInt((String) page.get("delay"));
- yacyCore.dhtAgent.setCrawlDelay(remoteSeed.hash, newdelay);
- if (response.equals("stacked")) {
- // success, the remote peer accepted the crawl
- log.logInfo(plasmaSwitchboard.STR_REMOTECRAWLTRIGGER + remoteSeed.getName()
- + " PLACED URL=" + urlEntry.url().toString()
- + "; NEW DELAY=" + newdelay);
- // track this remote crawl
- delegatedURL.newEntry(urlEntry, remoteSeed.hash, new Date(), 0, response).store();
- return true;
- }
-
- // check other cases: the remote peer may respond that it already knows that url
- if (response.equals("double")) {
- // in case the peer answers double, it transmits the complete lurl data
- String lurl = (String) page.get("lurl");
- if ((lurl != null) && (lurl.length() != 0)) {
- String propStr = crypt.simpleDecode(lurl, (String) page.get("key"));
- indexURLEntry entry = sb.wordIndex.loadedURL.newEntry(propStr);
- try {
- sb.wordIndex.loadedURL.store(entry);
-                    sb.wordIndex.loadedURL.stack(entry, yacyCore.seedDB.mySeed().hash, remoteSeed.hash, 1); // *** superfluous/duplicate?
- // noticeURL.remove(entry.hash());
- } catch (IOException e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- }
-
- log.logInfo(plasmaSwitchboard.STR_REMOTECRAWLTRIGGER + remoteSeed.getName()
- + " SUPERFLUOUS. CAUSE: " + page.get("reason")
- + " (URL=" + urlEntry.url().toString()
- + "). URL IS CONSIDERED AS 'LOADED!'");
- return true;
- } else {
- log.logInfo(plasmaSwitchboard.STR_REMOTECRAWLTRIGGER + remoteSeed.getName()
- + " REJECTED. CAUSE: bad lurl response / " + page.get("reason") + " (URL="
- + urlEntry.url().toString() + ")");
- remoteSeed.setFlagAcceptRemoteCrawl(false);
- yacyCore.seedDB.update(remoteSeed.hash, remoteSeed);
- return false;
- }
- }
-
- log.logInfo(plasmaSwitchboard.STR_REMOTECRAWLTRIGGER + remoteSeed.getName()
- + " DENIED. RESPONSE=" + response + ", CAUSE="
- + page.get("reason") + ", URL=" + urlEntry.url().toString());
- remoteSeed.setFlagAcceptRemoteCrawl(false);
- yacyCore.seedDB.update(remoteSeed.hash, remoteSeed);
- return false;
- }
-
public plasmaHTCache.Entry loadResourceFromWeb(
yacyURL url,
int socketTimeout,
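
The method deleted above implemented YaCy's push-style remote crawl trigger: send a crawl order to a chosen peer, then branch on the "response" field of the reply ("stacked" = accepted, "double" = already known, anything else = treated as denial). A condensed, self-contained sketch of that dispatch follows; the class and the Map-based page are illustrative stand-ins for the switchboard plumbing, not YaCy's actual API.

    import java.util.HashMap;
    import java.util.Map;

    // Condensed sketch of the retired push-style delegation protocol; the
    // delegatedURL tracking, loadedURL storage and seed demotion seen in the
    // deleted code are reduced to boolean outcomes here.
    public class RemoteCrawlResponseSketch {

        /** @return true if the URL needs no further local handling */
        static boolean handle(Map<String, String> page) {
            String response = page.get("response");
            int delay = Integer.parseInt(page.get("delay")); // peer-requested pause
            System.out.println("peer asks us to wait " + delay + " ms before the next trigger");
            if ("stacked".equals(response)) return true;  // peer accepted the URL
            if ("double".equals(response)) return true;   // peer already knows it
            return false;                                 // denied: crawl locally
        }

        public static void main(String[] args) {
            Map<String, String> page = new HashMap<String, String>();
            page.put("response", "stacked");
            page.put("delay", "500");
            System.out.println("handled remotely: " + handle(page));
        }
    }
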
diff --git a/source/de/anomic/plasma/plasmaCrawlBalancer.java b/source/de/anomic/plasma/plasmaCrawlBalancer.java
index 078f2222a..fe5213953 100644
--- a/source/de/anomic/plasma/plasmaCrawlBalancer.java
+++ b/source/de/anomic/plasma/plasmaCrawlBalancer.java
@@ -119,7 +119,6 @@ public class plasmaCrawlBalancer {
resetFileIndex();
}
-
private void openFileIndex() {
cacheStacksPath.mkdirs();
urlFileIndex = new kelondroCache(new kelondroFlexTable(cacheStacksPath, stackname + indexSuffix, -1, plasmaCrawlEntry.rowdef, true), true, false);
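
For context on the line above: the balancer's URL file index is a kelondroCache layered over an on-disk kelondroFlexTable. A generic sketch of that read-through layering, using a hypothetical BackingTable interface rather than the real kelondro API:

    import java.util.HashMap;
    import java.util.Map;

    // Generic sketch of the read-through layering that kelondroCache applies
    // to kelondroFlexTable in openFileIndex(); BackingTable is a hypothetical
    // stand-in, not YaCy's actual kelondro API.
    interface BackingTable { byte[] get(String key); }

    class ReadThroughCache implements BackingTable {
        private final BackingTable table;                      // the slow on-disk layer
        private final Map<String, byte[]> ram = new HashMap<String, byte[]>();

        ReadThroughCache(BackingTable table) { this.table = table; }

        public byte[] get(String key) {
            byte[] row = ram.get(key);
            if (row == null) {                                 // miss: fall through to disk
                row = table.get(key);
                if (row != null) ram.put(key, row);
            }
            return row;
        }
    }
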
diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java
index 0f9aef309..be762399d 100644
--- a/source/de/anomic/plasma/plasmaSwitchboard.java
+++ b/source/de/anomic/plasma/plasmaSwitchboard.java
@@ -348,18 +348,17 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// 61_globalcrawltrigger
/**
- * public static final String CRAWLJOB_GLOBAL_CRAWL_TRIGGER = "61_globalcrawltrigger"
- * Name of the global crawl trigger thread, popping one entry off its queue and sending it to a non-busy peer to
- * crawl it
+ * public static final String CRAWLJOB_REMOTE_CRAWL_LOADER = "60_remotecrawlloader"
+ * Name of the remote crawl list loading thread
*
- * @see plasmaSwitchboard#CRAWLJOB_REMOTE_TRIGGERED_CRAWL
+ * @see plasmaSwitchboard#CRAWLJOB_REMOTE_CRAWL_LOADER
*/
- public static final String CRAWLJOB_GLOBAL_CRAWL_TRIGGER = "61_globalcrawltrigger";
- public static final String CRAWLJOB_GLOBAL_CRAWL_TRIGGER_METHOD_START = "limitCrawlTriggerJob";
- public static final String CRAWLJOB_GLOBAL_CRAWL_TRIGGER_METHOD_JOBCOUNT = "limitCrawlTriggerJobSize";
- public static final String CRAWLJOB_GLOBAL_CRAWL_TRIGGER_METHOD_FREEMEM = null;
- public static final String CRAWLJOB_GLOBAL_CRAWL_TRIGGER_IDLESLEEP = "61_globalcrawltrigger_idlesleep";
- public static final String CRAWLJOB_GLOBAL_CRAWL_TRIGGER_BUSYSLEEP = "61_globalcrawltrigger_busysleep";
+ public static final String CRAWLJOB_REMOTE_CRAWL_LOADER = "60_remotecrawlloader";
+ public static final String CRAWLJOB_REMOTE_CRAWL_LOADER_METHOD_START = "remoteCrawlLoaderJob";
+ public static final String CRAWLJOB_REMOTE_CRAWL_LOADER_METHOD_JOBCOUNT = null;
+ public static final String CRAWLJOB_REMOTE_CRAWL_LOADER_METHOD_FREEMEM = null;
+ public static final String CRAWLJOB_REMOTE_CRAWL_LOADER_IDLESLEEP = "60_remotecrawlloader_idlesleep";
+ public static final String CRAWLJOB_REMOTE_CRAWL_LOADER_BUSYSLEEP = "60_remotecrawlloader_busysleep";
// 62_remotetriggeredcrawl
/**
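
The renamed constants follow YaCy's thread naming convention: each job is identified by an "NN_name" string, and its tuning values live under config keys derived by suffixing that name. A small sketch of the key derivation, with the suffix list taken from the keys visible in this patch:

    // Sketch of the "<NN>_<name>" thread key convention; the suffix list
    // mirrors the config keys visible in this patch (see the yacy.init hunk).
    public class ThreadKeySketch {
        static String[] configKeysFor(String threadName) {
            return new String[] {
                threadName + "_idlesleep",
                threadName + "_busysleep",
                threadName + "_memprereq",
                threadName + "_isPaused"
            };
        }

        public static void main(String[] args) {
            for (String key : configKeysFor("60_remotecrawlloader")) System.out.println(key);
        }
    }
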
@@ -1208,9 +1207,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
this.crawlJobsStatus.put(CRAWLJOB_REMOTE_TRIGGERED_CRAWL, new Object[]{
new Object(),
Boolean.valueOf(getConfig(CRAWLJOB_REMOTE_TRIGGERED_CRAWL + "_isPaused", "false"))});
- this.crawlJobsStatus.put(CRAWLJOB_GLOBAL_CRAWL_TRIGGER, new Object[]{
+ this.crawlJobsStatus.put(CRAWLJOB_REMOTE_CRAWL_LOADER, new Object[]{
new Object(),
- Boolean.valueOf(getConfig(CRAWLJOB_GLOBAL_CRAWL_TRIGGER + "_isPaused", "false"))});
+ Boolean.valueOf(getConfig(CRAWLJOB_REMOTE_CRAWL_LOADER + "_isPaused", "false"))});
// init cookie-Monitor
this.log.logConfig("Starting Cookie Monitor");
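
crawlJobsStatus maps each job name to a two-element Object[]: a private monitor object at index 0 and the Boolean paused flag (restored from the "_isPaused" config entry) at index 1. A sketch of pause bookkeeping over such a map, with illustrative method names rather than the switchboard's actual API:

    import java.util.HashMap;
    import java.util.Map;

    // Sketch of the pause bookkeeping implied by crawlJobsStatus: index 0 is
    // a private monitor object, index 1 the Boolean paused flag.
    public class CrawlJobStatusSketch {
        private final Map<String, Object[]> status = new HashMap<String, Object[]>();

        void register(String jobName, boolean paused) {
            status.put(jobName, new Object[] { new Object(), Boolean.valueOf(paused) });
        }

        boolean isPaused(String jobName) {
            Object[] s = status.get(jobName);
            synchronized (s[0]) { return ((Boolean) s[1]).booleanValue(); }
        }

        void setPaused(String jobName, boolean paused) {
            Object[] s = status.get(jobName);
            synchronized (s[0]) {
                s[1] = Boolean.valueOf(paused);
                if (!paused) s[0].notifyAll(); // wake any job waiting on the monitor
            }
        }
    }
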
@@ -1340,8 +1339,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
new serverInstantThread(this, PROXY_CACHE_ENQUEUE_METHOD_START, PROXY_CACHE_ENQUEUE_METHOD_JOBCOUNT, PROXY_CACHE_ENQUEUE_METHOD_FREEMEM), 10000);
deployThread(CRAWLJOB_REMOTE_TRIGGERED_CRAWL, "Remote Crawl Job", "thread that performes a single crawl/indexing step triggered by a remote peer", null,
new serverInstantThread(crawlQueues, CRAWLJOB_REMOTE_TRIGGERED_CRAWL_METHOD_START, CRAWLJOB_REMOTE_TRIGGERED_CRAWL_METHOD_JOBCOUNT, CRAWLJOB_REMOTE_TRIGGERED_CRAWL_METHOD_FREEMEM), 30000);
- deployThread(CRAWLJOB_GLOBAL_CRAWL_TRIGGER, "Global Crawl Trigger", "thread that triggers remote peers for crawling", "/IndexCreateWWWGlobalQueue_p.html",
- new serverInstantThread(crawlQueues, CRAWLJOB_GLOBAL_CRAWL_TRIGGER_METHOD_START, CRAWLJOB_GLOBAL_CRAWL_TRIGGER_METHOD_JOBCOUNT, CRAWLJOB_GLOBAL_CRAWL_TRIGGER_METHOD_FREEMEM), 30000); // error here?
+ deployThread(CRAWLJOB_REMOTE_CRAWL_LOADER, "Remote Crawl URL Loader", "thread that loads remote crawl lists from other peers", "",
+ new serverInstantThread(crawlQueues, CRAWLJOB_REMOTE_CRAWL_LOADER_METHOD_START, CRAWLJOB_REMOTE_CRAWL_LOADER_METHOD_JOBCOUNT, CRAWLJOB_REMOTE_CRAWL_LOADER_METHOD_FREEMEM), 30000); // error here?
deployThread(CRAWLJOB_LOCAL_CRAWL, "Local Crawl", "thread that performs a single crawl step from the local crawl queue", "/IndexCreateWWWLocalQueue_p.html",
new serverInstantThread(crawlQueues, CRAWLJOB_LOCAL_CRAWL_METHOD_START, CRAWLJOB_LOCAL_CRAWL_METHOD_JOBCOUNT, CRAWLJOB_LOCAL_CRAWL_METHOD_FREEMEM), 10000);
deployThread(SEED_UPLOAD, "Seed-List Upload", "task that a principal peer performs to generate and upload a seed-list to an ftp account", null,
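
deployThread binds each job by method name ("remoteCrawlLoaderJob", plus optional jobcount/freemem accessors, null where absent), and serverInstantThread invokes the named method reflectively on each cycle. A minimal sketch of that name-based dispatch, assuming a Boolean-returning job method; this illustrates the mechanism only and is not YaCy's actual serverInstantThread:

    import java.lang.reflect.Method;

    // Minimal sketch of the name-based dispatch behind serverInstantThread:
    // the job method is resolved once from its string name and invoked per
    // cycle.
    public class InstantThreadSketch implements Runnable {

        public static class DemoEnv {
            // hypothetical job method; returns whether more work is queued
            public Boolean remoteCrawlLoaderJob() { return Boolean.FALSE; }
        }

        private final Object env;
        private final Method job;

        public InstantThreadSketch(Object env, String jobMethod) throws NoSuchMethodException {
            this.env = env;
            this.job = env.getClass().getMethod(jobMethod); // e.g. "remoteCrawlLoaderJob"
        }

        public void run() {
            try {
                Object busy = job.invoke(env); // Boolean result: did the job find work?
                System.out.println(job.getName() + " returned " + busy);
            } catch (Exception e) {
                e.printStackTrace();
            }
        }

        public static void main(String[] args) throws Exception {
            new InstantThreadSketch(new DemoEnv(), "remoteCrawlLoaderJob").run();
        }
    }
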
@@ -2639,18 +2638,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
thread.setIdleSleep(1000);
}
- thread = getThread(CRAWLJOB_GLOBAL_CRAWL_TRIGGER);
- if (thread != null) {
- setConfig(CRAWLJOB_GLOBAL_CRAWL_TRIGGER_BUSYSLEEP , thread.setBusySleep(Math.max(1000, newBusySleep * 3)));
- thread.setIdleSleep(10000);
- }
- /*
- thread = getThread(CRAWLJOB_REMOTE_TRIGGERED_CRAWL);
- if (thread != null) {
- setConfig(CRAWLJOB_REMOTE_TRIGGERED_CRAWL_BUSYSLEEP , thread.setBusySleep(newBusySleep * 10));
- thread.setIdleSleep(10000);
- }
- */
thread = getThread(PROXY_CACHE_ENQUEUE);
if (thread != null) {
setConfig(PROXY_CACHE_ENQUEUE_BUSYSLEEP , thread.setBusySleep(0));
diff --git a/source/de/anomic/xml/rssReader.java b/source/de/anomic/xml/rssReader.java
index 313f9408d..4616ec200 100644
--- a/source/de/anomic/xml/rssReader.java
+++ b/source/de/anomic/xml/rssReader.java
@@ -26,6 +26,8 @@
package de.anomic.xml;
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
@@ -38,6 +40,9 @@ import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
+import de.anomic.server.serverByteBuffer;
+import de.anomic.server.logging.serverLog;
+
public class rssReader extends DefaultHandler {
// statics for item generation and automatic categorization
@@ -72,17 +77,7 @@ public class rssReader extends DefaultHandler {
private HashMap items; // a guid:Item map
- public rssReader(String path) {
- init();
- parse(path);
- }
-
- public rssReader(InputStream stream) {
- init();
- parse(stream);
- }
-
- private void init() {
+ public rssReader() {
itemsGUID = new ArrayList();
items = new HashMap();
buffer = new StringBuffer();
@@ -93,7 +88,8 @@ public class rssReader extends DefaultHandler {
parsingItem = false;
}
- private void parse(String path) {
+ public rssReader(String path) {
+ this();
try {
SAXParserFactory factory = SAXParserFactory.newInstance();
SAXParser saxParser = factory.newSAXParser();
@@ -103,7 +99,8 @@ public class rssReader extends DefaultHandler {
}
}
- private void parse(InputStream stream) {
+ public rssReader(InputStream stream) {
+ this();
try {
SAXParserFactory factory = SAXParserFactory.newInstance();
SAXParser saxParser = factory.newSAXParser();
@@ -112,6 +109,42 @@ public class rssReader extends DefaultHandler {
e.printStackTrace();
}
}
+
+ public static rssReader parse(byte[] a) {
+
+ // check integrity of array
+ if ((a == null) || (a.length == 0)) {
+ serverLog.logWarning("rssReader", "response=null");
+ return null;
+ }
+ if (a.length < 100) {
+ serverLog.logWarning("rssReader", "response=" + new String(a));
+ return null;
+ }
+ if (!serverByteBuffer.equals(a, "<?xml".getBytes())) {
+ serverLog.logWarning("rssReader", "response does not contain valid xml");
+ return null;
+ }
+ // the response looks like xml: parse it from a byte array stream
+ return new rssReader(new ByteArrayInputStream(a));
+ }

[patch truncated here: the closing of the rssReader.java hunk and the header plus opening lines of the following source/de/anomic/yacy/yacySeedDB.java hunk, which declare this look-ahead iterator and its underlying seed enumeration se, were lost]

+ private yacySeed nextInternal() {
+ yacySeed s;
+ try {
+ while (se.hasNext()) {
+ s = (yacySeed) se.next();
+ if (s.getLong(yacySeed.RCOUNT, 0) > 0) return s;
+ }
+ } catch (kelondroException e) {
+ System.out.println("DEBUG providesRemoteCrawlURLsEnum:" + e.getMessage());
+ yacyCore.log.logSevere("database inconsistency (" + e.getMessage() + "), re-set of db.");
+ seedDB.resetActiveTable();
+ return null;
+ }
+ return null;
+ }
+
+ public Object next() {
+ yacySeed next = nextSeed;
+ nextSeed = nextInternal();
+ return next;
+ }
+
+ public void remove() {
+ throw new UnsupportedOperationException();
+ }
+
+ }
+
public Iterator getAcceptRemoteIndexSeeds(String starthash) {
// returns an enumeration of yacySeed-Objects
// that have the AcceptRemoteIndex-Flag set
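
The iterator added above prefetches one element: nextSeed is filled by nextInternal(), so next() can hand out the current element and eagerly fetch its successor, and remove() is unsupported. A generic form of that look-ahead pattern, with a positive-count filter standing in for the seed check:

    import java.util.Iterator;
    import java.util.NoSuchElementException;

    // Generic form of the look-ahead pattern used by the seed iterator above:
    // one element is prefetched so the iterator can always answer whether
    // more elements exist without consuming them.
    public abstract class LookAheadIterator<T> implements Iterator<T> {
        private T next;
        private boolean primed = false;

        protected abstract T fetch(); // return null when the source is exhausted

        public boolean hasNext() {
            if (!primed) { next = fetch(); primed = true; }
            return next != null;
        }

        public T next() {
            if (!hasNext()) throw new NoSuchElementException();
            T current = next;
            next = fetch(); // prefetch the successor immediately
            return current;
        }

        public void remove() { throw new UnsupportedOperationException(); }

        public static void main(String[] args) {
            final int[] counts = {0, 3, 0, 7}; // only positive counts are yielded
            Iterator<Integer> it = new LookAheadIterator<Integer>() {
                private int i = 0;
                protected Integer fetch() {
                    while (i < counts.length) {
                        int c = counts[i++];
                        if (c > 0) return Integer.valueOf(c); // mirrors the count > 0 filter
                    }
                    return null;
                }
            };
            while (it.hasNext()) System.out.println(it.next());
        }
    }
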
diff --git a/source/de/anomic/yacy/yacyVersion.java b/source/de/anomic/yacy/yacyVersion.java
index 4c1f67f91..792b0e04b 100644
--- a/source/de/anomic/yacy/yacyVersion.java
+++ b/source/de/anomic/yacy/yacyVersion.java
@@ -53,7 +53,7 @@ public final class yacyVersion implements Comparator, Comparable {
public static final float YACY_SUPPORTS_GZIP_POST_REQUESTS = (float) 0.40300772;
public static final float YACY_ACCEPTS_RANKING_TRANSMISSION = (float) 0.414;
public static final float YACY_HANDLES_COLLECTION_INDEX = (float) 0.486;
- public static final float YACY_PROVIDES_CRAWLS_VIA_LIST_HTML = (float) 0.50403367;
+ public static final float YACY_PROVIDES_REMOTECRAWL_LISTS = (float) 0.550;
// information about latest release, retrieved by other peers release version
public static double latestRelease = 0.1; // this value is overwritten when a peer with later version appears
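
Peers negotiate features by comparing reported version numbers against float constants like the one renamed above. A sketch of such a gate; peerVersion stands in for the version field a remote seed reports:

    // Sketch of how such version constants gate peer-to-peer features: a
    // feature is used only if the remote peer reports at least the required
    // version.
    public class VersionGateSketch {
        public static final float YACY_PROVIDES_REMOTECRAWL_LISTS = (float) 0.550;

        static boolean canServeRemoteCrawlLists(float peerVersion) {
            return peerVersion >= YACY_PROVIDES_REMOTECRAWL_LISTS;
        }

        public static void main(String[] args) {
            System.out.println(canServeRemoteCrawlLists(0.551f)); // true
            System.out.println(canServeRemoteCrawlLists(0.504f)); // false
        }
    }
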
diff --git a/yacy.init b/yacy.init
index 55264ca16..6b536fb45 100644
--- a/yacy.init
+++ b/yacy.init
@@ -558,12 +558,12 @@ filterOutStopwordsFromTopwords=true
50_localcrawl_busysleep__pro=100
50_localcrawl_memprereq=4194304
50_localcrawl_isPaused=false
-61_globalcrawltrigger_idlesleep=10000
-61_globalcrawltrigger_busysleep=500
-61_globalcrawltrigger_memprereq=2097152
-61_globalcrawltrigger_isPaused=false
+60_remotecrawlloader_idlesleep=10000
+60_remotecrawlloader_busysleep=2000
+60_remotecrawlloader_memprereq=2097152
+60_remotecrawlloader_isPaused=false
62_remotetriggeredcrawl_idlesleep=10000
-62_remotetriggeredcrawl_busysleep=1000
+62_remotetriggeredcrawl_busysleep=500
62_remotetriggeredcrawl_memprereq=6291456
62_remotetriggeredcrawl_isPaused=false
70_cachemanager_idlesleep=1000
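
Since yacy.init is plain key=value text, the changed defaults can be sanity-checked with java.util.Properties; the relative path below assumes the file sits in the working directory:

    import java.io.FileInputStream;
    import java.io.IOException;
    import java.util.Properties;

    // yacy.init is plain key=value text, so the new defaults can be checked
    // with java.util.Properties.
    public class InitDefaultsSketch {
        public static void main(String[] args) throws IOException {
            Properties p = new Properties();
            FileInputStream in = new FileInputStream("yacy.init");
            try { p.load(in); } finally { in.close(); }
            System.out.println("60_remotecrawlloader_busysleep = "
                    + p.getProperty("60_remotecrawlloader_busysleep"));   // expected: 2000
            System.out.println("62_remotetriggeredcrawl_busysleep = "
                    + p.getProperty("62_remotetriggeredcrawl_busysleep")); // expected: 500
        }
    }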