From 89b9b2b02a2494097847147c66ba5d5abc721f88 Mon Sep 17 00:00:00 2001
From: orbiter
Date: Thu, 29 Nov 2007 02:07:37 +0000
Subject: [PATCH] redesigned remote crawl process:

- instead of pushing urls to other peers, urls are now actively pulled by the peer that wants to do a remote crawl (see the sketch below)
- the remote crawl push process has been removed
- a process that adds urls from remote peers has been added
- the server-side interface for providing 'limit'-urls has existed since 0.55 and works with this version
- the list-interface has been removed
- servlets using the list-interface have been removed (that implementation did not properly handle the double-check)
- the configuration file was changed to support the new pull process
- fixed a bug in the crawl balancer (status was not saved/closed properly)
- the yacy/urls-protocol was extended to support different networks/clusters
- many interface adaptations for the new stack counters

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4232 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
 build.properties                              |    2 +-
 htroot/CrawlURLFetchStack_p.html              |   68 ---
 htroot/CrawlURLFetchStack_p.java              |  299 ----------
 htroot/CrawlURLFetch_p.html                   |  107 ----
 htroot/CrawlURLFetch_p.java                   |  543 ------------------
 htroot/PerformanceQueues_p.java               |    2 +-
 htroot/ScreenSaver.java                       |    6 +-
 htroot/Status.java                            |    7 -
 htroot/Status_p.inc                           |    6 -
 htroot/WatchCrawler_p.html                    |   10 +
 htroot/WatchCrawler_p.java                    |    6 +-
 .../templates/submenuCrawlURLFetch.template   |    7 -
 htroot/js/WatchCrawler.js                     |    9 +-
 htroot/rct_p.html                             |   30 +
 htroot/rct_p.java                             |  124 ++++
 htroot/xml/queues_p.java                      |   12 +-
 htroot/xml/queues_p.xml                       |   16 +
 htroot/yacy/crawlOrder.java                   |    4 +-
 htroot/yacy/list.html                         |    1 -
 htroot/yacy/list.java                         |  152 -----
 htroot/yacy/urls.java                         |  114 +++-
 htroot/yacy/urls.xml                          |    2 +-
 .../plasma/crawler/plasmaCrawlQueues.java     |  291 +++-------
 .../de/anomic/plasma/plasmaCrawlBalancer.java |    1 -
 .../de/anomic/plasma/plasmaSwitchboard.java   |   39 +-
 source/de/anomic/xml/rssReader.java           |   59 +-
 source/de/anomic/yacy/yacyClient.java         |   97 ++--
 source/de/anomic/yacy/yacyDHTAction.java      |   47 ++
 source/de/anomic/yacy/yacyVersion.java        |    2 +-
 yacy.init                                     |   10 +-
 30 files changed, 564 insertions(+), 1509 deletions(-)
 delete mode 100644 htroot/CrawlURLFetchStack_p.html
 delete mode 100644 htroot/CrawlURLFetchStack_p.java
 delete mode 100644 htroot/CrawlURLFetch_p.html
 delete mode 100644 htroot/CrawlURLFetch_p.java
 delete mode 100644 htroot/env/templates/submenuCrawlURLFetch.template
 create mode 100644 htroot/rct_p.html
 create mode 100644 htroot/rct_p.java
 delete mode 100644 htroot/yacy/list.html
 delete mode 100644 htroot/yacy/list.java

diff --git a/build.properties b/build.properties
index a237a15f4..ffd3b2acc 100644
--- a/build.properties
+++ b/build.properties
@@ -3,7 +3,7 @@ javacSource=1.4
 javacTarget=1.4
 
 # Release Configuration
-releaseVersion=0.554
+releaseVersion=0.555
 releaseFile=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
 proReleaseFile=yacy_pro_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
 releaseFileParentDir=yacy
diff --git a/htroot/CrawlURLFetchStack_p.html b/htroot/CrawlURLFetchStack_p.html
deleted file mode 100644
index 152c1a683..000000000
--- a/htroot/CrawlURLFetchStack_p.html
+++ /dev/null
@@ -1,68 +0,0 @@
- - - - YaCy '#[clientname]#': URL Fetcher Stack Management - #%env/templates/metas.template%# - - - #%env/templates/header.template%# - #%env/templates/submenuCrawlURLFetch.template%# -
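The redesigned pull process referenced in the commit message works as follows: a peer that accepts remote crawls asks its seed DB for peers that announce pending crawl URLs (via the new yacyDHTAction.getProvidesRemoteCrawlURLs), picks one provider, fetches up to a fixed number of that provider's URLs as an RSS feed over the yacy/urls interface, and stacks each entry on its own crawl queue. A condensed, non-authoritative sketch of one such pull, using only classes and calls that appear in this patch (compare rct_p.java and plasmaCrawlQueues.remoteCrawlLoaderJob below); peer selection and error handling are simplified:

import java.net.MalformedURLException;
import java.text.ParseException;
import java.util.Date;

import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverDate;
import de.anomic.xml.rssReader;
import de.anomic.yacy.yacyClient;
import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacySeed;
import de.anomic.yacy.yacyURL;

public class RemoteCrawlPullSketch {

    // pull one batch of crawl urls from the given provider peer and stack them locally
    public static void pullOnce(plasmaSwitchboard sb, String providerHash) {
        yacySeed seed = yacyCore.seedDB.getConnected(providerHash);
        if (seed == null) return; // provider is not online
        // fetch up to 10 pending crawl urls as rss over the yacy/urls interface
        rssReader reader = yacyClient.queryRemoteCrawlURLs(seed, 10);
        if (reader == null) return;
        for (int i = 0; i < reader.items(); i++) {
            rssReader.Item item = reader.getItem(i);
            yacyURL url;
            try {
                url = new yacyURL(item.getLink(), null);
            } catch (MalformedURLException e) {
                continue; // skip malformed entries from the provider
            }
            Date loaddate;
            try {
                loaddate = serverDate.parseShortSecondTime(item.getPubDate());
            } catch (ParseException e) {
                loaddate = new Date();
            }
            if (!sb.acceptURL(url)) continue; // url outside of our domain/network
            // stack on the local crawler; null means accepted, "double..." means already known
            String reason = sb.crawlStacker.stackCrawl(url, null, providerHash,
                    "REMOTE-CRAWLING", loaddate, 0, sb.defaultRemoteProfile);
            if (reason != null) sb.getLog().logInfo("ignored [" + reason + "] url: " + url);
        }
    }
}

The production code in plasmaCrawlQueues additionally caches the provider hashes in a list and consumes one provider per job call, so the pull load is spread over all announcing peers.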

Manage stack for remote URL fetches

- - #(addedUrls)#::Added #[added]# URLs!#(/addedUrls)# -
-
Statistics -
-
Currently stacked URLs:
#[urlCount]#
-
Total fetched / added URLs:
#[totalFetched]# / #[totalAdded]#
- #{peers}# -
Fetched from #[peer]#
#[amount]#
#{/peers}# -
-
- -
Settings -
-
:
-
- - - #(set)#:: - Set max. size for each transfer to #[value]#:: - Setting max. size for each transfer to #[value]# was unsuccessful: may not be negative#(/set)# -
-
-
- -
Add URLs to stack -
-
:
-
- - of #[locurls]# URLs - #(shiftloc)#:: - Shifted #[value]# URLs from Local Crawler Queue to URL Fetcher Stack (not bound: #[failed]#)#(/shiftloc)# -
-
:
-
- - of #[remurls]# URLs - #(shiftrem)#:: - Shifted #[value]# URLs from Remote Crawler Queue to URL Fetcher Stack (not bound: #[failed]#)#(/shiftrem)# -
-
:
-
- #(uploadError)#:: No file entered for upload#(/uploadError)#
-
-
-
- - #(upload)#:: - Added #[added]# and rejected #[failed]# URLs from uploaded file successfully:: - An internal error occurred processing the uploaded file: #[error]##(/upload)# -
-
-
-
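The upload form above was one way into the stack; the servlet deleted next also accepted bulk URL transfers over a line-based .stream interface, driven by the STREAM_CMD_*/STREAM_RESP_* constants visible in the Java file that follows. A minimal client sketch of that removed protocol; the raw-socket handshake and response framing are assumptions, and the admin authentication required for *_p pages is omitted:

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.net.Socket;

public class StreamAddURLsSketch {
    public static void main(String[] args) throws Exception {
        String[] urls = { "http://example.net/a.html", "http://example.net/b.html" };
        Socket s = new Socket("localhost", 8080);
        PrintWriter out = new PrintWriter(s.getOutputStream(), true);
        BufferedReader in = new BufferedReader(new InputStreamReader(s.getInputStream()));
        out.println("GET /CrawlURLFetchStack_p.stream HTTP/1.0"); // path assumed from the servlet name
        out.println();
        // announce a batch with blacklist checking; the servlet answers OK or FAILED
        out.println("ADD URLS CHECK BLACKLIST: " + urls.length);
        System.out.println(in.readLine()); // NOTE: handling of the streamed response is simplified here
        for (int i = 0; i < urls.length; i++) out.println(urls[i]); // send the batch
        System.out.println(in.readLine()); // per-batch status, e.g. "FAILED URLS: ..."
        out.println("END"); // terminate the session
        s.close();
    }
}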
- #%env/templates/footer.template%# - - \ No newline at end of file diff --git a/htroot/CrawlURLFetchStack_p.java b/htroot/CrawlURLFetchStack_p.java deleted file mode 100644 index 40e868b4c..000000000 --- a/htroot/CrawlURLFetchStack_p.java +++ /dev/null @@ -1,299 +0,0 @@ -// CrawlURLFetchStack_p.java -// ------------------------------------- -// part of YACY -// -// (C) 2007 by Franz Brausze -// -// last change: $LastChangedDate: $ by $LastChangedBy: $ -// -// This program is free software; you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation; either version 2 of the License, or -// (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -// -// Using this software in any meaning (reading, learning, copying, compiling, -// running) means that you agree that the Author(s) is (are) not responsible -// for cost, loss of data or any harm that may be caused directly or indirectly -// by usage of this softare or this documentation. The usage of this software -// is on your own risk. The installation and usage (starting/running) of this -// software may allow other people or application to access your computer and -// any attached devices and is highly dependent on the configuration of the -// software which must be done by the user of the software; the author(s) is -// (are) also not responsible for proper configuration and usage of the -// software, even if provoked by documentation provided together with -// the software. -// -// Any changes to this file according to the GPL as documented in the file -// gpl.txt aside this file in the shipment you received can be done to the -// lines that follows this copyright notice here, but changes must not be -// done inside the copyright notive above. A re-distribution must contain -// the intact and unchanged copyright notice. -// Contributions and changes to the program code must be marked as such. 
- -import java.io.BufferedReader; -import java.io.File; -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.io.OutputStream; -import java.io.PrintWriter; -import java.io.Writer; -import java.net.MalformedURLException; -import java.util.HashMap; -import java.util.Iterator; - -import de.anomic.data.URLFetcherStack; -import de.anomic.htmlFilter.htmlFilterContentScraper; -import de.anomic.htmlFilter.htmlFilterWriter; -import de.anomic.http.httpHeader; -import de.anomic.plasma.plasmaCrawlEntry; -import de.anomic.plasma.plasmaCrawlNURL; -import de.anomic.plasma.plasmaSwitchboard; -import de.anomic.plasma.urlPattern.plasmaURLPattern; -import de.anomic.server.serverFileUtils; -import de.anomic.server.serverObjects; -import de.anomic.server.serverSwitch; -import de.anomic.server.logging.serverLog; -import de.anomic.yacy.yacyURL; - -public class CrawlURLFetchStack_p { - - public static final HashMap /* of PeerName, sent URLs */ fetchMap = new HashMap(); - private static URLFetcherStack stack = null; - public static int maxURLsPerFetch = 50; - - public static URLFetcherStack getURLFetcherStack(serverSwitch env) { - if (stack == null) try { - stack = new URLFetcherStack(env.getConfigPath(plasmaSwitchboard.DBPATH, plasmaSwitchboard.DBPATH_DEFAULT)); - } catch (IOException e) { - serverLog.logSevere("URLFETCHER", "Couldn't initialize URL stack: " + e.getMessage()); - } - return stack; - } - - public static final String STREAM_CMD_ADDURLS_ = "ADD URLS: "; - public static final String STREAM_CMD_ADDURLSBLCHK_ = "ADD URLS CHECK BLACKLIST: "; - public static final String STREAM_CMD_END = "END"; - public static final String STREAM_RESP_OK_ADDURLS_ = "FAILED URLS: "; - public static final String STREAM_RESP_OK = "OK"; - public static final String STREAM_RESP_FAILED = "FAILED"; - - public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) { - final serverObjects prop = new serverObjects(); - plasmaSwitchboard sb = (plasmaSwitchboard)env; - - if (((String)header.get(httpHeader.CONNECTION_PROP_PATH)).endsWith(".stream")) { - /* ================================================================= - * .stream request - * ================================================================= */ - InputStream in = (InputStream)header.get(httpHeader.CONNECTION_PROP_INPUTSTREAM); - OutputStream out = (OutputStream)header.get(httpHeader.CONNECTION_PROP_OUTPUTSTREAM); - BufferedReader inrb = new BufferedReader(new InputStreamReader(in)); - PrintWriter outw = new PrintWriter(out); - - String line; - int addurls = 0, cururl = 0; - boolean[] status = new boolean[0]; - boolean blchk = false; - URLFetcherStack stack = getURLFetcherStack(env); - try { - while ((line = inrb.readLine()) != null) { - // commands - if (line.startsWith(STREAM_CMD_ADDURLS_)) { - try { - addurls = Integer.parseInt(line.substring(STREAM_CMD_ADDURLS_.length())); - status = new boolean[addurls]; - cururl = 0; - blchk = false; - outw.println(STREAM_RESP_OK); - } catch (NumberFormatException e) { - outw.println(STREAM_RESP_FAILED); - } - } else if (line.startsWith(STREAM_CMD_ADDURLSBLCHK_)) { - try { - addurls = Integer.parseInt(line.substring(STREAM_CMD_ADDURLSBLCHK_.length())); - status = new boolean[addurls]; - cururl = 0; - blchk = true; - outw.println(STREAM_RESP_OK); - } catch (NumberFormatException e) { - outw.println(STREAM_RESP_FAILED); - } - } else if (line.equals(STREAM_CMD_END)) { - break; - } else { - if (cururl < addurls) // add url - status[cururl++] = 
addURL(line, blchk, stack); - - if (cururl > 0 && cururl == addurls ) { - // done with parsing the passed URL count, now some status output: i.e. 'FAILED URLS: 5 of 8' - outw.print(STREAM_RESP_OK_ADDURLS_); - StringBuffer stat = new StringBuffer(); - for (int i=0; i 0) { - maxURLsPerFetch = count; - prop.put("set", "1"); - prop.put("set_value", maxURLsPerFetch); - } else { - prop.put("set", "2"); - prop.put("set_value", count); - } - } - else if (post.containsKey("shiftlcq")) { - final int count = Math.min(post.getInt("shiftloc", 0), sb.crawlQueues.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE)); - final int failed = shiftFromNotice(sb.crawlQueues.noticeURL, plasmaCrawlNURL.STACK_TYPE_CORE, getURLFetcherStack(env), count); - prop.put("shiftloc", "1"); - prop.put("shiftloc_value", count - failed); - prop.put("shiftloc_failed", failed); - } - else if (post.containsKey("shiftrcq")) { - final int count = post.getInt("shiftrem", 0); - final int failed = shiftFromNotice(sb.crawlQueues.noticeURL, plasmaCrawlNURL.STACK_TYPE_LIMIT, getURLFetcherStack(env), count); - prop.put("shiftrem", "1"); - prop.put("shiftrem_value", count - failed); - prop.put("shiftrem_failed", failed); - } - else if (post.containsKey("subupload")) { - if (post.get("upload", "").length() == 0) { - prop.put("uploadError", "1"); - } else { - final File file = new File(post.get("upload", "")); - final String content = new String((byte[])post.get("upload$file")); - - final String type = post.get("uploadType", ""); - final boolean blCheck = post.containsKey("blacklistCheck"); - if (type.equals("plain")) { - prop.put("upload_added", addURLs(content.split("\n"), blCheck, getURLFetcherStack(env))); - prop.put("upload_failed", "0"); - prop.put("upload", "1"); - } else if (type.equals("html")) { - try { - final htmlFilterContentScraper scraper = new htmlFilterContentScraper(new yacyURL(file)); - final Writer writer = new htmlFilterWriter(null, null, scraper, null, false); - serverFileUtils.write(content, writer); - writer.close(); - - final Iterator it = ((HashMap)scraper.getAnchors()).keySet().iterator(); - int added = 0, failed = 0; - yacyURL url; - while (it.hasNext()) try { - url = new yacyURL((String) it.next(), null); - if (blCheck && plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_CRAWLER, url)) { - failed++; - continue; - } - getURLFetcherStack(env).push(url); - added++; - } catch (MalformedURLException e) { failed++; } - prop.put("upload", "1"); - prop.put("upload_added", added); - prop.put("upload_failed", failed); - } catch (Exception e) { - e.printStackTrace(); - prop.put("upload", "2"); - prop.putHTML("upload_error", e.getMessage()); - } - } - } - } - } - - putFetched(prop); - prop.put("urlCount", getURLFetcherStack(env).size()); - prop.put("totalFetched", getURLFetcherStack(env).getPopped()); - prop.put("totalAdded", getURLFetcherStack(env).getPushed()); - prop.put("maxSize", maxURLsPerFetch); - prop.put("locurls", sb.crawlQueues.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE)); - prop.put("remurls", sb.crawlQueues.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT)); - prop.put("locurlsVal", Math.min(sb.crawlQueues.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE), 500)); - prop.put("remurlsVal", Math.min(sb.crawlQueues.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT), 500)); - return prop; - } - - private static void putFetched(serverObjects prop) { - Iterator it = fetchMap.keySet().iterator(); - int count = 0; - while (it.hasNext()) { - String key = (String)it.next(); - 
prop.putHTML("peers_" + count + "_peer", key); - prop.put("peers_" + count + "_amount", ((Integer)fetchMap.get(key)).intValue()); - count++; - } - prop.put("peers", count); - } - - private static int addURLs(String[] urls, boolean blCheck, URLFetcherStack stack) { - int count = -1; - for (int i=0; i - - - YaCy '#[clientname]#': URL Fetcher Management - #%env/templates/metas.template%# - - - #%env/templates/header.template%# - #%env/templates/submenuCrawlURLFetch.template%# -

URL-Fetcher

-
-
Fetch new URLs to crawl -

- The newly added URLs will be crawled without any filter restrictions except for the static stop-words. - The Re-Crawl option isn't used and the sites won't be stored in the Proxy Cache. Text and media types will be indexed. - Since these URLs will be requested explicitly from another peer, they won't be distributed for remote indexing. -

-
-
:
-
- - - #(hostError)#:: Malformed URL#(/hostError)# - #(saved)#:: -
-
:
-
- - #(/saved)# -
- - #(peersKnown)#:: -
:
-
- - - -  : - - #(peerError)#:: -  Error fetching URL-list from #[hash]#:#[name]#:: -  Peer with hash #[hash]# doesn't seem to be online anymore#(/peerError)# -
#(/peersKnown)# - -
Frequency:
-
-
-
- : - -   - - #(freqError)#:: Invalid period, fetching only once#(/freqError)# -
-
-
-
-
- - #(threadError)#:: - Error on stopping thread, it isn't alive anymore:: - Error on restarting thread, it isn't alive anymore#(/threadError)# - - #(runs)#:: -
-
Thread to fetch URLs is #(status)#running::stopped::paused#(/status)# -
-
Total runs:
#[totalRuns]#
-
Total fetched URLs:
#[totalFetchedURLs]#
-
Total failed URLs:
#[totalFailedURLs]#
-
Last run duration:
#[lastRun]# ms
-
Last server response:
#[lastServerResponse]#
-
Last fetched URLs:
#[lastFetchedURLs]#
-
Last failed URLs:
-
- #[error]# -
    #{error}# -
  • #[reason]#: #[url]#
  • #{/error}# -
-
-
:
-
- minutes - -
-
#(status)# - :: - :: - - #(/status)# -
-
-
-
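The frequency entered in the form above is turned into a fetch delay in milliseconds by the servlet's getDate() helper (visible in the Java file deleted below); anything that does not parse to a positive number yields -1, which the fetcher treats as "fetch only once". The conversion, mirrored here for clarity:

public class FrequencySketch {
    // mirrors the deleted getDate(count, type); -1 means "fetch only once"
    static long frequencyToMillis(String count, String type) {
        long r = 0;
        if (count != null && count.matches("\\d+")) r = Long.parseLong(count);
        if (r < 1) return -1;
        r *= 60000; // base unit is minutes
        if (type.equals("days")) return r * 60 * 24;
        else if (type.equals("hours")) return r * 60;
        else if (type.equals("minutes")) return r;
        else return -1;
    }
    public static void main(String[] args) {
        System.out.println(frequencyToMillis("2", "hours")); // prints 7200000 (two hours)
    }
}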
- #(/runs)# - #%env/templates/footer.template%# - - \ No newline at end of file diff --git a/htroot/CrawlURLFetch_p.java b/htroot/CrawlURLFetch_p.java deleted file mode 100644 index 5e19f2d02..000000000 --- a/htroot/CrawlURLFetch_p.java +++ /dev/null @@ -1,543 +0,0 @@ -// CrawlURLFetch_p.java -// ------------------------------------- -// part of YACY -// -// (C) 2007 by Franz Brausze -// -// last change: $LastChangedDate: $ by $LastChangedBy: $ -// -// This program is free software; you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation; either version 2 of the License, or -// (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -// -// Using this software in any meaning (reading, learning, copying, compiling, -// running) means that you agree that the Author(s) is (are) not responsible -// for cost, loss of data or any harm that may be caused directly or indirectly -// by usage of this softare or this documentation. The usage of this software -// is on your own risk. The installation and usage (starting/running) of this -// software may allow other people or application to access your computer and -// any attached devices and is highly dependent on the configuration of the -// software which must be done by the user of the software; the author(s) is -// (are) also not responsible for proper configuration and usage of the -// software, even if provoked by documentation provided together with -// the software. -// -// Any changes to this file according to the GPL as documented in the file -// gpl.txt aside this file in the shipment you received can be done to the -// lines that follows this copyright notice here, but changes must not be -// done inside the copyright notive above. A re-distribution must contain -// the intact and unchanged copyright notice. -// Contributions and changes to the program code must be marked as such. 
- -import java.io.IOException; -import java.net.MalformedURLException; -import java.util.ArrayList; -import java.util.Date; -import java.util.HashMap; -import java.util.Iterator; -import java.util.Random; -import java.util.TreeMap; - -import de.anomic.plasma.plasmaCrawlProfile; -import de.anomic.plasma.plasmaCrawlZURL; -import de.anomic.plasma.plasmaSwitchboard; -import de.anomic.server.serverByteBuffer; -import de.anomic.server.serverSwitch; -import de.anomic.http.httpHeader; -import de.anomic.http.httpRemoteProxyConfig; -import de.anomic.http.httpc; -import de.anomic.server.serverObjects; -import de.anomic.server.logging.serverLog; -import de.anomic.yacy.yacyCore; -import de.anomic.yacy.yacySeed; -import de.anomic.yacy.yacyURL; -import de.anomic.yacy.yacyVersion; - -public class CrawlURLFetch_p { - - private static final long ERR_DATE = 1; - private static final long ERR_HOST_MALFORMED_URL = 1; - private static final long ERR_PEER_GENERAL_CONN = 1; - private static final long ERR_PEER_OFFLINE = 2; - private static final long ERR_THREAD_STOP = 1; - private static final long ERR_THREAD_RESUME = 2; - - private static final long STAT_THREAD_ALIVE = 0; - private static final long STAT_THREAD_STOPPED = 1; - private static final long STAT_THREAD_PAUSED = 2; - - private static URLFetcher fetcher = null; - private static plasmaCrawlProfile.entry profile = null; - private static ArrayList savedURLs = new ArrayList(); - - public static plasmaCrawlProfile.entry getCrawlProfile(serverSwitch env) { - if (profile == null) { - profile = ((plasmaSwitchboard)env).profilesActiveCrawls.newEntry( - "URLFetcher", // Name - null, // URL - ".*", ".*", // General / specific filter - 0, 0, // General / specific depth - -1, -1, -1, // Recrawl / Dom-filter depth / Dom-max-pages - true, // Crawl query - true, true, // Index text / media - false, true, // Store in HT- / TX-Cache - false, // Remote indexing - true, false, false); // Exclude static / dynamic / parent stopwords - } - return profile; - } - - public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) { - serverObjects prop = new serverObjects(); - prop.put("host", ""); - - // List previously saved URLs for easy selection - listURLs(prop); - - // List known hosts - listPeers(prop, - post != null && post.containsKey("checkPeerURLCount"), - ((plasmaSwitchboard)env).remoteProxyConfig); - - if (post != null) { - if (post.containsKey("start")) { - long frequency = URLFetcher.DELAY_ONCE; - if (post.containsKey("reg")) { - if (post.get("reg", "").equals("self_det")) { - frequency = URLFetcher.DELAY_SELF_DET; - } else if (post.get("reg", "").equals("delay")) { - frequency = getDate(post.get("frequency", ""), post.get("freq_type", "")); - if (frequency == -1) - prop.put("freqError", ERR_DATE); - } - } - - int count = 50; - if (post.get("amount", "").matches("\\d+")) { - count = Integer.parseInt(post.get("amount", "")); - if (count > 999) count = 999; - } - - if (fetcher != null) fetcher.interrupt(); - fetcher = null; - if (post.get("source", "").equals("peer") && - post.get("peerhash", "").equals("random")) { - fetcher = new URLFetcher( - env, - getCrawlProfile(env), - count, - frequency); - } else { - yacyURL url = null; - if (post.get("source", "").equals("url")) { - try { - url = new yacyURL(post.get("host", null), null); - if (!savedURLs.contains(url.toNormalform(true, true))) - savedURLs.add(url.toNormalform(true, true)); - prop.put("host", post.get("host", url.toString())); - } catch (MalformedURLException e) { - 
prop.put("host", post.get("host", "")); - prop.put("hostError", ERR_HOST_MALFORMED_URL); - } - } else if (post.get("source", "").equals("savedURL")) { - try { - url = new yacyURL(post.get("saved", ""), null); - } catch (MalformedURLException e) { - /* should never appear, except for invalid input, see above */ - } - } else if (post.get("source", "").equals("peer")) { - yacySeed ys = null; - ys = yacyCore.seedDB.get(post.get("peerhash", null)); - if (ys != null) { - if ((url = URLFetcher.getListServletURL( - ys.getPublicAddress(), - URLFetcher.MODE_LIST, - count, - yacyCore.seedDB.mySeed().hash)) == null) { - prop.put("peerError", ERR_PEER_GENERAL_CONN); - prop.put("peerError_hash", post.get("peerhash", "")); - prop.put("peerError_name", ys.getName()); - } - } else { - prop.put("peerError", ERR_PEER_OFFLINE); - prop.put("peerError_hash", post.get("peerhash", "")); - } - } - - if (url != null) { - fetcher = new URLFetcher( - env, - getCrawlProfile(env), - url, - count, - frequency); - } - } - if (fetcher != null) fetcher.start(); - } - else if (post.containsKey("stop")) { - if (fetcher != null) { - fetcher.interrupt(); - } else { - prop.put("threadError", ERR_THREAD_STOP); - } - } - else if (post.containsKey("restart")) { - if (fetcher != null) { - fetcher.interrupt(); - if (fetcher.url == null) { - fetcher = new URLFetcher( - env, - getCrawlProfile(env), - fetcher.count, - fetcher.delay); - } else { - fetcher = new URLFetcher( - env, - getCrawlProfile(env), - fetcher.url, - fetcher.count, - fetcher.delay); - } - fetcher.start(); - } else { - prop.put("threadError", ERR_THREAD_RESUME); - } - } - else if (post.containsKey("resetDelay")) { - final long frequency = getDate(post.get("newDelay", ""), "minutes"); - if (frequency == -1) { - prop.put("freqError", ERR_DATE); - } else { - fetcher.delay = frequency; - } - } - prop.put("LOCATION", "/CrawlURLFetch_p.html"); - } - - if (fetcher != null) { - prop.put("runs", "1"); - prop.put("runs_status", - ((fetcher.paused && fetcher.isAlive()) ? STAT_THREAD_PAUSED : - (fetcher.isAlive()) ? STAT_THREAD_ALIVE : STAT_THREAD_STOPPED)); - prop.putNum("runs_totalRuns", URLFetcher.totalRuns); - prop.putNum("runs_totalFetchedURLs", URLFetcher.totalFetchedURLs); - prop.putNum("runs_totalFailedURLs", URLFetcher.totalFailed); - prop.putNum("runs_lastRun", fetcher.lastRun); - prop.putNum("runs_lastFetchedURLs", fetcher.lastFetchedURLs); - prop.put("runs_lastServerResponse", (fetcher.lastServerResponse == null) - ? 
"" : fetcher.lastServerResponse); - prop.putNum("runs_curDelay", (int)(fetcher.delay / 60000)); - - Iterator it = fetcher.failed.keySet().iterator(); - int i = 0; - Object key; - while (it.hasNext()) { - key = it.next(); - prop.put("runs_error_" + i + "_reason", fetcher.failed.get(key)); - prop.put("runs_error_" + i + "_url", (String)key); - i++; - } - prop.put("runs_error", i); - } - - return prop; - } - - private static int listURLs(serverObjects prop) { - if (savedURLs.size() == 0) return 0; - prop.put("saved", "1"); - for (int i=0; i 0) { - final Iterator e = yacyCore.seedDB.seedsConnected(true, false, null, yacyVersion.YACY_PROVIDES_CRAWLS_VIA_LIST_HTML); - int dbsize; - while (e.hasNext()) { - yacySeed seed = (yacySeed) e.next(); - if (seed != null && !seed.hash.equals(yacyCore.seedDB.mySeed().hash)) { - peername = seed.get(yacySeed.NAME, "nameless"); - if (checkURLCount && (dbsize = getURLs2Fetch(seed, theRemoteProxyConfig)) > 0) { - hostList.put(peername + " (" + dbsize + ")", seed.hash); - } else { - hostList.put(peername, seed.hash); - } - } - } - } - - if (hostList.size() > 0) { - while (!hostList.isEmpty() && (peername = (String) hostList.firstKey()) != null) { - final String hash = (String) hostList.get(peername); - prop.put("peersKnown_peers_" + peerCount + "_hash", hash); - prop.put("peersKnown_peers_" + peerCount + "_name", peername); - hostList.remove(peername); - peerCount++; - } - prop.put("peersKnown_peers", peerCount); - prop.put("peersKnown", "1"); - } else { - prop.put("peersKnown", "0"); - } - return peerCount; - } - - private static int getURLs2Fetch(yacySeed seed, httpRemoteProxyConfig theRemoteProxyConfig) { - try { - String answer = new String(httpc.wget( - URLFetcher.getListServletURL(seed.getPublicAddress(), URLFetcher.MODE_COUNT, 0, null), - seed.getIP(), - 5000, - null, null, - theRemoteProxyConfig, - null, - null)); - if (answer.matches("\\d+")) - return Integer.parseInt(answer); - else { - serverLog.logFine("URLFETCHER", "Retrieved invalid answer from " + seed.getName() + ": '" + answer + "'"); - return -1; - } - } catch (MalformedURLException e) { - /* should not happen */ - return -3; - } catch (IOException e) { - return -2; - } - } - - private static long getDate(String count, String type) { - long r = 0; - if (count != null && count.matches("\\d+")) r = Long.parseLong(count); - if (r < 1) return -1; - - r *= 60000; - if (type.equals("days")) return r * 60 * 24; - else if (type.equals("hours")) return r * 60; - else if (type.equals("minutes")) return r; - else return -1; - } - - public static class URLFetcher extends Thread { - - public static final long DELAY_ONCE = -1; - public static final long DELAY_SELF_DET = 0; - - public static final int MODE_LIST = 0; - public static final int MODE_COUNT = 1; - - public static int totalRuns = 0; - public static int totalFetchedURLs = 0; - public static int totalFailed = 0; - - public final HashMap failed = new HashMap(); - - public int lastFetchedURLs = 0; - public long lastRun = 0; - public String lastServerResponse = null; - public int lastFailed = 0; - - public final yacyURL url; - public final int count; - public long delay; - public final plasmaSwitchboard sb; - public final plasmaCrawlProfile.entry profile; - - public boolean paused = false; - - public static yacyURL getListServletURL(String host, int mode, int count, String peerHash) { - String r = "http://" + host + "/yacy/list.html?list=queueUrls&display="; - - switch (mode) { - case MODE_LIST: r += "list"; break; - case MODE_COUNT: r += "count"; break; 
- } - - if (count > 0) r += "&count=" + count; - - if (peerHash != null && peerHash.length() > 0) { - r += "&iam=" + peerHash; - } else if (mode == MODE_LIST) { - r += "&iam=" + yacyCore.seedDB.mySeed().hash; - } - - try { - return new yacyURL(r, null); - } catch (MalformedURLException e) { - return null; - } - } - - public URLFetcher( - serverSwitch env, - plasmaCrawlProfile.entry profile, - yacyURL url, - int count, - long delayMs) { - if (env == null || profile == null || url == null) - throw new NullPointerException("env, profile or url must not be null"); - this.sb = (plasmaSwitchboard)env; - this.profile = profile; - this.url = url; - this.count = count; - this.delay = delayMs; - this.setName("URLFetcher"); - } - - public URLFetcher( - serverSwitch env, - plasmaCrawlProfile.entry profile, - int count, - long delayMs) { - if (env == null || profile == null) - throw new NullPointerException("env or profile must not be null"); - this.sb = (plasmaSwitchboard)env; - this.profile = profile; - this.url = null; - this.count = count; - this.delay = delayMs; - this.setName("URLFetcher"); - } - - public void run() { - this.paused = false; - long start; - yacyURL url; - while (!isInterrupted()) { - try { - start = System.currentTimeMillis(); - url = getDLURL(); - if (url == null) { - serverLog.logSevere(this.getName(), "canceled because no valid URL for the URL-list could be determinded"); - return; - } - totalFetchedURLs += stackURLs(getURLs(url)); - this.lastRun = System.currentTimeMillis() - start; - totalRuns++; - serverLog.logInfo(this.getName(), "Loaded " + this.lastFetchedURLs + " URLs from " + url + " in " + this.lastRun + " ms into stackcrawler."); - if (this.delay < 0 || isInterrupted()) { - return; - } else synchronized (this) { - if (this.delay == 0) { - this.paused = true; - while (this.paused) this.wait(); - } else { - this.paused = true; - this.wait(this.delay); - } - } - this.paused = false; - } catch (InterruptedException e) { return; } - } - } - - private yacyURL getDLURL() { - if (this.url != null) return this.url; - - // choose random seed - yacySeed ys = null; - Iterator e = yacyCore.seedDB.seedsConnected(true, false, null, yacyVersion.YACY_PROVIDES_CRAWLS_VIA_LIST_HTML); - int num = new Random().nextInt(yacyCore.seedDB.sizeConnected()) + 1; - Object o; - for (int i=0; i#(remoteTriggeredCrawlPaused)#pause remote triggered crawl::continue remote triggered crawl#(/remoteTriggeredCrawlPaused)# #(remoteTriggeredCrawlPaused)# ::(paused)#(/remoteTriggeredCrawlPaused)# - - Global Crawl Trigger - #[globalCrawlTriggerQueueSize]# - #(globalCrawlTriggerPaused)#pause global crawl trigger::continue global crawl trigger#(/globalCrawlTriggerPaused)# - #(globalCrawlTriggerPaused)# ::(paused)#(/globalCrawlTriggerPaused)# - Pre-Queueing #[stackCrawlQueueSize]# diff --git a/htroot/WatchCrawler_p.html b/htroot/WatchCrawler_p.html index 5a935e3c1..6d335fd02 100644 --- a/htroot/WatchCrawler_p.html +++ b/htroot/WatchCrawler_p.html @@ -44,6 +44,16 @@ unlimited + + Limit Crawler +     + + + + + + unlimited + Remote Crawler     diff --git a/htroot/WatchCrawler_p.java b/htroot/WatchCrawler_p.java index 565ee5deb..e2553e75e 100644 --- a/htroot/WatchCrawler_p.java +++ b/htroot/WatchCrawler_p.java @@ -71,7 +71,7 @@ public class WatchCrawler_p { } else { prop.put("info", "0"); - if ((post.containsKey("autoforward")) && (switchboard.crawlQueues.coreCrawlJobSize() == 0)) { + if ((post.containsKey("autoforward")) && (switchboard.crawlQueues.coreCrawlJobSize() == 0) && 
(switchboard.crawlQueues.remoteTriggeredCrawlJobSize() == 0)) { prop.put("forwardToCrawlStart", "1"); } @@ -81,7 +81,7 @@ public class WatchCrawler_p { if (queue.equals("localcrawler")) { switchboard.continueCrawlJob(plasmaSwitchboard.CRAWLJOB_LOCAL_CRAWL); } else if (queue.equals("remotecrawler")) { - switchboard.continueCrawlJob(plasmaSwitchboard.CRAWLJOB_GLOBAL_CRAWL_TRIGGER); + switchboard.continueCrawlJob(plasmaSwitchboard.CRAWLJOB_REMOTE_TRIGGERED_CRAWL); } } @@ -91,7 +91,7 @@ public class WatchCrawler_p { if (queue.equals("localcrawler")) { switchboard.pauseCrawlJob(plasmaSwitchboard.CRAWLJOB_LOCAL_CRAWL); } else if (queue.equals("remotecrawler")) { - switchboard.pauseCrawlJob(plasmaSwitchboard.CRAWLJOB_GLOBAL_CRAWL_TRIGGER); + switchboard.pauseCrawlJob(plasmaSwitchboard.CRAWLJOB_REMOTE_TRIGGERED_CRAWL); } } diff --git a/htroot/env/templates/submenuCrawlURLFetch.template b/htroot/env/templates/submenuCrawlURLFetch.template deleted file mode 100644 index 35264b859..000000000 --- a/htroot/env/templates/submenuCrawlURLFetch.template +++ /dev/null @@ -1,7 +0,0 @@ - \ No newline at end of file diff --git a/htroot/js/WatchCrawler.js b/htroot/js/WatchCrawler.js index 0bc7002d0..96f9d4691 100644 --- a/htroot/js/WatchCrawler.js +++ b/htroot/js/WatchCrawler.js @@ -150,9 +150,16 @@ function handleQueues(){ updateTable(localcrawlerqueue, "local crawler"); + limitcrawlerqueue=getFirstChild(xml, "limitcrawlerqueue"); + updateTable(limitcrawlerqueue, "limitCrawlerTable"); + limitcrawlerqueue_size=getValue(getFirstChild(limitcrawlerqueue, "size")); + limitcrawlerqueue_state=getValue(getFirstChild(limitcrawlerqueue, "state")); + document.getElementById("limitcrawlerqueuesize").firstChild.nodeValue=limitcrawlerqueue_size; + putQueueState("limitcrawler", limitcrawlerqueue_state); + updateTable(limitcrawlerqueue, "limit crawler"); + remotecrawlerqueue=getFirstChild(xml, "remotecrawlerqueue"); updateTable(remotecrawlerqueue, "remoteCrawlerTable"); - remotecrawlerqueue_size=getValue(getFirstChild(remotecrawlerqueue, "size")); remotecrawlerqueue_state=getValue(getFirstChild(remotecrawlerqueue, "state")); document.getElementById("remotecrawlerqueuesize").firstChild.nodeValue=remotecrawlerqueue_size; diff --git a/htroot/rct_p.html b/htroot/rct_p.html new file mode 100644 index 000000000..af4a2dda4 --- /dev/null +++ b/htroot/rct_p.html @@ -0,0 +1,30 @@ + + + + YaCy '#[clientname]#': Index Control + #%env/templates/metas.template%# + + + #%env/templates/header.template%# +

remote crawl fetch test

+ +
+
Retrieve remote crawl URL list +
+
Target Peer:
+
select +
+
+
+
+
+
+
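Submitting the form sends the "retrieve" action together with the selected peer hash to rct_p.java (shown next), which fetches that peer's pending crawl URLs and stacks them locally. The same test can be driven from code; a sketch that assumes a peer on localhost:8080, assumes query parameters reach the servlet's post object, and leaves out the admin authentication that *_p pages require:

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;

public class RctFetchTest {
    public static void main(String[] args) throws Exception {
        String peerHash = args[0]; // hash of a peer that provides remote crawl urls
        // parameter names match rct_p.java: "retrieve" triggers the fetch, "peer" selects the provider
        URL u = new URL("http://localhost:8080/rct_p.html?retrieve=true&peer=" + peerHash);
        BufferedReader in = new BufferedReader(new InputStreamReader(u.openStream()));
        String line;
        while ((line = in.readLine()) != null) System.out.println(line);
        in.close();
    }
}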
+ + #%env/templates/footer.template%# + + \ No newline at end of file diff --git a/htroot/rct_p.java b/htroot/rct_p.java new file mode 100644 index 000000000..c2c33a2f7 --- /dev/null +++ b/htroot/rct_p.java @@ -0,0 +1,124 @@ +// rct_p.java +// ----------------------- +// (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany +// first published 28.11.2007 on http://yacy.net +// +// This is a part of YaCy, a peer-to-peer based web search engine +// +// $LastChangedDate: 2007-11-14 01:15:28 +0000 (Mi, 14 Nov 2007) $ +// $LastChangedRevision: 4216 $ +// $LastChangedBy: orbiter $ +// +// LICENSE +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +import java.net.MalformedURLException; +import java.text.ParseException; +import java.util.Date; +import java.util.Iterator; + +import de.anomic.http.httpHeader; +import de.anomic.plasma.plasmaSwitchboard; +import de.anomic.server.serverDate; +import de.anomic.server.serverObjects; +import de.anomic.server.serverSwitch; +import de.anomic.xml.rssReader; +import de.anomic.yacy.yacyClient; +import de.anomic.yacy.yacyCore; +import de.anomic.yacy.yacySeed; +import de.anomic.yacy.yacyURL; + +public class rct_p { + + public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) { + // return variable that accumulates replacements + plasmaSwitchboard sb = (plasmaSwitchboard) env; + serverObjects prop = new serverObjects(); + + if (post != null) { + if (post.containsKey("retrieve")) { + String peerhash = post.get("peer", null); + yacySeed seed = (peerhash == null) ? null : yacyCore.seedDB.getConnected(peerhash); + rssReader reader = (seed == null) ? null : yacyClient.queryRemoteCrawlURLs(seed, 10); + if (reader != null) { + rssReader.Item item; + for (int i = 0; i < reader.items(); i++) { + item = reader.getItem(i); + //System.out.println("URL=" + item.getLink() + ", desc=" + item.getDescription() + ", pubDate=" + item.getPubDate()); + + // put url on remote crawl stack + yacyURL url; + try { + url = new yacyURL(item.getLink(), null); + } catch (MalformedURLException e) { + url = null; + } + Date loaddate; + try { + loaddate = serverDate.parseShortSecondTime(item.getPubDate()); + } catch (ParseException e) { + loaddate = new Date(); + } + yacyURL referrer = null; // referrer needed! 
+ if (sb.acceptURL(url)) { + // stack url + sb.getLog().logFinest("crawlOrder: stack: url='" + url + "'"); + String reasonString = sb.crawlStacker.stackCrawl(url, referrer, peerhash, "REMOTE-CRAWLING", loaddate, 0, sb.defaultRemoteProfile); + + if (reasonString == null) { + // done + env.getLog().logInfo("crawlOrder: added remote crawl url: " + url.toNormalform(true, false)); + } else if (reasonString.startsWith("double")) { + // case where we have already the url loaded; + env.getLog().logInfo("crawlOrder: ignored double remote crawl url: " + url.toNormalform(true, false)); + } else { + env.getLog().logInfo("crawlOrder: ignored [" + reasonString + "] remote crawl url: " + url.toNormalform(true, false)); + } + } else { + env.getLog().logWarning("crawlOrder: Received URL outside of our domain: " + url.toNormalform(true, false)); + } + } + } + } + } + + listHosts(prop); + + // return rewrite properties + return prop; + } + + private static void listHosts(serverObjects prop) { + // list known hosts + yacySeed seed; + int hc = 0; + if (yacyCore.seedDB != null && yacyCore.seedDB.sizeConnected() > 0) { + Iterator e = yacyCore.dhtAgent.getProvidesRemoteCrawlURLs(); + while (e.hasNext()) { + seed = (yacySeed) e.next(); + if (seed != null) { + prop.put("hosts_" + hc + "_hosthash", seed.hash); + prop.putHTML("hosts_" + hc + "_hostname", seed.hash + " " + seed.get(yacySeed.NAME, "nameless") + " (" + seed.getLong(yacySeed.RCOUNT, 0) + ")"); + hc++; + } + } + prop.put("hosts", hc); + } else { + prop.put("hosts", "0"); + } + } + +} diff --git a/htroot/xml/queues_p.java b/htroot/xml/queues_p.java index 82105b2ac..15e8e8321 100644 --- a/htroot/xml/queues_p.java +++ b/htroot/xml/queues_p.java @@ -163,11 +163,15 @@ public class queues_p { prop.put("localCrawlState", sb.crawlJobIsPaused(plasmaSwitchboard.CRAWLJOB_LOCAL_CRAWL) ? STATE_PAUSED : STATE_RUNNING); int stackSize = sb.crawlQueues.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE); addNTable(prop, "list-local", sb.crawlQueues.noticeURL.top(plasmaCrawlNURL.STACK_TYPE_CORE, Math.min(10, stackSize))); - - + //global crawl queue - prop.putNum("remoteCrawlSize", Integer.toString(sb.getThread(plasmaSwitchboard.CRAWLJOB_GLOBAL_CRAWL_TRIGGER).getJobCount())); - prop.put("remoteCrawlState", sb.crawlJobIsPaused(plasmaSwitchboard.CRAWLJOB_GLOBAL_CRAWL_TRIGGER) ? STATE_PAUSED : STATE_RUNNING); + prop.putNum("limitCrawlSize", Integer.toString(sb.crawlQueues.limitCrawlJobSize())); + prop.put("limitCrawlState", STATE_RUNNING); + stackSize = sb.crawlQueues.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT); + + //global crawl queue + prop.putNum("remoteCrawlSize", Integer.toString(sb.getThread(plasmaSwitchboard.CRAWLJOB_REMOTE_TRIGGERED_CRAWL).getJobCount())); + prop.put("remoteCrawlState", sb.crawlJobIsPaused(plasmaSwitchboard.CRAWLJOB_REMOTE_TRIGGERED_CRAWL) ? 
STATE_PAUSED : STATE_RUNNING); stackSize = sb.crawlQueues.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT); if (stackSize == 0) { diff --git a/htroot/xml/queues_p.xml b/htroot/xml/queues_p.xml index ed82dd877..23cfea8a0 100644 --- a/htroot/xml/queues_p.xml +++ b/htroot/xml/queues_p.xml @@ -49,6 +49,22 @@ #{/list-local}# + + #[limitCrawlSize]# + #[limitCrawlState]# +#{list-limit}# + + #[profile]# + #[initiator]# + #[depth]# + #[modified]# + #[anchor]# + #[url]# + #[hash]# + #(inProcess)#false::true#(/inProcess)# + +#{/list-limit}# + #[remoteCrawlSize]# #[remoteCrawlState]# diff --git a/htroot/yacy/crawlOrder.java b/htroot/yacy/crawlOrder.java index 5016d0694..33b20be67 100644 --- a/htroot/yacy/crawlOrder.java +++ b/htroot/yacy/crawlOrder.java @@ -129,7 +129,7 @@ public final class crawlOrder { delay = "3600"; // may request one hour later again } else try { yacySeed requester = yacyCore.seedDB.getConnected(iam); - int queuesize = switchboard.crawlQueues.coreCrawlJobSize() + switchboard.crawlQueues.limitCrawlTriggerJobSize() + switchboard.crawlQueues.remoteTriggeredCrawlJobSize() + switchboard.queueSize(); + int queuesize = switchboard.crawlQueues.coreCrawlJobSize() + switchboard.crawlQueues.limitCrawlJobSize() + switchboard.crawlQueues.remoteTriggeredCrawlJobSize() + switchboard.queueSize(); if (requester == null) { response = "denied"; reason = "unknown-client"; @@ -190,7 +190,7 @@ public final class crawlOrder { env.getLog().logWarning("crawlOrder: Received not normalized Referer URL " + refv.get(0) + " of URL " + urlv.get(0)); } - if (!switchboard.acceptURL(new yacyURL(newURL, null))) { + if (!switchboard.acceptURL(url)) { env.getLog().logWarning("crawlOrder: Received URL outside of our domain: " + newURL); return null; } diff --git a/htroot/yacy/list.html b/htroot/yacy/list.html deleted file mode 100644 index 285c7277d..000000000 --- a/htroot/yacy/list.html +++ /dev/null @@ -1 +0,0 @@ -#[list]# diff --git a/htroot/yacy/list.java b/htroot/yacy/list.java deleted file mode 100644 index 91cccd224..000000000 --- a/htroot/yacy/list.java +++ /dev/null @@ -1,152 +0,0 @@ -// list.java -// ----------------------- -// part of YaCy -// (C) by Michael Peter Christen; mc@anomic.de -// first published on http://www.anomic.de -// Frankfurt, Germany, 2004 -// -// This File is contributed by Alexander Schier -// -// $LastChangedDate$ -// $LastChangedRevision$ -// $LastChangedBy$ -// -// This program is free software; you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation; either version 2 of the License, or -// (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -// -// Using this software in any meaning (reading, learning, copying, compiling, -// running) means that you agree that the Author(s) is (are) not responsible -// for cost, loss of data or any harm that may be caused directly or indirectly -// by usage of this softare or this documentation. The usage of this software -// is on your own risk. 
The installation and usage (starting/running) of this -// software may allow other people or application to access your computer and -// any attached devices and is highly dependent on the configuration of the -// software which must be done by the user of the software; the author(s) is -// (are) also not responsible for proper configuration and usage of the -// software, even if provoked by documentation provided together with -// the software. -// -// Any changes to this file according to the GPL as documented in the file -// gpl.txt aside this file in the shipment you received can be done to the -// lines that follows this copyright notice here, but changes must not be -// done inside the copyright notive above. A re-distribution must contain -// the intact and unchanged copyright notice. -// Contributions and changes to the program code must be marked as such. - -// You must compile this file with -// javac -classpath .:../../classes list.java -// if the shell's current path is HTROOT - -// contains contributions by [FB] to support listing URLs for URL Fetcher - -import java.io.File; - -import de.anomic.data.URLFetcherStack; -import de.anomic.data.htmlTools; -import de.anomic.data.listManager; -import de.anomic.http.httpHeader; -import de.anomic.plasma.plasmaSwitchboard; -import de.anomic.server.serverCore; -import de.anomic.server.serverObjects; -import de.anomic.server.serverSwitch; -import de.anomic.server.logging.serverLog; -import de.anomic.yacy.yacyCore; -import de.anomic.yacy.yacyNetwork; -import de.anomic.yacy.yacySeed; -import de.anomic.yacy.yacyURL; - -public final class list { - - public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) { - if (post == null || env == null) - throw new NullPointerException("post: " + post + ", sb: " + env); - plasmaSwitchboard sb = (plasmaSwitchboard) env; - - // return variable that accumulates replacements - final serverObjects prop = new serverObjects(); - if ((post == null) || (env == null)) return prop; - if (!yacyNetwork.authentifyRequest(post, env)) return prop; - - final String col = post.get("col", ""); - final File listsPath = env.getConfigPath(plasmaSwitchboard.LISTS_PATH, plasmaSwitchboard.LISTS_PATH_DEFAULT); - - String otherPeerName = null; - if (post.containsKey("iam")) { - yacySeed bla = yacyCore.seedDB.get(post.get("iam", "")); - if (bla != null) otherPeerName = bla.getName(); - } - if (otherPeerName == null) otherPeerName = (String)header.get(httpHeader.CONNECTION_PROP_CLIENTIP); - - if ((sb.isRobinsonMode()) && (!sb.isInMyCluster(otherPeerName))) { - // if we are a robinson cluster, answer only if this client is known by our network definition - return null; - } - - if (col.equals("black")) { - final StringBuffer out = new StringBuffer(); - - final String filenames=env.getConfig("BlackLists.Shared", ""); - final String[] filenamesarray = filenames.split(","); - - if(filenamesarray.length > 0){ - for(int i = 0;i < filenamesarray.length; i++){ - String filename = filenamesarray[i]; - File fileObj = new File(listsPath,filename); - out.append(listManager.getListString(fileObj, false)) - .append(serverCore.crlfString); - } - } // if filenamesarray.length > 0 - - prop.put("list",out.toString()); - } - // start contrib by [FB] - else if (col.length() == 0 && post.get("list", "").equals("queueUrls")) { - final URLFetcherStack db = CrawlURLFetchStack_p.getURLFetcherStack(env); - final String display = post.get("display", "list"); - if (display.equals("list")) { - // list urls from remote crawler 
queue for other peers - final int count = Math.min(post.getInt("count", 50), CrawlURLFetchStack_p.maxURLsPerFetch); - - if (count > 0 && db.size() > 0) { - final StringBuffer b = new StringBuffer(); - - yacyURL url; - int cnt = 0; - for (int i=0; i + + + + + + +c_32kgI-4HTE +3226 +20071128030353 +ok + + + + + + + + + + +http://publish.vx.roo.com/australian/ithomepagemini/ +sub + +20071126173629 +mlD2rBhnfuoY + + + + +http://www.news.com.au/story/0%2C23599%2C22835669-2%2C00.html + + +20071128014306 +qT1GjNRe_5SQ + + + +http://www.news.com.au/perthnow/story/0%2C21598%2C22835663-2761%2C00.html +Driver injured: Willagee crash witnesses sought + + +20071128014306 +yGMa4uRe_5SQ + + + +http://www.news.com.au/travel/story/0%2C26058%2C22835185-5014090%2C00.html + + +20071128014306 +qfob36Re_5SQ + + + + +http://www.news.com.au/story/0%2C23599%2C22835311-421%2C00.html + + +20071128014306 +YBLVBNRe_5SQ + + + +http://www.thirdwayblog.com/wp-content/uploads/ +sub + + +20071128010343 +9rnz2MUqGq6Z + + + +http://www.parliament.gr/kouselas/koino_dra/koino_docs/ +sub + +20071128010343 +hSTvg-u6LxcB + + + + +http://upload.wikimedia.org/wikipedia/el/f/f1/ +sub + +20071128010343 +F-3WVJBs-F4R + + + +http://www.logiprint.nl/nl/Briefpapier_drukken_Eindhoven.html +Briefpapier drukken Eindhoven + +20071011104246 + +bmBv8j07Ta7B + + + +*/ \ No newline at end of file diff --git a/htroot/yacy/urls.xml b/htroot/yacy/urls.xml index c94fb4958..e4adbf389 100644 --- a/htroot/yacy/urls.xml +++ b/htroot/yacy/urls.xml @@ -24,7 +24,7 @@ #[description]# #[author]# #[pubDate]# -#[guid]# +#[guid]# #{/item}# diff --git a/source/de/anomic/plasma/crawler/plasmaCrawlQueues.java b/source/de/anomic/plasma/crawler/plasmaCrawlQueues.java index acc3128e0..4719fd882 100644 --- a/source/de/anomic/plasma/crawler/plasmaCrawlQueues.java +++ b/source/de/anomic/plasma/crawler/plasmaCrawlQueues.java @@ -28,12 +28,14 @@ package de.anomic.plasma.crawler; import java.io.File; import java.io.IOException; +import java.net.MalformedURLException; +import java.text.ParseException; +import java.util.ArrayList; import java.util.Date; import java.util.HashMap; import java.util.Iterator; import de.anomic.data.robotsParser; -import de.anomic.index.indexURLEntry; import de.anomic.plasma.plasmaCrawlEntry; import de.anomic.plasma.plasmaCrawlNURL; import de.anomic.plasma.plasmaCrawlProfile; @@ -41,8 +43,9 @@ import de.anomic.plasma.plasmaCrawlZURL; import de.anomic.plasma.plasmaHTCache; import de.anomic.plasma.plasmaParser; import de.anomic.plasma.plasmaSwitchboard; +import de.anomic.server.serverDate; import de.anomic.server.logging.serverLog; -import de.anomic.tools.crypt; +import de.anomic.xml.rssReader; import de.anomic.yacy.yacyClient; import de.anomic.yacy.yacyCore; import de.anomic.yacy.yacySeed; @@ -54,6 +57,7 @@ public class plasmaCrawlQueues { private serverLog log; private HashMap workers; // mapping from url hash to Worker thread object private plasmaProtocolLoader loader; + private ArrayList remoteCrawlProviderHashes; public plasmaCrawlNURL noticeURL; public plasmaCrawlZURL errorURL, delegatedURL; @@ -63,6 +67,7 @@ public class plasmaCrawlQueues { this.log = new serverLog("CRAWLER"); this.workers = new HashMap(); this.loader = new plasmaProtocolLoader(sb, log); + this.remoteCrawlProviderHashes = new ArrayList(); // start crawling management log.logConfig("Starting Crawling Management"); @@ -108,6 +113,9 @@ public class plasmaCrawlQueues { Iterator i = workers.values().iterator(); while (i.hasNext()) ((Thread) i.next()).interrupt(); // TODO: wait 
some more time until all threads are finished + noticeURL.close(); + errorURL.close(); + delegatedURL.close(); } public plasmaCrawlEntry[] activeWorker() { @@ -131,18 +139,32 @@ public class plasmaCrawlQueues { } public boolean coreCrawlJob() { + + boolean robinsonPrivateCase = ((sb.isRobinsonMode()) && + (!sb.getConfig(plasmaSwitchboard.CLUSTER_MODE, "").equals(plasmaSwitchboard.CLUSTER_MODE_PUBLIC_CLUSTER)) && + (!sb.getConfig(plasmaSwitchboard.CLUSTER_MODE, "").equals(plasmaSwitchboard.CLUSTER_MODE_PRIVATE_CLUSTER))); + + if ((robinsonPrivateCase) || ((coreCrawlJobSize() <= 20) && (limitCrawlJobSize() > 0))) { + // move some tasks to the core crawl job so we have something to do + int toshift = Math.min(10, limitCrawlJobSize()); // this cannot be a big number because the balancer makes a forced waiting if it cannot balance + for (int i = 0; i < toshift; i++) { + noticeURL.shift(plasmaCrawlNURL.STACK_TYPE_LIMIT, plasmaCrawlNURL.STACK_TYPE_CORE); + } + log.logInfo("shifted " + toshift + " jobs from global crawl to local crawl (coreCrawlJobSize()=" + coreCrawlJobSize() + + ", limitCrawlJobSize()=" + limitCrawlJobSize() + ", cluster.mode=" + sb.getConfig(plasmaSwitchboard.CLUSTER_MODE, "") + + ", robinsonMode=" + ((sb.isRobinsonMode()) ? "on" : "off")); + } + if (noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) == 0) { //log.logDebug("CoreCrawl: queue is empty"); return false; } if (sb.sbQueue.size() >= (int) sb.getConfigLong(plasmaSwitchboard.INDEXER_SLOTS, 30)) { - log.logFine("CoreCrawl: too many processes in indexing queue, dismissed (" + - "sbQueueSize=" + sb.sbQueue.size() + ")"); + log.logFine("CoreCrawl: too many processes in indexing queue, dismissed (" + "sbQueueSize=" + sb.sbQueue.size() + ")"); return false; } if (this.size() >= sb.getConfigLong(plasmaSwitchboard.CRAWLER_THREADS_ACTIVE_MAX, 10)) { - log.logFine("CoreCrawl: too many processes in loader queue, dismissed (" + - "cacheLoader=" + this.size() + ")"); + log.logFine("CoreCrawl: too many processes in loader queue, dismissed (" + "cacheLoader=" + this.size() + ")"); return false; } if (sb.onlineCaution()) { @@ -203,107 +225,84 @@ public class plasmaCrawlQueues { return true; } - - public int limitCrawlTriggerJobSize() { - return noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT); - } - - public boolean limitCrawlTriggerJob() { - if (noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) == 0) { - //log.logDebug("LimitCrawl: queue is empty"); - return false; - } - boolean robinsonPrivateCase = ((sb.isRobinsonMode()) && - (!sb.getConfig(plasmaSwitchboard.CLUSTER_MODE, "").equals(plasmaSwitchboard.CLUSTER_MODE_PUBLIC_CLUSTER)) && - (!sb.getConfig(plasmaSwitchboard.CLUSTER_MODE, "").equals(plasmaSwitchboard.CLUSTER_MODE_PRIVATE_CLUSTER))); + public boolean remoteCrawlLoaderJob() { + // check if we are allowed to crawl urls provided by other peers + if (!yacyCore.seedDB.mySeed().getFlagAcceptRemoteCrawl()) return false; - if ((robinsonPrivateCase) || ((coreCrawlJobSize() <= 20) && (limitCrawlTriggerJobSize() > 10))) { - // it is not efficient if the core crawl job is empty and we have too much to do - // move some tasks to the core crawl job - int toshift = 10; // this cannot be a big number because the balancer makes a forced waiting if it cannot balance - if (toshift > limitCrawlTriggerJobSize()) toshift = limitCrawlTriggerJobSize(); - for (int i = 0; i < toshift; i++) { - noticeURL.shift(plasmaCrawlNURL.STACK_TYPE_LIMIT, plasmaCrawlNURL.STACK_TYPE_CORE); - } - log.logInfo("shifted " + toshift + " jobs from global 
crawl to local crawl (coreCrawlJobSize()=" + coreCrawlJobSize() + ", limitCrawlTriggerJobSize()=" + limitCrawlTriggerJobSize() + ", cluster.mode=" + sb.getConfig(plasmaSwitchboard.CLUSTER_MODE, "") + ", robinsonMode=" + ((sb.isRobinsonMode()) ? "on" : "off")); - if (robinsonPrivateCase) return false; - } + // check if we are a senior peer + if (!yacyCore.seedDB.mySeed().isActive()) return false; - // check local indexing queues - // in case the placing of remote crawl fails, there must be space in the local queue to work off the remote crawl - if (sb.sbQueue.size() >= (int) sb.getConfigLong(plasmaSwitchboard.INDEXER_SLOTS, 30) * 2) { - log.logFine("LimitCrawl: too many processes in indexing queue, dismissed (" + - "sbQueueSize=" + sb.sbQueue.size() + ")"); - return false; - } - if (this.size() >= sb.getConfigLong(plasmaSwitchboard.CRAWLER_THREADS_ACTIVE_MAX, 10)) { - log.logFine("LimitCrawl: too many processes in loader queue, dismissed (" + - "cacheLoader=" + this.size() + ")"); - return false; - } - if (sb.onlineCaution()) { - log.logFine("LimitCrawl: online caution, omitting processing"); - return false; - } - - // if crawling was paused we have to wait until we were notified to continue - Object[] status = (Object[]) sb.crawlJobsStatus.get(plasmaSwitchboard.CRAWLJOB_GLOBAL_CRAWL_TRIGGER); - synchronized(status[plasmaSwitchboard.CRAWLJOB_SYNC]) { - if (((Boolean)status[plasmaSwitchboard.CRAWLJOB_STATUS]).booleanValue()) { - try { - status[plasmaSwitchboard.CRAWLJOB_SYNC].wait(); + // check if we have an entry in the provider list, otherwise fill the list + yacySeed seed; + if ((remoteCrawlProviderHashes.size() == 0) && (remoteTriggeredCrawlJobSize() == 0)) { + if (yacyCore.seedDB != null && yacyCore.seedDB.sizeConnected() > 0) { + Iterator e = yacyCore.dhtAgent.getProvidesRemoteCrawlURLs(); + while (e.hasNext()) { + seed = (yacySeed) e.next(); + if (seed != null) { + remoteCrawlProviderHashes.add(seed.hash); + + } } - catch (InterruptedException e){ return false;} } } + if (remoteCrawlProviderHashes.size() == 0) return false; - // start a global crawl, if possible - String stats = "REMOTECRAWLTRIGGER[" + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) + ", " - + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]"; - try { - plasmaCrawlEntry urlEntry = noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_LIMIT, true); - String profileHandle = urlEntry.profileHandle(); - // System.out.println("DEBUG plasmaSwitchboard.processCrawling: - // profileHandle = " + profileHandle + ", urlEntry.url = " + urlEntry.url()); - plasmaCrawlProfile.entry profile = sb.profilesActiveCrawls.getEntry(profileHandle); - if (profile == null) { - log.logWarning(stats + ": LOST PROFILE HANDLE '" + urlEntry.profileHandle() + "' for URL " + urlEntry.url()); - return true; - } + // take one entry from the provider list and load the entries from the remote peer + seed = null; + String hash = null; + while ((seed == null) && (remoteCrawlProviderHashes.size() > 0)) { + hash = (String) remoteCrawlProviderHashes.remove(remoteCrawlProviderHashes.size() - 1); + seed = yacyCore.seedDB.get(hash); + } + if (seed == null) return false; + + // we know a peer which should provide remote crawl entries. load them now. + rssReader reader = (seed == null) ?
+    
+    public int limitCrawlJobSize() {
+        return noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT);
     }
     
     public int remoteTriggeredCrawlJobSize() {
@@ -399,108 +398,6 @@ public class plasmaCrawlQueues {
         return;
     }
     
-    private boolean processRemoteCrawlTrigger(plasmaCrawlEntry urlEntry) {
-        // if this returns true, then the urlEntry is considered as stored somewhere and the case is finished
-        // if this returns false, the urlEntry will be enqueued to the local crawl again
-        
-        // wrong access
-        if (urlEntry == null) {
-            log.logInfo("REMOTECRAWLTRIGGER[" + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]: urlEntry=null");
-            return true; // superfluous request; true is correct in this context because the urlEntry shall not be tracked any more
-        }
-        
-        // check url
-        if (urlEntry.url() == null) {
-            log.logFine("ERROR: plasmaSwitchboard.processRemoteCrawlTrigger - url is null. name=" + urlEntry.name());
-            return true; // same case as above: no more consideration
-        }
-        
-        // are we qualified for a remote crawl?
-        if ((yacyCore.seedDB.mySeed() == null) || (yacyCore.seedDB.mySeed().isJunior())) {
-            log.logFine("plasmaSwitchboard.processRemoteCrawlTrigger: no permission");
-            return false; // no, we must crawl this page ourselves
-        }
-        
-        // check if a peer for the remote crawl is available
-        yacySeed remoteSeed = ((sb.isPublicRobinson()) && (sb.getConfig("cluster.mode", "").equals("publiccluster"))) ?
-                yacyCore.dhtAgent.getPublicClusterCrawlSeed(urlEntry.url().hash(), sb.clusterhashes) :
-                yacyCore.dhtAgent.getGlobalCrawlSeed(urlEntry.url().hash());
-        if (remoteSeed == null) {
-            log.logFine("plasmaSwitchboard.processRemoteCrawlTrigger: no remote crawl seed available");
-            return false;
-        }
-        
-        // do the request
-        HashMap page = yacyClient.crawlOrder(remoteSeed, urlEntry.url(), sb.getURL(urlEntry.referrerhash()), 6000);
-        if (page == null) {
-            log.logSevere(plasmaSwitchboard.STR_REMOTECRAWLTRIGGER + remoteSeed.getName() + " FAILED. URL CANNOT BE RETRIEVED from referrer hash: " + urlEntry.referrerhash());
-            return false;
-        }
-        
-        // check if we got contact to the peer and the peer responded
-        if ((page == null) || (page.get("delay") == null)) {
-            log.logInfo("CRAWL: REMOTE CRAWL TO PEER " + remoteSeed.getName() + " FAILED. CAUSE: unknown (URL=" + urlEntry.url().toString() + "). Removed peer.");
-            yacyCore.peerActions.peerDeparture(remoteSeed, "remote crawl to peer failed; peer answered unappropriate");
-            return false; // no response from peer, we will crawl this ourself
-        }
-        
-        String response = (String) page.get("response");
-        log.logFine("plasmaSwitchboard.processRemoteCrawlTrigger: remoteSeed="
-                + remoteSeed.getName() + ", url=" + urlEntry.url().toString()
-                + ", response=" + page.toString()); // DEBUG
-        
-        // we received an answer and we are told to wait a specific time until we shall ask again for another crawl
-        int newdelay = Integer.parseInt((String) page.get("delay"));
-        yacyCore.dhtAgent.setCrawlDelay(remoteSeed.hash, newdelay);
-        if (response.equals("stacked")) {
-            // success, the remote peer accepted the crawl
-            log.logInfo(plasmaSwitchboard.STR_REMOTECRAWLTRIGGER + remoteSeed.getName()
-                    + " PLACED URL=" + urlEntry.url().toString()
-                    + "; NEW DELAY=" + newdelay);
-            // track this remote crawl
-            delegatedURL.newEntry(urlEntry, remoteSeed.hash, new Date(), 0, response).store();
-            return true;
-        }
-        
-        // check other cases: the remote peer may respond that it already knows that url
-        if (response.equals("double")) {
-            // in case the peer answers double, it transmits the complete lurl data
-            String lurl = (String) page.get("lurl");
-            if ((lurl != null) && (lurl.length() != 0)) {
-                String propStr = crypt.simpleDecode(lurl, (String) page.get("key"));
-                indexURLEntry entry = sb.wordIndex.loadedURL.newEntry(propStr);
-                try {
-                    sb.wordIndex.loadedURL.store(entry);
-                    sb.wordIndex.loadedURL.stack(entry, yacyCore.seedDB.mySeed().hash, remoteSeed.hash, 1); // *** superfluous/duplicate?
-                    // noticeURL.remove(entry.hash());
-                } catch (IOException e) {
-                    // TODO Auto-generated catch block
-                    e.printStackTrace();
-                }
-                
-                log.logInfo(plasmaSwitchboard.STR_REMOTECRAWLTRIGGER + remoteSeed.getName()
-                        + " SUPERFLUOUS. CAUSE: " + page.get("reason")
-                        + " (URL=" + urlEntry.url().toString()
-                        + "). URL IS CONSIDERED AS 'LOADED!'");
-                return true;
-            } else {
-                log.logInfo(plasmaSwitchboard.STR_REMOTECRAWLTRIGGER + remoteSeed.getName()
-                        + " REJECTED. CAUSE: bad lurl response / " + page.get("reason") + " (URL="
-                        + urlEntry.url().toString() + ")");
-                remoteSeed.setFlagAcceptRemoteCrawl(false);
-                yacyCore.seedDB.update(remoteSeed.hash, remoteSeed);
-                return false;
-            }
-        }
-        
-        log.logInfo(plasmaSwitchboard.STR_REMOTECRAWLTRIGGER + remoteSeed.getName()
-                + " DENIED. RESPONSE=" + response + ", CAUSE="
-                + page.get("reason") + ", URL=" + urlEntry.url().toString());
-        remoteSeed.setFlagAcceptRemoteCrawl(false);
-        yacyCore.seedDB.update(remoteSeed.hash, remoteSeed);
-        return false;
-    }
-    
     public plasmaHTCache.Entry loadResourceFromWeb(
             yacyURL url,
             int socketTimeout,
diff --git a/source/de/anomic/plasma/plasmaCrawlBalancer.java b/source/de/anomic/plasma/plasmaCrawlBalancer.java
index 078f2222a..fe5213953 100644
--- a/source/de/anomic/plasma/plasmaCrawlBalancer.java
+++ b/source/de/anomic/plasma/plasmaCrawlBalancer.java
@@ -119,7 +119,6 @@ public class plasmaCrawlBalancer {
         resetFileIndex();
     }
-    
     private void openFileIndex() {
         cacheStacksPath.mkdirs();
         urlFileIndex = new kelondroCache(new kelondroFlexTable(cacheStacksPath, stackname + indexSuffix, -1, plasmaCrawlEntry.rowdef, true), true, false);
diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java
index 0f9aef309..be762399d 100644
--- a/source/de/anomic/plasma/plasmaSwitchboard.java
+++ b/source/de/anomic/plasma/plasmaSwitchboard.java
@@ -348,18 +348,17 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
 
     // 61_globalcrawltrigger
     /**
-     * <p><code>public static final String <strong>CRAWLJOB_GLOBAL_CRAWL_TRIGGER</strong> = "61_globalcrawltrigger"</code></p>
-     * <p>Name of the global crawl trigger thread, popping one entry off its queue and sending it to a non-busy peer to
-     * crawl it</p>
+     * <p><code>public static final String <strong>CRAWLJOB_REMOTE_CRAWL_LOADER</strong> = "60_remotecrawlloader"</code></p>
+     * <p>Name of the remote crawl list loading thread</p>
      *
-     * @see plasmaSwitchboard#CRAWLJOB_REMOTE_TRIGGERED_CRAWL
+     * @see plasmaSwitchboard#CRAWLJOB_REMOTE_CRAWL_LOADER
      */
-    public static final String CRAWLJOB_GLOBAL_CRAWL_TRIGGER = "61_globalcrawltrigger";
-    public static final String CRAWLJOB_GLOBAL_CRAWL_TRIGGER_METHOD_START = "limitCrawlTriggerJob";
-    public static final String CRAWLJOB_GLOBAL_CRAWL_TRIGGER_METHOD_JOBCOUNT = "limitCrawlTriggerJobSize";
-    public static final String CRAWLJOB_GLOBAL_CRAWL_TRIGGER_METHOD_FREEMEM = null;
-    public static final String CRAWLJOB_GLOBAL_CRAWL_TRIGGER_IDLESLEEP = "61_globalcrawltrigger_idlesleep";
-    public static final String CRAWLJOB_GLOBAL_CRAWL_TRIGGER_BUSYSLEEP = "61_globalcrawltrigger_busysleep";
+    public static final String CRAWLJOB_REMOTE_CRAWL_LOADER = "60_remotecrawlloader";
+    public static final String CRAWLJOB_REMOTE_CRAWL_LOADER_METHOD_START = "remoteCrawlLoaderJob";
+    public static final String CRAWLJOB_REMOTE_CRAWL_LOADER_METHOD_JOBCOUNT = null;
+    public static final String CRAWLJOB_REMOTE_CRAWL_LOADER_METHOD_FREEMEM = null;
+    public static final String CRAWLJOB_REMOTE_CRAWL_LOADER_IDLESLEEP = "60_remotecrawlloader_idlesleep";
+    public static final String CRAWLJOB_REMOTE_CRAWL_LOADER_BUSYSLEEP = "60_remotecrawlloader_busysleep";
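
The *_IDLESLEEP and *_BUSYSLEEP constants double as yacy.init keys, so the scheduler reads the sleep intervals of the new loader thread straight from the configuration. Spelled out with getConfigLong, the accessor already used elsewhere in this patch (the defaults mirror the yacy.init values set at the end of this change):

    long idlesleep = sb.getConfigLong(plasmaSwitchboard.CRAWLJOB_REMOTE_CRAWL_LOADER_IDLESLEEP, 10000); // 60_remotecrawlloader_idlesleep=10000
    long busysleep = sb.getConfigLong(plasmaSwitchboard.CRAWLJOB_REMOTE_CRAWL_LOADER_BUSYSLEEP, 2000);  // 60_remotecrawlloader_busysleep=2000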
 
     // 62_remotetriggeredcrawl
     /**
@@ -1208,9 +1207,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
         this.crawlJobsStatus.put(CRAWLJOB_REMOTE_TRIGGERED_CRAWL, new Object[]{
                 new Object(),
                 Boolean.valueOf(getConfig(CRAWLJOB_REMOTE_TRIGGERED_CRAWL + "_isPaused", "false"))});
-        this.crawlJobsStatus.put(CRAWLJOB_GLOBAL_CRAWL_TRIGGER, new Object[]{
+        this.crawlJobsStatus.put(CRAWLJOB_REMOTE_CRAWL_LOADER, new Object[]{
                 new Object(),
-                Boolean.valueOf(getConfig(CRAWLJOB_GLOBAL_CRAWL_TRIGGER + "_isPaused", "false"))});
+                Boolean.valueOf(getConfig(CRAWLJOB_REMOTE_CRAWL_LOADER + "_isPaused", "false"))});
         
         // init cookie-Monitor
         this.log.logConfig("Starting Cookie Monitor");
@@ -1340,8 +1339,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
                 new serverInstantThread(this, PROXY_CACHE_ENQUEUE_METHOD_START, PROXY_CACHE_ENQUEUE_METHOD_JOBCOUNT, PROXY_CACHE_ENQUEUE_METHOD_FREEMEM), 10000);
         deployThread(CRAWLJOB_REMOTE_TRIGGERED_CRAWL, "Remote Crawl Job", "thread that performes a single crawl/indexing step triggered by a remote peer", null,
                 new serverInstantThread(crawlQueues, CRAWLJOB_REMOTE_TRIGGERED_CRAWL_METHOD_START, CRAWLJOB_REMOTE_TRIGGERED_CRAWL_METHOD_JOBCOUNT, CRAWLJOB_REMOTE_TRIGGERED_CRAWL_METHOD_FREEMEM), 30000);
-        deployThread(CRAWLJOB_GLOBAL_CRAWL_TRIGGER, "Global Crawl Trigger", "thread that triggeres remote peers for crawling", "/IndexCreateWWWGlobalQueue_p.html",
-                new serverInstantThread(crawlQueues, CRAWLJOB_GLOBAL_CRAWL_TRIGGER_METHOD_START, CRAWLJOB_GLOBAL_CRAWL_TRIGGER_METHOD_JOBCOUNT, CRAWLJOB_GLOBAL_CRAWL_TRIGGER_METHOD_FREEMEM), 30000); // error here?
+        deployThread(CRAWLJOB_REMOTE_CRAWL_LOADER, "Remote Crawl URL Loader", "thread that loads remote crawl lists from other peers", "",
+                new serverInstantThread(crawlQueues, CRAWLJOB_REMOTE_CRAWL_LOADER_METHOD_START, CRAWLJOB_REMOTE_CRAWL_LOADER_METHOD_JOBCOUNT, CRAWLJOB_REMOTE_CRAWL_LOADER_METHOD_FREEMEM), 30000); // error here?
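
serverInstantThread binds a job by method name: the scheduler looks the named methods up on the given object (crawlQueues here) and invokes the start method on every cycle, while the jobcount method, when present, reports a queue size for the interface. With the constants inlined, the new deployment reads as follows (a reading aid, not additional code):

    // on every cycle (idle sleep 10s, busy sleep 2s) the scheduler calls
    // crawlQueues.remoteCrawlLoaderJob(); there is no local queue to count,
    // hence the null jobcount/freemem method names
    deployThread("60_remotecrawlloader", "Remote Crawl URL Loader",
            "thread that loads remote crawl lists from other peers", "",
            new serverInstantThread(crawlQueues, "remoteCrawlLoaderJob", null, null), 30000);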
         deployThread(CRAWLJOB_LOCAL_CRAWL, "Local Crawl", "thread that performes a single crawl step from the local crawl queue", "/IndexCreateWWWLocalQueue_p.html",
                 new serverInstantThread(crawlQueues, CRAWLJOB_LOCAL_CRAWL_METHOD_START, CRAWLJOB_LOCAL_CRAWL_METHOD_JOBCOUNT, CRAWLJOB_LOCAL_CRAWL_METHOD_FREEMEM), 10000);
         deployThread(SEED_UPLOAD, "Seed-List Upload", "task that a principal peer performes to generate and upload a seed-list to a ftp account", null,
@@ -2639,18 +2638,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
             thread.setIdleSleep(1000);
         }
         
-        thread = getThread(CRAWLJOB_GLOBAL_CRAWL_TRIGGER);
-        if (thread != null) {
-            setConfig(CRAWLJOB_GLOBAL_CRAWL_TRIGGER_BUSYSLEEP , thread.setBusySleep(Math.max(1000, newBusySleep * 3)));
-            thread.setIdleSleep(10000);
-        }
-        /*
-        thread = getThread(CRAWLJOB_REMOTE_TRIGGERED_CRAWL);
-        if (thread != null) {
-            setConfig(CRAWLJOB_REMOTE_TRIGGERED_CRAWL_BUSYSLEEP , thread.setBusySleep(newBusySleep * 10));
-            thread.setIdleSleep(10000);
-        }
-        */
         thread = getThread(PROXY_CACHE_ENQUEUE);
         if (thread != null) {
             setConfig(PROXY_CACHE_ENQUEUE_BUSYSLEEP , thread.setBusySleep(0));
diff --git a/source/de/anomic/xml/rssReader.java b/source/de/anomic/xml/rssReader.java
index 313f9408d..4616ec200 100644
--- a/source/de/anomic/xml/rssReader.java
+++ b/source/de/anomic/xml/rssReader.java
@@ -26,6 +26,8 @@
 
 package de.anomic.xml;
 
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
 import java.io.InputStream;
 import java.util.ArrayList;
 import java.util.HashMap;
@@ -38,6 +40,9 @@ import org.xml.sax.Attributes;
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.DefaultHandler;
 
+import de.anomic.server.serverByteBuffer;
+import de.anomic.server.logging.serverLog;
+
 public class rssReader extends DefaultHandler {
     
     // statics for item generation and automatic categorization
@@ -72,17 +77,7 @@ public class rssReader extends DefaultHandler {
     
     private HashMap items; // a guid:Item map
     
-    public rssReader(String path) {
-        init();
-        parse(path);
-    }
-    
-    public rssReader(InputStream stream) {
-        init();
-        parse(stream);
-    }
-    
-    private void init() {
+    public rssReader() {
         itemsGUID = new ArrayList();
         items = new HashMap();
         buffer = new StringBuffer();
@@ -93,7 +88,8 @@ public class rssReader extends DefaultHandler {
         parsingItem = false;
     }
     
-    private void parse(String path) {
+    public rssReader(String path) {
+        this();
         try {
             SAXParserFactory factory = SAXParserFactory.newInstance();
             SAXParser saxParser = factory.newSAXParser();
@@ -103,7 +99,8 @@ public class rssReader extends DefaultHandler {
         }
     }
     
-    private void parse(InputStream stream) {
+    public rssReader(InputStream stream) {
+        this();
         try {
             SAXParserFactory factory = SAXParserFactory.newInstance();
             SAXParser saxParser = factory.newSAXParser();
@@ -112,6 +109,42 @@ public class rssReader extends DefaultHandler {
             e.printStackTrace();
         }
     }
+    
+    public static rssReader parse(byte[] a) {
+        
+        // check integrity of array
+        if ((a == null) || (a.length == 0)) {
+            serverLog.logWarning("rssReader", "response=null");
+            return null;
+        }
+        if (a.length < 100) {
+            serverLog.logWarning("rssReader", "response=" + new String(a));
+            return null;
+        }
+        if (!serverByteBuffer.equals(a, "<?xml".getBytes())) {
+            serverLog.logWarning("rssReader", "response does not contain valid xml");
+            return null;
+        }
+        
+        // parse the stream
+        ByteArrayInputStream bais = new ByteArrayInputStream(a);
+        rssReader reader = new rssReader(bais);
+        try { bais.close(); } catch (IOException e) {}
+        return reader;
+    }
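
With the constructors reduced to plain SAX setup, the static parse factory is the safe entry point for network input: it validates the raw bytes first and returns null instead of throwing on damaged transmissions. A typical consumer, using only accessors that appear in this patch:

    // answerBytes: the raw yacy/urls response of a remote peer
    rssReader reader = rssReader.parse(answerBytes);
    if (reader != null) {
        for (int i = 0; i < reader.items(); i++) {
            rssReader.Item item = reader.getItem(i);
            System.out.println(item.getLink() + " (" + item.getPubDate() + ")");
        }
    }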
diff --git a/source/de/anomic/yacy/yacyDHTAction.java b/source/de/anomic/yacy/yacyDHTAction.java
--- a/source/de/anomic/yacy/yacyDHTAction.java
+++ b/source/de/anomic/yacy/yacyDHTAction.java
+    public Iterator getProvidesRemoteCrawlURLs() {
+        return new providesRemoteCrawlURLsEnum();
+    }
+    
+    public class providesRemoteCrawlURLsEnum implements Iterator {
+        
+        Iterator se;
+        yacySeed nextSeed;
+        
+        public providesRemoteCrawlURLsEnum() {
+            se = getDHTSeeds(seedDB, null, yacyVersion.YACY_POVIDES_REMOTECRAWL_LISTS);
+            nextSeed = nextInternal();
+        }
+        
+        public boolean hasNext() {
+            return nextSeed != null;
+        }
+        
+        private yacySeed nextInternal() {
+            yacySeed s;
+            try {
+                while (se.hasNext()) {
+                    s = (yacySeed) se.next();
+                    if (s == null) return null;
+                    if (s.getLong(yacySeed.RCOUNT, 0) > 0) return s;
+                }
+            } catch (kelondroException e) {
+                System.out.println("DEBUG providesRemoteCrawlURLsEnum:" + e.getMessage());
+                yacyCore.log.logSevere("database inconsistency (" + e.getMessage() + "), re-set of db.");
+                seedDB.resetActiveTable();
+                return null;
+            }
+            return null;
+        }
+        
+        public Object next() {
+            yacySeed next = nextSeed;
+            nextSeed = nextInternal();
+            return next;
+        }
+        
+        public void remove() {
+            throw new UnsupportedOperationException();
+        }
+        
+    }
+    
     public Iterator getAcceptRemoteIndexSeeds(String starthash) {
         // returns an enumeration of yacySeed-Objects
         // that have the AcceptRemoteIndex-Flag set
diff --git a/source/de/anomic/yacy/yacyVersion.java b/source/de/anomic/yacy/yacyVersion.java
index 4c1f67f91..792b0e04b 100644
--- a/source/de/anomic/yacy/yacyVersion.java
+++ b/source/de/anomic/yacy/yacyVersion.java
@@ -53,7 +53,7 @@ public final class yacyVersion implements Comparator, Comparable {
     public static final float YACY_SUPPORTS_GZIP_POST_REQUESTS = (float) 0.40300772;
     public static final float YACY_ACCEPTS_RANKING_TRANSMISSION = (float) 0.414;
     public static final float YACY_HANDLES_COLLECTION_INDEX = (float) 0.486;
-    public static final float YACY_PROVIDES_CRAWLS_VIA_LIST_HTML = (float) 0.50403367;
+    public static final float YACY_POVIDES_REMOTECRAWL_LISTS = (float) 0.550;
     
     // information about latest release, retrieved by other peers release version
     public static double latestRelease = 0.1; // this value is overwritten when a peer with later version appears
diff --git a/yacy.init b/yacy.init
index 55264ca16..6b536fb45 100644
--- a/yacy.init
+++ b/yacy.init
@@ -558,12 +558,12 @@ filterOutStopwordsFromTopwords=true
 50_localcrawl_busysleep__pro=100
 50_localcrawl_memprereq=4194304
 50_localcrawl_isPaused=false
-61_globalcrawltrigger_idlesleep=10000
-61_globalcrawltrigger_busysleep=500
-61_globalcrawltrigger_memprereq=2097152
-61_globalcrawltrigger_isPaused=false
+60_remotecrawlloader_idlesleep=10000
+60_remotecrawlloader_busysleep=2000
+60_remotecrawlloader_memprereq=2097152
+60_remotecrawlloader_isPaused=false
 62_remotetriggeredcrawl_idlesleep=10000
-62_remotetriggeredcrawl_busysleep=1000
+62_remotetriggeredcrawl_busysleep=500
 62_remotetriggeredcrawl_memprereq=6291456
 62_remotetriggeredcrawl_isPaused=false
 70_cachemanager_idlesleep=1000
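
The renamed version constant gates the whole pull mechanism: only peers reporting version 0.550 or later offer the extended yacy/urls interface, so the provider enumeration can skip older peers up front. A minimal sketch of that gate (getVersion() as the float accessor on yacySeed is an assumption here):

    // consider only peers that already speak the pull-based remote crawl protocol
    if (seed.getVersion() >= yacyVersion.YACY_POVIDES_REMOTECRAWL_LISTS) {
        remoteCrawlProviderHashes.add(seed.hash); // candidate for queryRemoteCrawlURLs
    }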