redesigned remote crawl process:

- instead of pushing URLs to other peers, the URLs are now actively pulled
  by the peer that wants to perform a remote crawl (see the sketch after
  the commit metadata below)
- the remote crawl push process has been removed
- a process that adds URLs pulled from remote peers has been added
- the server-side interface for providing 'limit'-URLs has existed since 0.55 and works with this version
- the list-interface has been removed
- servlets using the list-interface have been removed (that implementation did not properly handle double-checking)
- the configuration file was changed to support the new pull process
- fixed a bug in the crawl balancer (status was not saved/closed properly)
- the yacy/urls-protocol was extended to support different networks/clusters
- many interface adaptations for the new stack counters

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4232 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 18 years ago
parent 69521d92e5
commit 89b9b2b02a
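
For reference, a minimal sketch of the new pull flow, condensed from the rct_p servlet added in this commit: the requesting peer queries a remote seed over the yacy/urls protocol, parses the returned RSS-like list, and stacks each acceptable URL for the remote-triggered crawl. The RemoteCrawlPullSketch class and its pull() helper are illustrative only and not part of this commit; the calls they use (queryRemoteCrawlURLs, acceptURL, stackCrawl) are taken from the servlet code below.

import java.net.MalformedURLException;
import java.util.Date;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.xml.rssReader;
import de.anomic.yacy.yacyClient;
import de.anomic.yacy.yacySeed;
import de.anomic.yacy.yacyURL;

public class RemoteCrawlPullSketch {
    // pull up to maxCount URLs from the given peer and stack them locally
    public static void pull(plasmaSwitchboard sb, yacySeed seed, String peerhash, int maxCount) {
        // ask the remote peer for a chunk of its crawl URLs (yacy/urls protocol)
        rssReader reader = yacyClient.queryRemoteCrawlURLs(seed, maxCount);
        if (reader == null) return; // peer offline or empty answer
        for (int i = 0; i < reader.items(); i++) {
            rssReader.Item item = reader.getItem(i);
            yacyURL url;
            try {
                url = new yacyURL(item.getLink(), null);
            } catch (MalformedURLException e) {
                continue; // skip malformed entries
            }
            // reject URLs outside of our network/cluster definition
            if (!sb.acceptURL(url)) continue;
            // stack the URL for the remote-triggered crawl; a non-null reason
            // string means the URL was rejected (e.g. already loaded, a 'double')
            String reason = sb.crawlStacker.stackCrawl(
                    url, null, peerhash, "REMOTE-CRAWLING",
                    new Date(), 0, sb.defaultRemoteProfile);
            if (reason != null) {
                sb.getLog().logInfo("pull: rejected " + url.toNormalform(true, false) + " (" + reason + ")");
            }
        }
    }
}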

@@ -3,7 +3,7 @@ javacSource=1.4
javacTarget=1.4
# Release Configuration
releaseVersion=0.554
releaseVersion=0.555
releaseFile=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
proReleaseFile=yacy_pro_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
releaseFileParentDir=yacy

@@ -1,68 +0,0 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>YaCy '#[clientname]#': URL Fetcher Stack Management</title>
#%env/templates/metas.template%#
</head>
<body id="CrawlURLFetchStack_p">
#%env/templates/header.template%#
#%env/templates/submenuCrawlURLFetch.template%#
<h2>Manage stack for remote URL fetches</h2>
#(addedUrls)#::<span class="success">Added #[added]# URLs!</span>#(/addedUrls)#
<form method="post" action="CrawlURLFetchStack_p.html" enctype="multipart/form-data">
<fieldset><legend>Statistics</legend>
<dl>
<dt>Currently stacked URLs:</dt><dd>#[urlCount]#</dd>
<dt>Totally fetched / added URLs:</dt><dd>#[totalFetched]# / #[totalAdded]#</dd>
#{peers}#
<dt>Fetched from #[peer]#</dt><dd>#[amount]#</dd>#{/peers}#
</dl>
</fieldset>
<fieldset><legend>Settings</legend>
<dl>
<dt><label for="maxSize">Maximum URLs for each transfer</label>:</dt>
<dd>
<input type="text" name="maxSize" id="maxSize" value="#[maxSize]#" maxlength="3" size="3" />
<input type="submit" name="setMaxSize" value="Set" />
#(set)#::
<span class="success">Set max. size for each transfer to #[value]#</span>::
<span class="error">Setting max. size for each transfer to #[value]# was unsuccessful: may not be negative</span>#(/set)#
</dd>
</dl>
</fieldset>
<fieldset><legend>Add URLs to stack</legend>
<dl>
<dt><label for="shiftloc">Shift URLs from Local Crawler</label>:</dt>
<dd>
<input type="text" name="shiftloc" id="shiftloc" value="#[locurlsVal]#" size="5" maxlength="5" style="text-align: right;" />
of <span class="tt">#[locurls]#</span> URLs
<input type="submit" name="shiftlcq" value="Shift" />#(shiftloc)#::
<span class="success">Shifted #[value]# URLs from Local Crawler Queue to URL Fetcher Stack (not bound: #[failed]#)</span>#(/shiftloc)#
</dd>
<dt><label for="shiftrem">Shift URLs from Remote Crawler</label>:</dt>
<dd>
<input type="text" name="shiftrem" id="shiftrem" value="#[remurlsVal]#" size="5" maxlength="5" style="text-align: right;" />
of <span class="tt">#[remurls]#</span> URLs
<input type="submit" name="shiftrcq" value="Shift" />#(shiftrem)#::
<span class="success">Shifted #[value]# URLs from Remote Crawler Queue to URL Fetcher Stack (not bound: #[failed]#)</span>#(/shiftrem)#
</dd>
<dt><label for="upload">Upload URL-List</label>:</dt>
<dd>
<input type="file" name="upload" id="upload" /> #(uploadError)#::&nbsp;<span class="error">No file entered for upload</span>#(/uploadError)#<br />
<input type="radio" name="uploadType" id="plain" value="plain" checked="checked" /> <label for="plain">Plain text, line-seperated</label><br />
<input type="radio" name="uploadType" id="html" value="html" /> <label for="html">HTML file, links will be added</label><br />
<input type="checkbox" name="blacklistCheck" id="blacklistCheck" checked="checked" /> <label for="blacklistCheck">Don't add URLs matching blacklists active for crawler</label><br />
<input type="submit" name="subupload" value="Upload File" />
#(upload)#::
<span class="success">Added #[added]# and rejected #[failed]# URLs from uploaded file successfully</span>::
<span class="error">An internal error occured processing the uploaded file: #[error]#</span>#(/upload)#
</dd>
</dl>
</fieldset>
</form>
#%env/templates/footer.template%#
</body>
</html>

@@ -1,299 +0,0 @@
// CrawlURLFetchStack_p.java
// -------------------------------------
// part of YACY
//
// (C) 2007 by Franz Brausze
//
// last change: $LastChangedDate: $ by $LastChangedBy: $
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this software or this documentation. The usage of this software
// is on your own risk. The installation and usage (starting/running) of this
// software may allow other people or application to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
//
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follows this copyright notice here, but changes must not be
// done inside the copyright notice above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.PrintWriter;
import java.io.Writer;
import java.net.MalformedURLException;
import java.util.HashMap;
import java.util.Iterator;
import de.anomic.data.URLFetcherStack;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterWriter;
import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaCrawlEntry;
import de.anomic.plasma.plasmaCrawlNURL;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.urlPattern.plasmaURLPattern;
import de.anomic.server.serverFileUtils;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacyURL;
public class CrawlURLFetchStack_p {
public static final HashMap /* of PeerName, sent URLs */ fetchMap = new HashMap();
private static URLFetcherStack stack = null;
public static int maxURLsPerFetch = 50;
public static URLFetcherStack getURLFetcherStack(serverSwitch env) {
if (stack == null) try {
stack = new URLFetcherStack(env.getConfigPath(plasmaSwitchboard.DBPATH, plasmaSwitchboard.DBPATH_DEFAULT));
} catch (IOException e) {
serverLog.logSevere("URLFETCHER", "Couldn't initialize URL stack: " + e.getMessage());
}
return stack;
}
public static final String STREAM_CMD_ADDURLS_ = "ADD URLS: ";
public static final String STREAM_CMD_ADDURLSBLCHK_ = "ADD URLS CHECK BLACKLIST: ";
public static final String STREAM_CMD_END = "END";
public static final String STREAM_RESP_OK_ADDURLS_ = "FAILED URLS: ";
public static final String STREAM_RESP_OK = "OK";
public static final String STREAM_RESP_FAILED = "FAILED";
public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) {
final serverObjects prop = new serverObjects();
plasmaSwitchboard sb = (plasmaSwitchboard)env;
if (((String)header.get(httpHeader.CONNECTION_PROP_PATH)).endsWith(".stream")) {
/* =================================================================
* .stream request
* ================================================================= */
InputStream in = (InputStream)header.get(httpHeader.CONNECTION_PROP_INPUTSTREAM);
OutputStream out = (OutputStream)header.get(httpHeader.CONNECTION_PROP_OUTPUTSTREAM);
BufferedReader inrb = new BufferedReader(new InputStreamReader(in));
PrintWriter outw = new PrintWriter(out);
String line;
int addurls = 0, cururl = 0;
boolean[] status = new boolean[0];
boolean blchk = false;
URLFetcherStack stack = getURLFetcherStack(env);
try {
while ((line = inrb.readLine()) != null) {
// commands
if (line.startsWith(STREAM_CMD_ADDURLS_)) {
try {
addurls = Integer.parseInt(line.substring(STREAM_CMD_ADDURLS_.length()));
status = new boolean[addurls];
cururl = 0;
blchk = false;
outw.println(STREAM_RESP_OK);
} catch (NumberFormatException e) {
outw.println(STREAM_RESP_FAILED);
}
} else if (line.startsWith(STREAM_CMD_ADDURLSBLCHK_)) {
try {
addurls = Integer.parseInt(line.substring(STREAM_CMD_ADDURLSBLCHK_.length()));
status = new boolean[addurls];
cururl = 0;
blchk = true;
outw.println(STREAM_RESP_OK);
} catch (NumberFormatException e) {
outw.println(STREAM_RESP_FAILED);
}
} else if (line.equals(STREAM_CMD_END)) {
break;
} else {
if (cururl < addurls) // add url
status[cururl++] = addURL(line, blchk, stack);
if (cururl > 0 && cururl == addurls ) {
// done with parsing the passed URL count, now some status output: i.e. 'FAILED URLS: 5 of 8'
outw.print(STREAM_RESP_OK_ADDURLS_);
StringBuffer stat = new StringBuffer();
for (int i=0; i<status.length; i++)
if (!status[i]) stat.append(i).append(", ");
outw.print(stat.substring(0, stat.length() - 2));
outw.print(" of ");
outw.println(status.length);
cururl = 0;
addurls = 0;
}
}
}
} catch (IOException e) { e.printStackTrace(); }
outw.flush();
return null;
}
/* =================================================================
* 'normal' request
* ================================================================= */
if (post != null) {
if (post.containsKey("addurls")) {
prop.put("addedUrls", "1");
prop.put("addedUrls_added", addURLs(post, post.getInt("addurls", -1), getURLFetcherStack(env)));
}
else if (post.containsKey("setMaxSize")) {
final int count = post.getInt("maxSize", maxURLsPerFetch);
if (count > 0) {
maxURLsPerFetch = count;
prop.put("set", "1");
prop.put("set_value", maxURLsPerFetch);
} else {
prop.put("set", "2");
prop.put("set_value", count);
}
}
else if (post.containsKey("shiftlcq")) {
final int count = Math.min(post.getInt("shiftloc", 0), sb.crawlQueues.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE));
final int failed = shiftFromNotice(sb.crawlQueues.noticeURL, plasmaCrawlNURL.STACK_TYPE_CORE, getURLFetcherStack(env), count);
prop.put("shiftloc", "1");
prop.put("shiftloc_value", count - failed);
prop.put("shiftloc_failed", failed);
}
else if (post.containsKey("shiftrcq")) {
final int count = post.getInt("shiftrem", 0);
final int failed = shiftFromNotice(sb.crawlQueues.noticeURL, plasmaCrawlNURL.STACK_TYPE_LIMIT, getURLFetcherStack(env), count);
prop.put("shiftrem", "1");
prop.put("shiftrem_value", count - failed);
prop.put("shiftrem_failed", failed);
}
else if (post.containsKey("subupload")) {
if (post.get("upload", "").length() == 0) {
prop.put("uploadError", "1");
} else {
final File file = new File(post.get("upload", ""));
final String content = new String((byte[])post.get("upload$file"));
final String type = post.get("uploadType", "");
final boolean blCheck = post.containsKey("blacklistCheck");
if (type.equals("plain")) {
prop.put("upload_added", addURLs(content.split("\n"), blCheck, getURLFetcherStack(env)));
prop.put("upload_failed", "0");
prop.put("upload", "1");
} else if (type.equals("html")) {
try {
final htmlFilterContentScraper scraper = new htmlFilterContentScraper(new yacyURL(file));
final Writer writer = new htmlFilterWriter(null, null, scraper, null, false);
serverFileUtils.write(content, writer);
writer.close();
final Iterator it = ((HashMap)scraper.getAnchors()).keySet().iterator();
int added = 0, failed = 0;
yacyURL url;
while (it.hasNext()) try {
url = new yacyURL((String) it.next(), null);
if (blCheck && plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_CRAWLER, url)) {
failed++;
continue;
}
getURLFetcherStack(env).push(url);
added++;
} catch (MalformedURLException e) { failed++; }
prop.put("upload", "1");
prop.put("upload_added", added);
prop.put("upload_failed", failed);
} catch (Exception e) {
e.printStackTrace();
prop.put("upload", "2");
prop.putHTML("upload_error", e.getMessage());
}
}
}
}
}
putFetched(prop);
prop.put("urlCount", getURLFetcherStack(env).size());
prop.put("totalFetched", getURLFetcherStack(env).getPopped());
prop.put("totalAdded", getURLFetcherStack(env).getPushed());
prop.put("maxSize", maxURLsPerFetch);
prop.put("locurls", sb.crawlQueues.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE));
prop.put("remurls", sb.crawlQueues.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT));
prop.put("locurlsVal", Math.min(sb.crawlQueues.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE), 500));
prop.put("remurlsVal", Math.min(sb.crawlQueues.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT), 500));
return prop;
}
private static void putFetched(serverObjects prop) {
Iterator it = fetchMap.keySet().iterator();
int count = 0;
while (it.hasNext()) {
String key = (String)it.next();
prop.putHTML("peers_" + count + "_peer", key);
prop.put("peers_" + count + "_amount", ((Integer)fetchMap.get(key)).intValue());
count++;
}
prop.put("peers", count);
}
private static int addURLs(String[] urls, boolean blCheck, URLFetcherStack stack) {
int count = -1;
for (int i=0; i<urls.length; i++)
if (addURL(urls[i], blCheck, stack)) count++;
return count;
}
private static boolean addURL(String url, boolean blCheck, URLFetcherStack stack) {
try {
if (url == null || url.length() == 0) return false;
yacyURL u = new yacyURL(url, null);
if (blCheck && plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_CRAWLER, u)) return false;
stack.push(u);
return true;
} catch (MalformedURLException e) { return false; }
}
private static int shiftFromNotice(plasmaCrawlNURL nurl, int fromStackType, URLFetcherStack stack, int count) {
plasmaCrawlEntry entry;
int failed = 0;
for (int i=0; i<count; i++) try {
entry = nurl.pop(fromStackType, false);
stack.push(entry.url());
} catch (IOException e) { failed++; }
return failed;
}
private static int addURLs(serverObjects post, int amount, URLFetcherStack stack) {
int count = 0;
String url;
for (int i=0; i<amount; i++) {
url = post.get("url" + i, null);
if (url == null || url.length() == 0) continue;
try {
stack.push(new yacyURL(url, null));
count++;
} catch (MalformedURLException e) {
serverLog.logInfo("URLFETCHER", "retrieved invalid url for adding to the stack: " + url);
}
}
return count;
}
}

@@ -1,107 +0,0 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>YaCy '#[clientname]#': URL Fetcher Management</title>
#%env/templates/metas.template%#
</head>
<body id="CrawlURLFetch_p">
#%env/templates/header.template%#
#%env/templates/submenuCrawlURLFetch.template%#
<h2>URL-Fetcher</h2>
<form method="post" action="CrawlURLFetch_p.html" enctype="multipart/form-data">
<fieldset><legend>Fetch new URLs to crawl</legend>
<p>
The newly added URLs will be crawled without any filter restrictions except for the <em>static</em> stop-words.
The Re-Crawl option isn't used and the sites won't be stored in the Proxy Cache. Text and media types will be indexed.
Since these URLs will be requested explicitly from another peer, they won't be distributed for remote indexing.
</p>
<dl>
<dt><label for="url">Fetch from URL</label>:</dt>
<dd>
<input type="radio" name="source" value="url" id="url" checked="checked" />
<input type="text" id="host" name="host" size="60" value="#[host]#" />
#(hostError)#::&nbsp;<span class="error">Malformed URL</span>#(/hostError)#
#(saved)#::
</dd>
<dt><label for="savedURL">Or select previously entered URL</label>:</dt>
<dd>
<input type="radio" name="source" id="savedURL" value="saved" />
<select name="saved">#{urls}#
<option>#[url]#</option>#{/urls}#
</select>#(/saved)#
</dd>
#(peersKnown)#::
<dt><label for="peer">Fetch from Peer</label>:</dt>
<dd>
<input type="radio" name="source" value="peer" id="peer" />
<select name="peerhash">
<option value="random" selected="selected">Choose a random peer</option>#{peers}#
<option value="#[hash]#">#[name]#</option>#{/peers}#
</select>
<input type="submit" name="checkPeerURLCount" value="Check URL count" />
&nbsp;<label for="amount">Amount of URLs to request</label>:
<input type="text" name="amount" id="amount" value="50" maxlength="3" size="3" />
#(peerError)#::
&nbsp;<span class="error">Error fetching URL-list from <span class="tt">#[hash]#:#[name]#</span></span>::
&nbsp;<span class="error">Peer with hash <span class="tt">#[hash]#</span> doesn't seem to be online anymore</span>#(/peerError)#
</dd>#(/peersKnown)#
<dt>Frequency:</dt>
<dd>
<input type="radio" name="reg" value="once" id="once" checked="checked" /> <label for="once">Fetch only once</label><br />
<input type="radio" name="reg" value="self_det" id="self_det" disabled="disabled"/> <label for="self_det">Fetch when queue is empty</label><br />
<input type="radio" name="reg" value="delay" id="delay" /> <label for="delay">Fetch in a specified delay</label>:
<label for="frequency">every</label>
&nbsp;<input type="text" name="frequency" id="frequency" size="2" style="text-align: right;" maxlength="2"/>
<select name="freq_type">
<option value="days">Days</option>
<option value="hours" selected="selected">Hours</option>
<option value="minutes">Minutes</option>
</select>
#(freqError)#::&nbsp;<span class="error">Invalid period, fetching only once</span>#(/freqError)#
</dd>
<dt><input type="submit" name="start" value="Fetch URLs" /></dt>
</dl>
</fieldset>
</form>
#(threadError)#::
<span class="error">Error on stopping thread, it isn't alive anymore</span>::
<span class="error">Error on restarting thread, it isn't alive anymore</span>#(/threadError)#
#(runs)#::
<form method="post" action="CrawlURLFetch_p.html" enctype="multipart/form-data">
<fieldset><legend>Thread to fetch URLs is #(status)#running::stopped::paused#(/status)#</legend>
<dl>
<dt>Total runs:</dt><dd>#[totalRuns]#</dd>
<dt>Total fetched URLs:</dt><dd>#[totalFetchedURLs]#</dd>
<dt>Total failed URLs:</dt><dd>#[totalFailedURLs]#</dd>
<dt>Last run duration:</dt><dd>#[lastRun]# ms</dd>
<dt>Last server response:</dt><dd>#[lastServerResponse]#</dd>
<dt>Last fetched URLs:</dt><dd>#[lastFetchedURLs]#</dd>
<dt>Last failed URLs:</dt>
<dd>
#[error]#
<ul>#{error}#
<li><span class="error">#[reason]#</span>: <a href="#[url]#">#[url]#</a></li>#{/error}#
</ul>
</dd>
<dt><label for="newDelay">Re-set delay</label>:</dt>
<dd>
<input type="text" name="newDelay" id="newDelay" maxlength="2" size="6" value="#[curDelay]#" style="text-align: right;" /> minutes
<input type="submit" name="resetDelay" value="Set new delay" />
</dd>
<dt>#(status)#
<input type="submit" name="stop" value="Stop Thread" />::
<input type="submit" name="restart" value="Restart Thread" />::
<input type="submit" name="stop" value="Stop Thread" />
<input type="submit" name="restart" value="Restart Thread" />#(/status)#
</dt>
</dl>
</fieldset>
</form>
#(/runs)#
#%env/templates/footer.template%#
</body>
</html>

@@ -1,543 +0,0 @@
// CrawlURLFetch_p.java
// -------------------------------------
// part of YACY
//
// (C) 2007 by Franz Brausze
//
// last change: $LastChangedDate: $ by $LastChangedBy: $
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this software or this documentation. The usage of this software
// is on your own risk. The installation and usage (starting/running) of this
// software may allow other people or application to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
//
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follows this copyright notice here, but changes must not be
// done inside the copyright notice above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Random;
import java.util.TreeMap;
import de.anomic.plasma.plasmaCrawlProfile;
import de.anomic.plasma.plasmaCrawlZURL;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverByteBuffer;
import de.anomic.server.serverSwitch;
import de.anomic.http.httpHeader;
import de.anomic.http.httpRemoteProxyConfig;
import de.anomic.http.httpc;
import de.anomic.server.serverObjects;
import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacySeed;
import de.anomic.yacy.yacyURL;
import de.anomic.yacy.yacyVersion;
public class CrawlURLFetch_p {
private static final long ERR_DATE = 1;
private static final long ERR_HOST_MALFORMED_URL = 1;
private static final long ERR_PEER_GENERAL_CONN = 1;
private static final long ERR_PEER_OFFLINE = 2;
private static final long ERR_THREAD_STOP = 1;
private static final long ERR_THREAD_RESUME = 2;
private static final long STAT_THREAD_ALIVE = 0;
private static final long STAT_THREAD_STOPPED = 1;
private static final long STAT_THREAD_PAUSED = 2;
private static URLFetcher fetcher = null;
private static plasmaCrawlProfile.entry profile = null;
private static ArrayList savedURLs = new ArrayList();
public static plasmaCrawlProfile.entry getCrawlProfile(serverSwitch env) {
if (profile == null) {
profile = ((plasmaSwitchboard)env).profilesActiveCrawls.newEntry(
"URLFetcher", // Name
null, // URL
".*", ".*", // General / specific filter
0, 0, // General / specific depth
-1, -1, -1, // Recrawl / Dom-filter depth / Dom-max-pages
true, // Crawl query
true, true, // Index text / media
false, true, // Store in HT- / TX-Cache
false, // Remote indexing
true, false, false); // Exclude static / dynamic / parent stopwords
}
return profile;
}
public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) {
serverObjects prop = new serverObjects();
prop.put("host", "");
// List previously saved URLs for easy selection
listURLs(prop);
// List known hosts
listPeers(prop,
post != null && post.containsKey("checkPeerURLCount"),
((plasmaSwitchboard)env).remoteProxyConfig);
if (post != null) {
if (post.containsKey("start")) {
long frequency = URLFetcher.DELAY_ONCE;
if (post.containsKey("reg")) {
if (post.get("reg", "").equals("self_det")) {
frequency = URLFetcher.DELAY_SELF_DET;
} else if (post.get("reg", "").equals("delay")) {
frequency = getDate(post.get("frequency", ""), post.get("freq_type", ""));
if (frequency == -1)
prop.put("freqError", ERR_DATE);
}
}
int count = 50;
if (post.get("amount", "").matches("\\d+")) {
count = Integer.parseInt(post.get("amount", ""));
if (count > 999) count = 999;
}
if (fetcher != null) fetcher.interrupt();
fetcher = null;
if (post.get("source", "").equals("peer") &&
post.get("peerhash", "").equals("random")) {
fetcher = new URLFetcher(
env,
getCrawlProfile(env),
count,
frequency);
} else {
yacyURL url = null;
if (post.get("source", "").equals("url")) {
try {
url = new yacyURL(post.get("host", null), null);
if (!savedURLs.contains(url.toNormalform(true, true)))
savedURLs.add(url.toNormalform(true, true));
prop.put("host", post.get("host", url.toString()));
} catch (MalformedURLException e) {
prop.put("host", post.get("host", ""));
prop.put("hostError", ERR_HOST_MALFORMED_URL);
}
} else if (post.get("source", "").equals("savedURL")) {
try {
url = new yacyURL(post.get("saved", ""), null);
} catch (MalformedURLException e) {
/* should never appear, except for invalid input, see above */
}
} else if (post.get("source", "").equals("peer")) {
yacySeed ys = null;
ys = yacyCore.seedDB.get(post.get("peerhash", null));
if (ys != null) {
if ((url = URLFetcher.getListServletURL(
ys.getPublicAddress(),
URLFetcher.MODE_LIST,
count,
yacyCore.seedDB.mySeed().hash)) == null) {
prop.put("peerError", ERR_PEER_GENERAL_CONN);
prop.put("peerError_hash", post.get("peerhash", ""));
prop.put("peerError_name", ys.getName());
}
} else {
prop.put("peerError", ERR_PEER_OFFLINE);
prop.put("peerError_hash", post.get("peerhash", ""));
}
}
if (url != null) {
fetcher = new URLFetcher(
env,
getCrawlProfile(env),
url,
count,
frequency);
}
}
if (fetcher != null) fetcher.start();
}
else if (post.containsKey("stop")) {
if (fetcher != null) {
fetcher.interrupt();
} else {
prop.put("threadError", ERR_THREAD_STOP);
}
}
else if (post.containsKey("restart")) {
if (fetcher != null) {
fetcher.interrupt();
if (fetcher.url == null) {
fetcher = new URLFetcher(
env,
getCrawlProfile(env),
fetcher.count,
fetcher.delay);
} else {
fetcher = new URLFetcher(
env,
getCrawlProfile(env),
fetcher.url,
fetcher.count,
fetcher.delay);
}
fetcher.start();
} else {
prop.put("threadError", ERR_THREAD_RESUME);
}
}
else if (post.containsKey("resetDelay")) {
final long frequency = getDate(post.get("newDelay", ""), "minutes");
if (frequency == -1) {
prop.put("freqError", ERR_DATE);
} else {
fetcher.delay = frequency;
}
}
prop.put("LOCATION", "/CrawlURLFetch_p.html");
}
if (fetcher != null) {
prop.put("runs", "1");
prop.put("runs_status",
((fetcher.paused && fetcher.isAlive()) ? STAT_THREAD_PAUSED :
(fetcher.isAlive()) ? STAT_THREAD_ALIVE : STAT_THREAD_STOPPED));
prop.putNum("runs_totalRuns", URLFetcher.totalRuns);
prop.putNum("runs_totalFetchedURLs", URLFetcher.totalFetchedURLs);
prop.putNum("runs_totalFailedURLs", URLFetcher.totalFailed);
prop.putNum("runs_lastRun", fetcher.lastRun);
prop.putNum("runs_lastFetchedURLs", fetcher.lastFetchedURLs);
prop.put("runs_lastServerResponse", (fetcher.lastServerResponse == null)
? "" : fetcher.lastServerResponse);
prop.putNum("runs_curDelay", (int)(fetcher.delay / 60000));
Iterator it = fetcher.failed.keySet().iterator();
int i = 0;
Object key;
while (it.hasNext()) {
key = it.next();
prop.put("runs_error_" + i + "_reason", fetcher.failed.get(key));
prop.put("runs_error_" + i + "_url", (String)key);
i++;
}
prop.put("runs_error", i);
}
return prop;
}
private static int listURLs(serverObjects prop) {
if (savedURLs.size() == 0) return 0;
prop.put("saved", "1");
for (int i=0; i<savedURLs.size(); i++)
prop.put("saved_urls_" + i + "_url", savedURLs.get(i));
prop.putNum("saved_urls", savedURLs.size());
return savedURLs.size();
}
private static int listPeers(serverObjects prop, boolean checkURLCount, httpRemoteProxyConfig theRemoteProxyConfig) {
int peerCount = 0;
TreeMap hostList = new TreeMap();
String peername;
if (yacyCore.seedDB != null && yacyCore.seedDB.sizeConnected() > 0) {
final Iterator e = yacyCore.seedDB.seedsConnected(true, false, null, yacyVersion.YACY_PROVIDES_CRAWLS_VIA_LIST_HTML);
int dbsize;
while (e.hasNext()) {
yacySeed seed = (yacySeed) e.next();
if (seed != null && !seed.hash.equals(yacyCore.seedDB.mySeed().hash)) {
peername = seed.get(yacySeed.NAME, "nameless");
if (checkURLCount && (dbsize = getURLs2Fetch(seed, theRemoteProxyConfig)) > 0) {
hostList.put(peername + " (" + dbsize + ")", seed.hash);
} else {
hostList.put(peername, seed.hash);
}
}
}
}
if (hostList.size() > 0) {
while (!hostList.isEmpty() && (peername = (String) hostList.firstKey()) != null) {
final String hash = (String) hostList.get(peername);
prop.put("peersKnown_peers_" + peerCount + "_hash", hash);
prop.put("peersKnown_peers_" + peerCount + "_name", peername);
hostList.remove(peername);
peerCount++;
}
prop.put("peersKnown_peers", peerCount);
prop.put("peersKnown", "1");
} else {
prop.put("peersKnown", "0");
}
return peerCount;
}
private static int getURLs2Fetch(yacySeed seed, httpRemoteProxyConfig theRemoteProxyConfig) {
try {
String answer = new String(httpc.wget(
URLFetcher.getListServletURL(seed.getPublicAddress(), URLFetcher.MODE_COUNT, 0, null),
seed.getIP(),
5000,
null, null,
theRemoteProxyConfig,
null,
null));
if (answer.matches("\\d+"))
return Integer.parseInt(answer);
else {
serverLog.logFine("URLFETCHER", "Retrieved invalid answer from " + seed.getName() + ": '" + answer + "'");
return -1;
}
} catch (MalformedURLException e) {
/* should not happen */
return -3;
} catch (IOException e) {
return -2;
}
}
private static long getDate(String count, String type) {
long r = 0;
if (count != null && count.matches("\\d+")) r = Long.parseLong(count);
if (r < 1) return -1;
r *= 60000;
if (type.equals("days")) return r * 60 * 24;
else if (type.equals("hours")) return r * 60;
else if (type.equals("minutes")) return r;
else return -1;
}
public static class URLFetcher extends Thread {
public static final long DELAY_ONCE = -1;
public static final long DELAY_SELF_DET = 0;
public static final int MODE_LIST = 0;
public static final int MODE_COUNT = 1;
public static int totalRuns = 0;
public static int totalFetchedURLs = 0;
public static int totalFailed = 0;
public final HashMap failed = new HashMap();
public int lastFetchedURLs = 0;
public long lastRun = 0;
public String lastServerResponse = null;
public int lastFailed = 0;
public final yacyURL url;
public final int count;
public long delay;
public final plasmaSwitchboard sb;
public final plasmaCrawlProfile.entry profile;
public boolean paused = false;
public static yacyURL getListServletURL(String host, int mode, int count, String peerHash) {
String r = "http://" + host + "/yacy/list.html?list=queueUrls&display=";
switch (mode) {
case MODE_LIST: r += "list"; break;
case MODE_COUNT: r += "count"; break;
}
if (count > 0) r += "&count=" + count;
if (peerHash != null && peerHash.length() > 0) {
r += "&iam=" + peerHash;
} else if (mode == MODE_LIST) {
r += "&iam=" + yacyCore.seedDB.mySeed().hash;
}
try {
return new yacyURL(r, null);
} catch (MalformedURLException e) {
return null;
}
}
public URLFetcher(
serverSwitch env,
plasmaCrawlProfile.entry profile,
yacyURL url,
int count,
long delayMs) {
if (env == null || profile == null || url == null)
throw new NullPointerException("env, profile or url must not be null");
this.sb = (plasmaSwitchboard)env;
this.profile = profile;
this.url = url;
this.count = count;
this.delay = delayMs;
this.setName("URLFetcher");
}
public URLFetcher(
serverSwitch env,
plasmaCrawlProfile.entry profile,
int count,
long delayMs) {
if (env == null || profile == null)
throw new NullPointerException("env or profile must not be null");
this.sb = (plasmaSwitchboard)env;
this.profile = profile;
this.url = null;
this.count = count;
this.delay = delayMs;
this.setName("URLFetcher");
}
public void run() {
this.paused = false;
long start;
yacyURL url;
while (!isInterrupted()) {
try {
start = System.currentTimeMillis();
url = getDLURL();
if (url == null) {
serverLog.logSevere(this.getName(), "canceled because no valid URL for the URL-list could be determinded");
return;
}
totalFetchedURLs += stackURLs(getURLs(url));
this.lastRun = System.currentTimeMillis() - start;
totalRuns++;
serverLog.logInfo(this.getName(), "Loaded " + this.lastFetchedURLs + " URLs from " + url + " in " + this.lastRun + " ms into stackcrawler.");
if (this.delay < 0 || isInterrupted()) {
return;
} else synchronized (this) {
if (this.delay == 0) {
this.paused = true;
while (this.paused) this.wait();
} else {
this.paused = true;
this.wait(this.delay);
}
}
this.paused = false;
} catch (InterruptedException e) { return; }
}
}
private yacyURL getDLURL() {
if (this.url != null) return this.url;
// choose random seed
yacySeed ys = null;
Iterator e = yacyCore.seedDB.seedsConnected(true, false, null, yacyVersion.YACY_PROVIDES_CRAWLS_VIA_LIST_HTML);
int num = new Random().nextInt(yacyCore.seedDB.sizeConnected()) + 1;
Object o;
for (int i=0; i<num && e.hasNext(); i++) {
o = e.next();
if (o != null) ys = (yacySeed)o;
}
if (ys == null) return null;
return getListServletURL(ys.getPublicAddress(), MODE_LIST, this.count, yacyCore.seedDB.mySeed().hash);
}
private int stackURLs(ArrayList /*of yacyURL*/ urls) {
this.lastFailed = 0;
this.lastFetchedURLs = 0;
this.failed.clear();
if (urls == null) return 0;
String reason;
yacyURL url;
for (int i = 0; i < urls.size() && !isInterrupted(); i++) {
url = (yacyURL) urls.get(i);
reason = this.sb.crawlStacker.stackCrawl(
url,
null,
yacyCore.seedDB.mySeed().hash,
null,
new Date(),
this.profile.generalDepth(),
this.profile);
if (reason == null) {
serverLog.logFine(this.getName(), "stacked " + url);
this.lastFetchedURLs++;
} else {
serverLog.logFine(this.getName(), "error on stacking " + url + ": " + reason);
this.lastFailed++;
totalFailed++;
this.failed.put(url, reason);
plasmaCrawlZURL.Entry ee = this.sb.crawlQueues.errorURL.newEntry(
url,
reason);
ee.store();
this.sb.crawlQueues.errorURL.push(ee);
}
}
return this.lastFetchedURLs;
}
private ArrayList /*of yacyURL */ getURLs(yacyURL url) {
if (url == null) return null;
ArrayList a = new ArrayList();
try {
httpc con = new httpc(
url.getHost(),
url.getHost(),
url.getPort(),
15000,
url.getProtocol().equals("https"),
plasmaSwitchboard.getSwitchboard().remoteProxyConfig, null, null);
httpHeader header = new httpHeader();
header.put(httpHeader.ACCEPT_ENCODING, "US-ASCII");
header.put(httpHeader.HOST, url.getHost());
httpc.response res = con.GET(url.getPath() + "?" + url.getQuery(), header);
serverLog.logFine(this.getName(), "downloaded URL-list from " + url + " (" + res.statusCode + ")");
this.lastServerResponse = res.statusCode + " (" + res.statusText + ")";
if (res.status.startsWith("2")) {
serverByteBuffer sbb = new serverByteBuffer();
//byte[] cbs = res.writeContent();
res.writeContent(sbb, null);
String encoding = res.responseHeader.getCharacterEncoding();
if (encoding == null) encoding = "US-ASCII";
String[] s = (new String(sbb.getBytes(), encoding)).split("\n");
for (int i = 0; i < s.length; i++) {
try {
a.add(new yacyURL(s[i], null));
} catch (MalformedURLException e) {}
}
}
con.close();
} catch (IOException e) { }
return a;
}
}
}

@@ -109,7 +109,7 @@ public class PerformanceQueues_p {
}
prop.putHTML("table_" + c + "_longdescr", thread.getLongDescription(), xml);
queuesize = thread.getJobCount();
prop.put("table_" + c + "_queuesize", (queuesize == Integer.MAX_VALUE) ? "unknown" : yFormatter.number(queuesize, !xml));
prop.put("table_" + c + "_queuesize", (queuesize == Integer.MAX_VALUE) ? "unlimited" : yFormatter.number(queuesize, !xml));
blocktime = thread.getBlockTime();
sleeptime = thread.getSleepTime();

@@ -118,9 +118,9 @@ public class ScreenSaver {
remoteTriggeredCrawlStarted = true;
sb.continueCrawlJob(plasmaSwitchboard.CRAWLJOB_REMOTE_TRIGGERED_CRAWL);
}
if (sb.crawlJobIsPaused(plasmaSwitchboard.CRAWLJOB_GLOBAL_CRAWL_TRIGGER)) {
if (sb.crawlJobIsPaused(plasmaSwitchboard.CRAWLJOB_REMOTE_CRAWL_LOADER)) {
globalCrawlTriggerStarted = true;
sb.continueCrawlJob(plasmaSwitchboard.CRAWLJOB_GLOBAL_CRAWL_TRIGGER);
sb.continueCrawlJob(plasmaSwitchboard.CRAWLJOB_REMOTE_CRAWL_LOADER);
}
} else if (line.equals("EXIT")) {
outputWriter.println("OK");
@@ -143,7 +143,7 @@ public class ScreenSaver {
sb.pauseCrawlJob(plasmaSwitchboard.CRAWLJOB_REMOTE_TRIGGERED_CRAWL);
}
if (globalCrawlTriggerStarted) {
sb.pauseCrawlJob(plasmaSwitchboard.CRAWLJOB_GLOBAL_CRAWL_TRIGGER);
sb.pauseCrawlJob(plasmaSwitchboard.CRAWLJOB_REMOTE_CRAWL_LOADER);
}
}
}

@@ -91,8 +91,6 @@ public class Status {
sb.pauseCrawlJob(plasmaSwitchboard.CRAWLJOB_LOCAL_CRAWL);
else if (jobType.equals("remoteTriggeredCrawl"))
sb.pauseCrawlJob(plasmaSwitchboard.CRAWLJOB_REMOTE_TRIGGERED_CRAWL);
else if (jobType.equals("globalCrawlTrigger"))
sb.pauseCrawlJob(plasmaSwitchboard.CRAWLJOB_GLOBAL_CRAWL_TRIGGER);
redirect = true;
} else if (post.containsKey("continueCrawlJob")) {
String jobType = (String) post.get("jobType");
@@ -100,8 +98,6 @@ public class Status {
sb.continueCrawlJob(plasmaSwitchboard.CRAWLJOB_LOCAL_CRAWL);
else if (jobType.equals("remoteTriggeredCrawl"))
sb.continueCrawlJob(plasmaSwitchboard.CRAWLJOB_REMOTE_TRIGGERED_CRAWL);
else if (jobType.equals("globalCrawlTrigger"))
sb.continueCrawlJob(plasmaSwitchboard.CRAWLJOB_GLOBAL_CRAWL_TRIGGER);
redirect = true;
} else if (post.containsKey("ResetTraffic")) {
httpdByteCountInputStream.resetCount();
@@ -346,9 +342,6 @@ public class Status {
prop.putNum("remoteTriggeredCrawlQueueSize", sb.getThread(plasmaSwitchboard.CRAWLJOB_REMOTE_TRIGGERED_CRAWL).getJobCount());
prop.put("remoteTriggeredCrawlPaused",sb.crawlJobIsPaused(plasmaSwitchboard.CRAWLJOB_REMOTE_TRIGGERED_CRAWL) ? "1" : "0");
prop.putNum("globalCrawlTriggerQueueSize", sb.getThread(plasmaSwitchboard.CRAWLJOB_GLOBAL_CRAWL_TRIGGER).getJobCount());
prop.put("globalCrawlTriggerPaused",sb.crawlJobIsPaused(plasmaSwitchboard.CRAWLJOB_GLOBAL_CRAWL_TRIGGER) ? "1" : "0");
prop.putNum("stackCrawlQueueSize", sb.crawlStacker.size());

@@ -76,12 +76,6 @@
<td><a href="Status.html?#(remoteTriggeredCrawlPaused)#pauseCrawlJob::continueCrawlJob#(/remoteTriggeredCrawlPaused)#=&amp;jobType=remoteTriggeredCrawl" title="#(remoteTriggeredCrawlPaused)#pause remote triggered crawl::continue remote triggered crawl#(/remoteTriggeredCrawlPaused)#"><img src="env/grafics/#(remoteTriggeredCrawlPaused)#stop.gif::start.gif#(/remoteTriggeredCrawlPaused)#" alt="#(remoteTriggeredCrawlPaused)#pause remote triggered crawl::continue remote triggered crawl#(/remoteTriggeredCrawlPaused)#" style="width:12px;height:12px;" /></a></td>
<td>#(remoteTriggeredCrawlPaused)#&nbsp;::(paused)#(/remoteTriggeredCrawlPaused)#</td>
</tr>
<tr>
<td><a href="IndexCreateWWWGlobalQueue_p.html">Global Crawl Trigger</a></td>
<td>#[globalCrawlTriggerQueueSize]#</td>
<td><a href="Status.html?#(globalCrawlTriggerPaused)#pauseCrawlJob::continueCrawlJob#(/globalCrawlTriggerPaused)#=&amp;jobType=globalCrawlTrigger" title="#(globalCrawlTriggerPaused)#pause global crawl trigger::continue global crawl trigger#(/globalCrawlTriggerPaused)#"><img src="env/grafics/#(globalCrawlTriggerPaused)#stop.gif::start.gif#(/globalCrawlTriggerPaused)#" alt="#(globalCrawlTriggerPaused)#pause global crawl trigger::continue global crawl trigger#(/globalCrawlTriggerPaused)#" style="width:12px;height:12px;" /></a></td>
<td>#(globalCrawlTriggerPaused)#&nbsp;::(paused)#(/globalCrawlTriggerPaused)#</td>
</tr>
<tr>
<td>Pre-Queueing</td>
<td>#[stackCrawlQueueSize]#</td>

@@ -44,6 +44,16 @@
</td>
<td align="right">unlimited</td>
</tr>
<tr class="TableCellLight">
<td align="left">Limit Crawler</td>
<td align="right"><span id="limitcrawlerqueuesize">&nbsp;&nbsp;&nbsp;</span></td>
<td>
<a href="" title="" id="limitcrawlerstateA">
<img src="" alt="" style="width:12px; height:12px;" id="limitcrawlerstateIMG" />
</a>
</td>
<td align="right">unlimited</td>
</tr>
<tr class="TableCellLight">
<td align="left">Remote Crawler</td>
<td align="right"><span id="remotecrawlerqueuesize">&nbsp;&nbsp;&nbsp;</span></td>

@@ -71,7 +71,7 @@ public class WatchCrawler_p {
} else {
prop.put("info", "0");
if ((post.containsKey("autoforward")) && (switchboard.crawlQueues.coreCrawlJobSize() == 0)) {
if ((post.containsKey("autoforward")) && (switchboard.crawlQueues.coreCrawlJobSize() == 0) && (switchboard.crawlQueues.remoteTriggeredCrawlJobSize() == 0)) {
prop.put("forwardToCrawlStart", "1");
}
@@ -81,7 +81,7 @@ public class WatchCrawler_p {
if (queue.equals("localcrawler")) {
switchboard.continueCrawlJob(plasmaSwitchboard.CRAWLJOB_LOCAL_CRAWL);
} else if (queue.equals("remotecrawler")) {
switchboard.continueCrawlJob(plasmaSwitchboard.CRAWLJOB_GLOBAL_CRAWL_TRIGGER);
switchboard.continueCrawlJob(plasmaSwitchboard.CRAWLJOB_REMOTE_TRIGGERED_CRAWL);
}
}
@@ -91,7 +91,7 @@ public class WatchCrawler_p {
if (queue.equals("localcrawler")) {
switchboard.pauseCrawlJob(plasmaSwitchboard.CRAWLJOB_LOCAL_CRAWL);
} else if (queue.equals("remotecrawler")) {
switchboard.pauseCrawlJob(plasmaSwitchboard.CRAWLJOB_GLOBAL_CRAWL_TRIGGER);
switchboard.pauseCrawlJob(plasmaSwitchboard.CRAWLJOB_REMOTE_TRIGGERED_CRAWL);
}
}

@@ -1,7 +0,0 @@
<div class="SubMenu">
<h3>URL Fetcher Menu</h3>
<ul class="SubMenu">
<li><a href="/CrawlURLFetch_p.html" class="MenuItemLink lock">URL Fetcher</a></li>
<li><a href="/CrawlURLFetchStack_p.html" class="MenuItemLink lock">URL Stack</a></li>
</ul>
</div>

@@ -150,9 +150,16 @@ function handleQueues(){
updateTable(localcrawlerqueue, "local crawler");
limitcrawlerqueue=getFirstChild(xml, "limitcrawlerqueue");
updateTable(limitcrawlerqueue, "limitCrawlerTable");
limitcrawlerqueue_size=getValue(getFirstChild(limitcrawlerqueue, "size"));
limitcrawlerqueue_state=getValue(getFirstChild(limitcrawlerqueue, "state"));
document.getElementById("limitcrawlerqueuesize").firstChild.nodeValue=limitcrawlerqueue_size;
putQueueState("limitcrawler", limitcrawlerqueue_state);
updateTable(limitcrawlerqueue, "limit crawler");
remotecrawlerqueue=getFirstChild(xml, "remotecrawlerqueue");
updateTable(remotecrawlerqueue, "remoteCrawlerTable");
remotecrawlerqueue_size=getValue(getFirstChild(remotecrawlerqueue, "size"));
remotecrawlerqueue_state=getValue(getFirstChild(remotecrawlerqueue, "state"));
document.getElementById("remotecrawlerqueuesize").firstChild.nodeValue=remotecrawlerqueue_size;

@@ -0,0 +1,30 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>YaCy '#[clientname]#': Index Control</title>
#%env/templates/metas.template%#
</head>
<body id="IndexControl">
#%env/templates/header.template%#
<h2>remote crawl fetch test</h2>
<form name="selection" action="rct_p.html" method="post" enctype="multipart/form-data">
<fieldset><legend>Retrieve remote crawl url list</legend>
<dl>
<dt class="TableCellDark">Target Peer:</dt>
<dd>select <select name="peer">
#{hosts}#
<option value="#[hosthash]#">#[hostname]#</option>
#{/hosts}#
</select>
</dd>
<dt class="TableCellLight"></dt>
<dd><input type="submit" name="retrieve" value="retrieve" />
</dd>
</dl>
</fieldset>
</form>
#%env/templates/footer.template%#
</body>
</html>

@@ -0,0 +1,124 @@
// rct_p.java
// -----------------------
// (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 28.11.2007 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2007-11-14 01:15:28 +0000 (Mi, 14 Nov 2007) $
// $LastChangedRevision: 4216 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import java.net.MalformedURLException;
import java.text.ParseException;
import java.util.Date;
import java.util.Iterator;
import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverDate;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.xml.rssReader;
import de.anomic.yacy.yacyClient;
import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacySeed;
import de.anomic.yacy.yacyURL;
public class rct_p {
public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) {
// return variable that accumulates replacements
plasmaSwitchboard sb = (plasmaSwitchboard) env;
serverObjects prop = new serverObjects();
if (post != null) {
if (post.containsKey("retrieve")) {
String peerhash = post.get("peer", null);
yacySeed seed = (peerhash == null) ? null : yacyCore.seedDB.getConnected(peerhash);
rssReader reader = (seed == null) ? null : yacyClient.queryRemoteCrawlURLs(seed, 10);
if (reader != null) {
rssReader.Item item;
for (int i = 0; i < reader.items(); i++) {
item = reader.getItem(i);
//System.out.println("URL=" + item.getLink() + ", desc=" + item.getDescription() + ", pubDate=" + item.getPubDate());
// put url on remote crawl stack
yacyURL url;
try {
url = new yacyURL(item.getLink(), null);
} catch (MalformedURLException e) {
url = null;
}
Date loaddate;
try {
loaddate = serverDate.parseShortSecondTime(item.getPubDate());
} catch (ParseException e) {
loaddate = new Date();
}
yacyURL referrer = null; // referrer needed!
if (sb.acceptURL(url)) {
// stack url
sb.getLog().logFinest("crawlOrder: stack: url='" + url + "'");
String reasonString = sb.crawlStacker.stackCrawl(url, referrer, peerhash, "REMOTE-CRAWLING", loaddate, 0, sb.defaultRemoteProfile);
if (reasonString == null) {
// done
env.getLog().logInfo("crawlOrder: added remote crawl url: " + url.toNormalform(true, false));
} else if (reasonString.startsWith("double")) {
// case where we have already the url loaded;
env.getLog().logInfo("crawlOrder: ignored double remote crawl url: " + url.toNormalform(true, false));
} else {
env.getLog().logInfo("crawlOrder: ignored [" + reasonString + "] remote crawl url: " + url.toNormalform(true, false));
}
} else {
env.getLog().logWarning("crawlOrder: Received URL outside of our domain: " + url.toNormalform(true, false));
}
}
}
}
}
listHosts(prop);
// return rewrite properties
return prop;
}
private static void listHosts(serverObjects prop) {
// list known hosts
yacySeed seed;
int hc = 0;
if (yacyCore.seedDB != null && yacyCore.seedDB.sizeConnected() > 0) {
Iterator e = yacyCore.dhtAgent.getProvidesRemoteCrawlURLs();
while (e.hasNext()) {
seed = (yacySeed) e.next();
if (seed != null) {
prop.put("hosts_" + hc + "_hosthash", seed.hash);
prop.putHTML("hosts_" + hc + "_hostname", seed.hash + " " + seed.get(yacySeed.NAME, "nameless") + " (" + seed.getLong(yacySeed.RCOUNT, 0) + ")");
hc++;
}
}
prop.put("hosts", hc);
} else {
prop.put("hosts", "0");
}
}
}

@@ -163,11 +163,15 @@ public class queues_p {
prop.put("localCrawlState", sb.crawlJobIsPaused(plasmaSwitchboard.CRAWLJOB_LOCAL_CRAWL) ? STATE_PAUSED : STATE_RUNNING);
int stackSize = sb.crawlQueues.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE);
addNTable(prop, "list-local", sb.crawlQueues.noticeURL.top(plasmaCrawlNURL.STACK_TYPE_CORE, Math.min(10, stackSize)));
//global crawl queue
prop.putNum("remoteCrawlSize", Integer.toString(sb.getThread(plasmaSwitchboard.CRAWLJOB_GLOBAL_CRAWL_TRIGGER).getJobCount()));
prop.put("remoteCrawlState", sb.crawlJobIsPaused(plasmaSwitchboard.CRAWLJOB_GLOBAL_CRAWL_TRIGGER) ? STATE_PAUSED : STATE_RUNNING);
prop.putNum("limitCrawlSize", Integer.toString(sb.crawlQueues.limitCrawlJobSize()));
prop.put("limitCrawlState", STATE_RUNNING);
stackSize = sb.crawlQueues.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT);
//global crawl queue
prop.putNum("remoteCrawlSize", Integer.toString(sb.getThread(plasmaSwitchboard.CRAWLJOB_REMOTE_TRIGGERED_CRAWL).getJobCount()));
prop.put("remoteCrawlState", sb.crawlJobIsPaused(plasmaSwitchboard.CRAWLJOB_REMOTE_TRIGGERED_CRAWL) ? STATE_PAUSED : STATE_RUNNING);
stackSize = sb.crawlQueues.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT);
if (stackSize == 0) {

@@ -49,6 +49,22 @@
</entry>
#{/list-local}#
</localcrawlerqueue>
<limitcrawlerqueue>
<size>#[limitCrawlSize]#</size>
<state>#[limitCrawlState]#</state>
#{list-limit}#
<entry>
<profile>#[profile]#</profile>
<initiator>#[initiator]#</initiator>
<depth>#[depth]#</depth>
<modified>#[modified]#</modified>
<anchor>#[anchor]#</anchor>
<url>#[url]#</url>
<hash>#[hash]#</hash>
<inProcess>#(inProcess)#false::true#(/inProcess)#</inProcess>
</entry>
#{/list-limit}#
</limitcrawlerqueue>
<remotecrawlerqueue>
<size>#[remoteCrawlSize]#</size>
<state>#[remoteCrawlState]#</state>

@@ -129,7 +129,7 @@ public final class crawlOrder {
delay = "3600"; // may request one hour later again
} else try {
yacySeed requester = yacyCore.seedDB.getConnected(iam);
int queuesize = switchboard.crawlQueues.coreCrawlJobSize() + switchboard.crawlQueues.limitCrawlTriggerJobSize() + switchboard.crawlQueues.remoteTriggeredCrawlJobSize() + switchboard.queueSize();
int queuesize = switchboard.crawlQueues.coreCrawlJobSize() + switchboard.crawlQueues.limitCrawlJobSize() + switchboard.crawlQueues.remoteTriggeredCrawlJobSize() + switchboard.queueSize();
if (requester == null) {
response = "denied";
reason = "unknown-client";
@@ -190,7 +190,7 @@ public final class crawlOrder {
env.getLog().logWarning("crawlOrder: Received not normalized Referer URL " + refv.get(0) + " of URL " + urlv.get(0));
}
if (!switchboard.acceptURL(new yacyURL(newURL, null))) {
if (!switchboard.acceptURL(url)) {
env.getLog().logWarning("crawlOrder: Received URL outside of our domain: " + newURL);
return null;
}

@@ -1,152 +0,0 @@
// list.java
// -----------------------
// part of YaCy
// (C) by Michael Peter Christen; mc@anomic.de
// first published on http://www.anomic.de
// Frankfurt, Germany, 2004
//
// This File is contributed by Alexander Schier
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this software or this documentation. The usage of this software
// is on your own risk. The installation and usage (starting/running) of this
// software may allow other people or application to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
//
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follows this copyright notice here, but changes must not be
// done inside the copyright notice above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.
// You must compile this file with
// javac -classpath .:../../classes list.java
// if the shell's current path is HTROOT
// contains contributions by [FB] to support listing URLs for URL Fetcher
import java.io.File;
import de.anomic.data.URLFetcherStack;
import de.anomic.data.htmlTools;
import de.anomic.data.listManager;
import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverCore;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacyNetwork;
import de.anomic.yacy.yacySeed;
import de.anomic.yacy.yacyURL;
public final class list {
public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) {
if (post == null || env == null)
throw new NullPointerException("post: " + post + ", sb: " + env);
plasmaSwitchboard sb = (plasmaSwitchboard) env;
// return variable that accumulates replacements
final serverObjects prop = new serverObjects();
if ((post == null) || (env == null)) return prop;
if (!yacyNetwork.authentifyRequest(post, env)) return prop;
final String col = post.get("col", "");
final File listsPath = env.getConfigPath(plasmaSwitchboard.LISTS_PATH, plasmaSwitchboard.LISTS_PATH_DEFAULT);
String otherPeerName = null;
if (post.containsKey("iam")) {
yacySeed bla = yacyCore.seedDB.get(post.get("iam", ""));
if (bla != null) otherPeerName = bla.getName();
}
if (otherPeerName == null) otherPeerName = (String)header.get(httpHeader.CONNECTION_PROP_CLIENTIP);
if ((sb.isRobinsonMode()) && (!sb.isInMyCluster(otherPeerName))) {
// if we are a robinson cluster, answer only if this client is known by our network definition
return null;
}
if (col.equals("black")) {
final StringBuffer out = new StringBuffer();
final String filenames=env.getConfig("BlackLists.Shared", "");
final String[] filenamesarray = filenames.split(",");
if(filenamesarray.length > 0){
for(int i = 0;i < filenamesarray.length; i++){
String filename = filenamesarray[i];
File fileObj = new File(listsPath,filename);
out.append(listManager.getListString(fileObj, false))
.append(serverCore.crlfString);
}
} // if filenamesarray.length > 0
prop.put("list",out.toString());
}
// start contrib by [FB]
else if (col.length() == 0 && post.get("list", "").equals("queueUrls")) {
final URLFetcherStack db = CrawlURLFetchStack_p.getURLFetcherStack(env);
final String display = post.get("display", "list");
if (display.equals("list")) {
// list urls from remote crawler queue for other peers
final int count = Math.min(post.getInt("count", 50), CrawlURLFetchStack_p.maxURLsPerFetch);
if (count > 0 && db.size() > 0) {
final StringBuffer b = new StringBuffer();
yacyURL url;
int cnt = 0;
for (int i=0; i<count; i++) {
if ((url = db.pop()) == null) continue;
b.append(htmlTools.decodeHtml2Unicode(url.toNormalform(false, true))).append("\n");
cnt++;
}
prop.put("list", b.toString());
CrawlURLFetchStack_p.fetchMap.put(
otherPeerName,
new Integer(((CrawlURLFetchStack_p.fetchMap.get(otherPeerName) == null)
? 0
: ((Integer)CrawlURLFetchStack_p.fetchMap.get(otherPeerName)).intValue()) + cnt));
serverLog.logInfo("URLFETCHER", "sent " + cnt + " URLs to " + otherPeerName);
} else {
prop.put("list", "");
serverLog.logInfo("URLFETCHER", "couldn't satisfy URL request from " + otherPeerName + ": stack is empty");
}
} else if (display.equals("count")) {
prop.put("list", db.size());
}
// end contrib by [FB]
} else {
prop.put("list","");
}
return prop;
}
}

@@ -34,16 +34,15 @@ import de.anomic.server.serverDate;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacyNetwork;
public class urls {
public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) {
plasmaSwitchboard sb = (plasmaSwitchboard) env;
// return variable that accumulates replacements
// insert default values
serverObjects prop = new serverObjects();
prop.put("iam", yacyCore.seedDB.mySeed().hash);
prop.put("response", "rejected - insufficient call parameters");
prop.put("channel_title", "");
@ -51,7 +50,8 @@ public class urls {
prop.put("channel_pubDate", "");
prop.put("item", "0");
if ((post == null) || (env == null)) return prop;
if (!yacyNetwork.authentifyRequest(post, env)) return prop;
if (post.get("call", "").equals("remotecrawl")) {
// perform a remote crawl url handover
@ -67,6 +67,9 @@ public class urls {
break;
}
if (entry == null) break;
// move the url to the delegated-url db; it is now handed over to the requesting peer
sb.crawlQueues.delegatedURL.push(sb.crawlQueues.delegatedURL.newEntry(entry.url(), "client=____________"));
// create RSS entry
prop.put("item_" + c + "_title", "");
prop.putHTML("item_" + c + "_link", entry.url().toNormalform(true, false));
prop.putHTML("item_" + c + "_description", entry.name());
@ -85,3 +88,106 @@ public class urls {
}
}
/*
from http://88.64.186.183:9999/yacy/urls.xml?count=10&call=remotecrawl
<?xml version="1.0"?>
<!-- this is not exactly rss format, but similar -->
<rss>
<!-- YaCy standard response header -->
<yacy version="0.5540423">
<iam>c_32kgI-4HTE</iam>
<uptime>3226</uptime>
<mytime>20071128030353</mytime>
<response>ok</response>
</yacy>
<!-- rss standard channel -->
<channel>
<title></title>
<description></description>
<pubDate></pubDate>
<!-- url items -->
<item>
<title></title>
<link>http://publish.vx.roo.com/australian/ithomepagemini/</link>
<description>sub</description>
<author></author>
<pubDate>20071126173629</pubDate>
<guid>mlD2rBhnfuoY</guid>
</item>
<item>
<title></title>
<link>http://www.news.com.au/story/0%2C23599%2C22835669-2%2C00.html</link>
<description></description>
<author></author>
<pubDate>20071128014306</pubDate>
<guid>qT1GjNRe_5SQ</guid>
</item>
<item>
<title></title>
<link>http://www.news.com.au/perthnow/story/0%2C21598%2C22835663-2761%2C00.html</link>
<description>Driver injured: Willagee crash witnesses sought</description>
<author></author>
<pubDate>20071128014306</pubDate>
<guid>yGMa4uRe_5SQ</guid>
</item>
<item>
<title></title>
<link>http://www.news.com.au/travel/story/0%2C26058%2C22835185-5014090%2C00.html</link>
<description></description>
<author></author>
<pubDate>20071128014306</pubDate>
<guid>qfob36Re_5SQ</guid>
</item>
<item>
<title></title>
<link>http://www.news.com.au/story/0%2C23599%2C22835311-421%2C00.html</link>
<description></description>
<author></author>
<pubDate>20071128014306</pubDate>
<guid>YBLVBNRe_5SQ</guid>
</item>
<item>
<title></title>
<link>http://www.thirdwayblog.com/wp-content/uploads/</link>
<description>sub</description>
<author></author>
<pubDate>20071128010343</pubDate>
<guid>9rnz2MUqGq6Z</guid>
</item>
<item>
<title></title>
<link>http://www.parliament.gr/kouselas/koino_dra/koino_docs/</link>
<description>sub</description>
<author></author>
<pubDate>20071128010343</pubDate>
<guid>hSTvg-u6LxcB</guid>
</item>
<item>
<title></title>
<link>http://upload.wikimedia.org/wikipedia/el/f/f1/</link>
<description>sub</description>
<author></author>
<pubDate>20071128010343</pubDate>
<guid>F-3WVJBs-F4R</guid>
</item>
<item>
<title></title>
<link>http://www.logiprint.nl/nl/Briefpapier_drukken_Eindhoven.html</link>
<description>Briefpapier drukken Eindhoven</description>
<author></author>
<pubDate>20071011104246</pubDate>
<guid>bmBv8j07Ta7B</guid>
</item>
</channel>
</rss>
*/
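/*
Illustration only: the feed above can be pulled with the standard library alone.
Host and port are placeholders, and a real peer additionally expects the
authentication fields that yacyNetwork.basicRequestPost adds to the request.

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;

public class UrlsFeedDemo {
    public static void main(String[] args) throws Exception {
        // count limits how many URLs the peer pops off its stack for us
        URL feed = new URL("http://localhost:8080/yacy/urls.xml?count=10&call=remotecrawl");
        BufferedReader in = new BufferedReader(new InputStreamReader(feed.openStream(), "UTF-8"));
        String line;
        while ((line = in.readLine()) != null) {
            // crude scan for <link> elements; a real client parses the XML properly
            int start = line.indexOf("<link>");
            int end = line.indexOf("</link>");
            if (start >= 0 && end > start) System.out.println(line.substring(start + 6, end));
        }
        in.close();
    }
}
*/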

@ -24,7 +24,7 @@
<description>#[description]#</description>
<author>#[author]#</author>
<pubDate>#[pubDate]#</pubDate>
<guid isPermaLink="false">#[guid]#</guid>
</item>
#{/item}#
</channel>

@ -28,12 +28,14 @@ package de.anomic.plasma.crawler;
import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import de.anomic.data.robotsParser;
import de.anomic.index.indexURLEntry;
import de.anomic.plasma.plasmaCrawlEntry;
import de.anomic.plasma.plasmaCrawlNURL;
import de.anomic.plasma.plasmaCrawlProfile;
@ -41,8 +43,9 @@ import de.anomic.plasma.plasmaCrawlZURL;
import de.anomic.plasma.plasmaHTCache;
import de.anomic.plasma.plasmaParser;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverDate;
import de.anomic.server.logging.serverLog;
import de.anomic.tools.crypt;
import de.anomic.xml.rssReader;
import de.anomic.yacy.yacyClient;
import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacySeed;
@ -54,6 +57,7 @@ public class plasmaCrawlQueues {
private serverLog log;
private HashMap workers; // mapping from url hash to Worker thread object
private plasmaProtocolLoader loader;
private ArrayList remoteCrawlProviderHashes;
public plasmaCrawlNURL noticeURL;
public plasmaCrawlZURL errorURL, delegatedURL;
@ -63,6 +67,7 @@ public class plasmaCrawlQueues {
this.log = new serverLog("CRAWLER");
this.workers = new HashMap();
this.loader = new plasmaProtocolLoader(sb, log);
this.remoteCrawlProviderHashes = new ArrayList();
// start crawling management
log.logConfig("Starting Crawling Management");
@ -108,6 +113,9 @@ public class plasmaCrawlQueues {
Iterator i = workers.values().iterator();
while (i.hasNext()) ((Thread) i.next()).interrupt();
// TODO: wait some more time until all threads are finished
noticeURL.close();
errorURL.close();
delegatedURL.close();
}
public plasmaCrawlEntry[] activeWorker() {
@ -131,18 +139,32 @@ public class plasmaCrawlQueues {
}
public boolean coreCrawlJob() {
boolean robinsonPrivateCase = ((sb.isRobinsonMode()) &&
(!sb.getConfig(plasmaSwitchboard.CLUSTER_MODE, "").equals(plasmaSwitchboard.CLUSTER_MODE_PUBLIC_CLUSTER)) &&
(!sb.getConfig(plasmaSwitchboard.CLUSTER_MODE, "").equals(plasmaSwitchboard.CLUSTER_MODE_PRIVATE_CLUSTER)));
if ((robinsonPrivateCase) || ((coreCrawlJobSize() <= 20) && (limitCrawlJobSize() > 0))) {
// move some tasks to the core crawl job so we have something to do
int toshift = Math.min(10, limitCrawlJobSize()); // this cannot be a big number because the balancer makes a forced waiting if it cannot balance
for (int i = 0; i < toshift; i++) {
noticeURL.shift(plasmaCrawlNURL.STACK_TYPE_LIMIT, plasmaCrawlNURL.STACK_TYPE_CORE);
}
log.logInfo("shifted " + toshift + " jobs from global crawl to local crawl (coreCrawlJobSize()=" + coreCrawlJobSize() +
", limitCrawlJobSize()=" + limitCrawlJobSize() + ", cluster.mode=" + sb.getConfig(plasmaSwitchboard.CLUSTER_MODE, "") +
", robinsonMode=" + ((sb.isRobinsonMode()) ? "on" : "off"));
}
if (noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) == 0) {
//log.logDebug("CoreCrawl: queue is empty");
return false;
}
if (sb.sbQueue.size() >= (int) sb.getConfigLong(plasmaSwitchboard.INDEXER_SLOTS, 30)) {
log.logFine("CoreCrawl: too many processes in indexing queue, dismissed (" +
"sbQueueSize=" + sb.sbQueue.size() + ")");
log.logFine("CoreCrawl: too many processes in indexing queue, dismissed (" + "sbQueueSize=" + sb.sbQueue.size() + ")");
return false;
}
if (this.size() >= sb.getConfigLong(plasmaSwitchboard.CRAWLER_THREADS_ACTIVE_MAX, 10)) {
log.logFine("CoreCrawl: too many processes in loader queue, dismissed (" +
"cacheLoader=" + this.size() + ")");
log.logFine("CoreCrawl: too many processes in loader queue, dismissed (" + "cacheLoader=" + this.size() + ")");
return false;
}
if (sb.onlineCaution()) {
@ -203,107 +225,84 @@ public class plasmaCrawlQueues {
return true;
}
public boolean remoteCrawlLoaderJob() {
// check if we are allowed to crawl urls provided by other peers
if (!yacyCore.seedDB.mySeed().getFlagAcceptRemoteCrawl()) return false;
// check if we are a senior peer
if (!yacyCore.seedDB.mySeed().isActive()) return false;
// check local indexing queues
// in case the placing of a remote crawl fails, there must be space in the local queue to work off the remote crawl
if (sb.sbQueue.size() >= (int) sb.getConfigLong(plasmaSwitchboard.INDEXER_SLOTS, 30) * 2) {
log.logFine("LimitCrawl: too many processes in indexing queue, dismissed (" + "sbQueueSize=" + sb.sbQueue.size() + ")");
return false;
}
if (this.size() >= sb.getConfigLong(plasmaSwitchboard.CRAWLER_THREADS_ACTIVE_MAX, 10)) {
log.logFine("LimitCrawl: too many processes in loader queue, dismissed (" + "cacheLoader=" + this.size() + ")");
return false;
}
if (sb.onlineCaution()) {
log.logFine("LimitCrawl: online caution, omitting processing");
return false;
}
// check if we have an entry in the provider list, otherwise fill the list
yacySeed seed;
if ((remoteCrawlProviderHashes.size() == 0) && (remoteTriggeredCrawlJobSize() == 0)) {
if (yacyCore.seedDB != null && yacyCore.seedDB.sizeConnected() > 0) {
Iterator e = yacyCore.dhtAgent.getProvidesRemoteCrawlURLs();
while (e.hasNext()) {
seed = (yacySeed) e.next();
if (seed != null) {
remoteCrawlProviderHashes.add(seed.hash);
}
}
}
}
if (remoteCrawlProviderHashes.size() == 0) return false;
// take one entry from the provider list and load the entries from the remote peer
seed = null;
String hash = null;
while ((seed == null) && (remoteCrawlProviderHashes.size() > 0)) {
hash = (String) remoteCrawlProviderHashes.remove(remoteCrawlProviderHashes.size() - 1);
seed = yacyCore.seedDB.get(hash);
}
if (seed == null) return false;
// we know a peer which should provide remote crawl entries. load them now.
rssReader reader = yacyClient.queryRemoteCrawlURLs(seed, 10);
if (reader == null) return true;
// parse the rss and put the urls on the remote crawl stack
rssReader.Item item;
for (int i = 0; i < reader.items(); i++) {
item = reader.getItem(i);
//System.out.println("URL=" + item.getLink() + ", desc=" + item.getDescription() + ", pubDate=" + item.getPubDate());
// put url on remote crawl stack
yacyURL url;
try {
url = new yacyURL(item.getLink(), null);
} catch (MalformedURLException e) {
url = null;
}
if (url == null) continue; // skip malformed links
Date loaddate;
try {
loaddate = serverDate.parseShortSecondTime(item.getPubDate());
} catch (ParseException e) {
loaddate = new Date();
}
yacyURL referrer = null; // referrer needed!
if (sb.acceptURL(url)) {
// stack url
sb.getLog().logFinest("crawlOrder: stack: url='" + url + "'");
String reasonString = sb.crawlStacker.stackCrawl(url, referrer, hash, "REMOTE-CRAWLING", loaddate, 0, sb.defaultRemoteProfile);
if (reasonString == null) {
// done
log.logInfo("crawlOrder: added remote crawl url: " + url.toNormalform(true, false));
} else if (reasonString.startsWith("double")) {
// case where we have already loaded the url
log.logInfo("crawlOrder: ignored double remote crawl url: " + url.toNormalform(true, false));
} else {
log.logInfo("crawlOrder: ignored [" + reasonString + "] remote crawl url: " + url.toNormalform(true, false));
}
} else {
log.logWarning("crawlOrder: Received URL outside of our domain: " + url.toNormalform(true, false));
}
}
return true;
}
public int limitCrawlJobSize() {
return noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT);
}
public int remoteTriggeredCrawlJobSize() {
@ -399,108 +398,6 @@ public class plasmaCrawlQueues {
return;
}
private boolean processRemoteCrawlTrigger(plasmaCrawlEntry urlEntry) {
// if this returns true, then the urlEntry is considered as stored somewhere and the case is finished
// if this returns false, the urlEntry will be enqueued to the local crawl again
// wrong access
if (urlEntry == null) {
log.logInfo("REMOTECRAWLTRIGGER[" + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]: urlEntry=null");
return true; // superfluous request; returning true is correct here because the urlEntry shall not be tracked any more
}
// check url
if (urlEntry.url() == null) {
log.logFine("ERROR: plasmaSwitchboard.processRemoteCrawlTrigger - url is null. name=" + urlEntry.name());
return true; // same case as above: no more consideration
}
// are we qualified for a remote crawl?
if ((yacyCore.seedDB.mySeed() == null) || (yacyCore.seedDB.mySeed().isJunior())) {
log.logFine("plasmaSwitchboard.processRemoteCrawlTrigger: no permission");
return false; // no, we must crawl this page ourselves
}
// check if peer for remote crawl is available
yacySeed remoteSeed = ((sb.isPublicRobinson()) && (sb.getConfig("cluster.mode", "").equals("publiccluster"))) ?
yacyCore.dhtAgent.getPublicClusterCrawlSeed(urlEntry.url().hash(), sb.clusterhashes) :
yacyCore.dhtAgent.getGlobalCrawlSeed(urlEntry.url().hash());
if (remoteSeed == null) {
log.logFine("plasmaSwitchboard.processRemoteCrawlTrigger: no remote crawl seed available");
return false;
}
// do the request
HashMap page = yacyClient.crawlOrder(remoteSeed, urlEntry.url(), sb.getURL(urlEntry.referrerhash()), 6000);
if (page == null) {
log.logSevere(plasmaSwitchboard.STR_REMOTECRAWLTRIGGER + remoteSeed.getName() + " FAILED. URL CANNOT BE RETRIEVED from referrer hash: " + urlEntry.referrerhash());
return false;
}
// check if we got contact to the peer and the peer responded
if ((page == null) || (page.get("delay") == null)) {
log.logInfo("CRAWL: REMOTE CRAWL TO PEER " + remoteSeed.getName() + " FAILED. CAUSE: unknown (URL=" + urlEntry.url().toString() + "). Removed peer.");
yacyCore.peerActions.peerDeparture(remoteSeed, "remote crawl to peer failed; peer answered inappropriately");
return false; // no response from peer, we will crawl this ourself
}
String response = (String) page.get("response");
log.logFine("plasmaSwitchboard.processRemoteCrawlTrigger: remoteSeed="
+ remoteSeed.getName() + ", url=" + urlEntry.url().toString()
+ ", response=" + page.toString()); // DEBUG
// we received an answer and we are told to wait a specific time until we shall ask again for another crawl
int newdelay = Integer.parseInt((String) page.get("delay"));
yacyCore.dhtAgent.setCrawlDelay(remoteSeed.hash, newdelay);
if (response.equals("stacked")) {
// success, the remote peer accepted the crawl
log.logInfo(plasmaSwitchboard.STR_REMOTECRAWLTRIGGER + remoteSeed.getName()
+ " PLACED URL=" + urlEntry.url().toString()
+ "; NEW DELAY=" + newdelay);
// track this remote crawl
delegatedURL.newEntry(urlEntry, remoteSeed.hash, new Date(), 0, response).store();
return true;
}
// check other cases: the remote peer may respond that it already knows that url
if (response.equals("double")) {
// in case the peer answers double, it transmits the complete lurl data
String lurl = (String) page.get("lurl");
if ((lurl != null) && (lurl.length() != 0)) {
String propStr = crypt.simpleDecode(lurl, (String) page.get("key"));
indexURLEntry entry = sb.wordIndex.loadedURL.newEntry(propStr);
try {
sb.wordIndex.loadedURL.store(entry);
sb.wordIndex.loadedURL.stack(entry, yacyCore.seedDB.mySeed().hash, remoteSeed.hash, 1); // *** superfluous/duplicate?
// noticeURL.remove(entry.hash());
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
log.logInfo(plasmaSwitchboard.STR_REMOTECRAWLTRIGGER + remoteSeed.getName()
+ " SUPERFLUOUS. CAUSE: " + page.get("reason")
+ " (URL=" + urlEntry.url().toString()
+ "). URL IS CONSIDERED AS 'LOADED!'");
return true;
} else {
log.logInfo(plasmaSwitchboard.STR_REMOTECRAWLTRIGGER + remoteSeed.getName()
+ " REJECTED. CAUSE: bad lurl response / " + page.get("reason") + " (URL="
+ urlEntry.url().toString() + ")");
remoteSeed.setFlagAcceptRemoteCrawl(false);
yacyCore.seedDB.update(remoteSeed.hash, remoteSeed);
return false;
}
}
log.logInfo(plasmaSwitchboard.STR_REMOTECRAWLTRIGGER + remoteSeed.getName()
+ " DENIED. RESPONSE=" + response + ", CAUSE="
+ page.get("reason") + ", URL=" + urlEntry.url().toString());
remoteSeed.setFlagAcceptRemoteCrawl(false);
yacyCore.seedDB.update(remoteSeed.hash, remoteSeed);
return false;
}
public plasmaHTCache.Entry loadResourceFromWeb(
yacyURL url,
int socketTimeout,

@ -119,7 +119,6 @@ public class plasmaCrawlBalancer {
resetFileIndex();
}
private void openFileIndex() {
cacheStacksPath.mkdirs();
urlFileIndex = new kelondroCache(new kelondroFlexTable(cacheStacksPath, stackname + indexSuffix, -1, plasmaCrawlEntry.rowdef, true), true, false);

@ -348,18 +348,17 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// 60_remotecrawlloader
/**
* <p><code>public static final String <strong>CRAWLJOB_REMOTE_CRAWL_LOADER</strong> = "60_remotecrawlloader"</code></p>
* <p>Name of the remote crawl list loading thread</p>
*
* @see plasmaSwitchboard#CRAWLJOB_REMOTE_TRIGGERED_CRAWL
*/
public static final String CRAWLJOB_REMOTE_CRAWL_LOADER = "60_remotecrawlloader";
public static final String CRAWLJOB_REMOTE_CRAWL_LOADER_METHOD_START = "remoteCrawlLoaderJob";
public static final String CRAWLJOB_REMOTE_CRAWL_LOADER_METHOD_JOBCOUNT = null;
public static final String CRAWLJOB_REMOTE_CRAWL_LOADER_METHOD_FREEMEM = null;
public static final String CRAWLJOB_REMOTE_CRAWL_LOADER_IDLESLEEP = "60_remotecrawlloader_idlesleep";
public static final String CRAWLJOB_REMOTE_CRAWL_LOADER_BUSYSLEEP = "60_remotecrawlloader_busysleep";
// 62_remotetriggeredcrawl
/**
@ -1208,9 +1207,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
this.crawlJobsStatus.put(CRAWLJOB_REMOTE_TRIGGERED_CRAWL, new Object[]{
new Object(),
Boolean.valueOf(getConfig(CRAWLJOB_REMOTE_TRIGGERED_CRAWL + "_isPaused", "false"))});
this.crawlJobsStatus.put(CRAWLJOB_REMOTE_CRAWL_LOADER, new Object[]{
new Object(),
Boolean.valueOf(getConfig(CRAWLJOB_REMOTE_CRAWL_LOADER + "_isPaused", "false"))});
// init cookie-Monitor
this.log.logConfig("Starting Cookie Monitor");
@ -1340,8 +1339,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
new serverInstantThread(this, PROXY_CACHE_ENQUEUE_METHOD_START, PROXY_CACHE_ENQUEUE_METHOD_JOBCOUNT, PROXY_CACHE_ENQUEUE_METHOD_FREEMEM), 10000);
deployThread(CRAWLJOB_REMOTE_TRIGGERED_CRAWL, "Remote Crawl Job", "thread that performs a single crawl/indexing step triggered by a remote peer", null,
new serverInstantThread(crawlQueues, CRAWLJOB_REMOTE_TRIGGERED_CRAWL_METHOD_START, CRAWLJOB_REMOTE_TRIGGERED_CRAWL_METHOD_JOBCOUNT, CRAWLJOB_REMOTE_TRIGGERED_CRAWL_METHOD_FREEMEM), 30000);
deployThread(CRAWLJOB_REMOTE_CRAWL_LOADER, "Remote Crawl URL Loader", "thread that loads remote crawl lists from other peers", "",
new serverInstantThread(crawlQueues, CRAWLJOB_REMOTE_CRAWL_LOADER_METHOD_START, CRAWLJOB_REMOTE_CRAWL_LOADER_METHOD_JOBCOUNT, CRAWLJOB_REMOTE_CRAWL_LOADER_METHOD_FREEMEM), 30000);
deployThread(CRAWLJOB_LOCAL_CRAWL, "Local Crawl", "thread that performs a single crawl step from the local crawl queue", "/IndexCreateWWWLocalQueue_p.html",
new serverInstantThread(crawlQueues, CRAWLJOB_LOCAL_CRAWL_METHOD_START, CRAWLJOB_LOCAL_CRAWL_METHOD_JOBCOUNT, CRAWLJOB_LOCAL_CRAWL_METHOD_FREEMEM), 10000);
deployThread(SEED_UPLOAD, "Seed-List Upload", "task that a principal peer performs to generate and upload a seed-list to an FTP account", null,
@ -2639,18 +2638,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
thread.setIdleSleep(1000);
}
thread = getThread(CRAWLJOB_GLOBAL_CRAWL_TRIGGER);
if (thread != null) {
setConfig(CRAWLJOB_GLOBAL_CRAWL_TRIGGER_BUSYSLEEP , thread.setBusySleep(Math.max(1000, newBusySleep * 3)));
thread.setIdleSleep(10000);
}
/*
thread = getThread(CRAWLJOB_REMOTE_TRIGGERED_CRAWL);
if (thread != null) {
setConfig(CRAWLJOB_REMOTE_TRIGGERED_CRAWL_BUSYSLEEP , thread.setBusySleep(newBusySleep * 10));
thread.setIdleSleep(10000);
}
*/
thread = getThread(PROXY_CACHE_ENQUEUE);
if (thread != null) {
setConfig(PROXY_CACHE_ENQUEUE_BUSYSLEEP , thread.setBusySleep(0));

@ -26,6 +26,8 @@
package de.anomic.xml;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
@ -38,6 +40,9 @@ import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
import de.anomic.server.serverByteBuffer;
import de.anomic.server.logging.serverLog;
public class rssReader extends DefaultHandler {
// statics for item generation and automatic categorization
@ -72,17 +77,7 @@ public class rssReader extends DefaultHandler {
private HashMap items; // a guid:Item map
public rssReader() {
itemsGUID = new ArrayList();
items = new HashMap();
buffer = new StringBuffer();
@ -93,7 +88,8 @@ public class rssReader extends DefaultHandler {
parsingItem = false;
}
public rssReader(String path) {
this();
try {
SAXParserFactory factory = SAXParserFactory.newInstance();
SAXParser saxParser = factory.newSAXParser();
@ -103,7 +99,8 @@ public class rssReader extends DefaultHandler {
}
}
public rssReader(InputStream stream) {
this();
try {
SAXParserFactory factory = SAXParserFactory.newInstance();
SAXParser saxParser = factory.newSAXParser();
@ -112,6 +109,42 @@ public class rssReader extends DefaultHandler {
e.printStackTrace();
}
}
public static rssReader parse(byte[] a) {
// check integrity of array
if ((a == null) || (a.length == 0)) {
serverLog.logWarning("rssReader", "response=null");
return null;
}
if (a.length < 100) {
serverLog.logWarning("rssReader", "response=" + new String(a));
return null;
}
if (!serverByteBuffer.equals(a, "<?xml".getBytes())) {
serverLog.logWarning("rssReader", "response does not contain valid xml");
return null;
}
String end = new String(a, a.length - 10, 10);
if (end.indexOf("rss") < 0) {
serverLog.logWarning("rssReader", "response incomplete");
return null;
}
// make input stream
ByteArrayInputStream bais = new ByteArrayInputStream(a);
// parse stream
rssReader reader = null;
try {
reader = new rssReader(bais);
} catch (Exception e) {
serverLog.logWarning("rssReader", "parse exception: " + e);
return null;
}
try { bais.close(); } catch (IOException e) {}
return reader;
}
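/*
Usage sketch, assuming a feed saved in a file named urls.xml (hypothetical):
read the raw response bytes and walk the items. parse() returns null unless the
array starts with "<?xml", is at least 100 bytes long and carries the closing
rss tag near its end.

import java.io.ByteArrayOutputStream;
import java.io.FileInputStream;

FileInputStream fis = new FileInputStream("urls.xml");
ByteArrayOutputStream bos = new ByteArrayOutputStream();
int b;
while ((b = fis.read()) != -1) bos.write(b);
fis.close();
rssReader reader = rssReader.parse(bos.toByteArray());
if (reader != null) {
    for (int i = 0; i < reader.items(); i++) {
        rssReader.Item item = reader.getItem(i);
        System.out.println(item.getLink() + " (" + item.getPubDate() + ")");
    }
}
*/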
public void startElement(String uri, String name, String tag, Attributes atts) throws SAXException {
if ("channel".equals(tag)) {

@ -72,6 +72,7 @@ import de.anomic.server.serverDomains;
import de.anomic.server.serverObjects;
import de.anomic.tools.crypt;
import de.anomic.tools.nxTools;
import de.anomic.xml.rssReader;
public final class yacyClient {
@ -332,6 +333,46 @@ public final class yacyClient {
}
}
public static rssReader queryRemoteCrawlURLs(yacySeed target, int count) {
// returns a list of remote crawl URLs from the target peer, delivered as an RSS feed
if (target == null) { return null; }
if (yacyCore.seedDB.mySeed() == null) return null;
// prepare request
final serverObjects post = yacyNetwork.basicRequestPost(plasmaSwitchboard.getSwitchboard(), target.hash);
post.put("call", "remotecrawl");
post.put("count", count);
// send request
try {
final byte[] result =
httpc.wput(new yacyURL("http://" + target.getClusterAddress() + "/yacy/urls.xml", null),
target.getHexHash() + ".yacyh",
60000, /* a long time-out is needed */
null,
null,
proxyConfig(),
post,
null
);
rssReader reader = rssReader.parse(result);
if (reader == null) {
// case where the rss reader does not understand the content
yacyCore.log.logWarning("yacyClient.queryRemoteCrawlURLs failed asking peer '" + target.getName() + "': probably bad response from remote peer");
System.out.println("***DEBUG*** rss input = " + new String(result));
target.put(yacySeed.RCOUNT, "0");
yacyCore.seedDB.update(target.hash, target); // reset the reported number of available remote crawl URLs so that this peer is not asked again (until the value is refreshed by peer ping)
return null;
}
return reader;
} catch (IOException e) {
yacyCore.log.logSevere("yacyClient.queryRemoteCrawlURLs error asking peer '" + target.getName() + "':" + e.toString());
return null;
}
}
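/*
Caller's view (sketch of what plasmaCrawlQueues.remoteCrawlLoaderJob does with
this method): pick a provider seed, ask for a small batch, print the links.

rssReader reader = yacyClient.queryRemoteCrawlURLs(seed, 10);
if (reader != null) {
    for (int i = 0; i < reader.items(); i++) {
        System.out.println(reader.getItem(i).getLink());
    }
}
*/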
public static String[] search(
String wordhashes,
String excludehashes,
@ -748,62 +789,6 @@ public final class yacyClient {
}
return "wrong protocol: " + protocol;
}
public static HashMap crawlOrder(yacySeed targetSeed, yacyURL url, yacyURL referrer, int timeout) {
return crawlOrder(targetSeed, new yacyURL[]{url}, new yacyURL[]{referrer}, timeout);
}
public static HashMap crawlOrder(yacySeed target, yacyURL[] url, yacyURL[] referrer, int timeout) {
assert (target != null);
assert (yacyCore.seedDB.mySeed() != null);
assert (yacyCore.seedDB.mySeed() != target);
// prepare request
final serverObjects post = yacyNetwork.basicRequestPost(plasmaSwitchboard.getSwitchboard(), target.hash);
post.put("process", "crawl");
if (url.length == 1) {
post.put("url", crypt.simpleEncode(url[0].toNormalform(true, true)));
post.put("referrer", crypt.simpleEncode((referrer[0] == null) ? "" : referrer[0].toNormalform(true, true)));
} else {
for (int i=0; i< url.length; i++) {
post.put("url" + i, crypt.simpleEncode(url[i].toNormalform(true, true)));
post.put("ref" + i, crypt.simpleEncode((referrer[i] == null) ? "" : referrer[i].toNormalform(true, true)));
}
}
post.put("depth", "0");
post.put("ttl", "0");
// determining target address
final String address = target.getClusterAddress();
if (address == null) { return null; }
// send request
try {
final HashMap result = nxTools.table(
httpc.wput(new yacyURL("http://" + address + "/yacy/crawlOrder.html", null),
target.getHexHash() + ".yacyh",
timeout,
null,
null,
proxyConfig(),
post,
null
), "UTF-8"
);
return result;
} catch (Exception e) {
// most probably a network time-out exception
yacyCore.log.logSevere("yacyClient.crawlOrder error: peer=" + target.getName() + ", error=" + e.getMessage());
return null;
}
}
/*
Test:
http://217.234.95.114:5777/yacy/crawlOrder.html?key=abc&iam=S-cjM67KhtcJ&youare=EK31N7RgRqTn&process=crawl&referrer=&depth=0&url=p|http://www.heise.de/newsticker/meldung/53245
version=0.297 uptime=225 accepted=true reason=ok delay=30 depth=0
- it crawls, but the result appears under the wrong initiator
*/
public static HashMap crawlReceipt(yacySeed target, String process, String result, String reason, indexURLEntry entry, String wordhashes) {
assert (target != null);

@ -124,6 +124,53 @@ public class yacyDHTAction implements yacyPeerAction {
}
}
public Iterator getProvidesRemoteCrawlURLs() {
return new providesRemoteCrawlURLsEnum();
}
class providesRemoteCrawlURLsEnum implements Iterator {
Iterator se;
yacySeed nextSeed;
public providesRemoteCrawlURLsEnum() {
se = getDHTSeeds(true, null, yacyVersion.YACY_PROVIDES_REMOTECRAWL_LISTS);
nextSeed = nextInternal();
}
public boolean hasNext() {
return nextSeed != null;
}
private yacySeed nextInternal() {
yacySeed s;
try {
while (se.hasNext()) {
s = (yacySeed) se.next();
if (s == null) return null;
if (s.getLong(yacySeed.RCOUNT, 0) > 0) return s;
}
} catch (kelondroException e) {
System.out.println("DEBUG providesRemoteCrawlURLsEnum:" + e.getMessage());
yacyCore.log.logSevere("database inconsistency (" + e.getMessage() + "), re-set of db.");
seedDB.resetActiveTable();
return null;
}
return null;
}
public Object next() {
yacySeed next = nextSeed;
nextSeed = nextInternal();
return next;
}
public void remove() {
throw new UnsupportedOperationException();
}
}
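/*
Usage sketch: plasmaCrawlQueues.remoteCrawlLoaderJob fills its provider list
from this enumeration, keeping only peers whose seed advertises remote crawl
URLs (RCOUNT > 0).

Iterator provider = yacyCore.dhtAgent.getProvidesRemoteCrawlURLs();
while (provider.hasNext()) {
    yacySeed seed = (yacySeed) provider.next();
    if (seed != null) remoteCrawlProviderHashes.add(seed.hash);
}
*/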
public Iterator getAcceptRemoteIndexSeeds(String starthash) {
// returns an enumeration of yacySeed-Objects
// that have the AcceptRemoteIndex-Flag set

@ -53,7 +53,7 @@ public final class yacyVersion implements Comparator, Comparable {
public static final float YACY_SUPPORTS_GZIP_POST_REQUESTS = (float) 0.40300772;
public static final float YACY_ACCEPTS_RANKING_TRANSMISSION = (float) 0.414;
public static final float YACY_HANDLES_COLLECTION_INDEX = (float) 0.486;
public static final float YACY_PROVIDES_REMOTECRAWL_LISTS = (float) 0.550;
// information about latest release, retrieved by other peers release version
public static double latestRelease = 0.1; // this value is overwritten when a peer with later version appears

@ -558,12 +558,12 @@ filterOutStopwordsFromTopwords=true
50_localcrawl_busysleep__pro=100
50_localcrawl_memprereq=4194304
50_localcrawl_isPaused=false
60_remotecrawlloader_idlesleep=10000
60_remotecrawlloader_busysleep=2000
60_remotecrawlloader_memprereq=2097152
60_remotecrawlloader_isPaused=false
62_remotetriggeredcrawl_idlesleep=10000
62_remotetriggeredcrawl_busysleep=500
62_remotetriggeredcrawl_memprereq=6291456
62_remotetriggeredcrawl_isPaused=false
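# Note on the timing values above (milliseconds, as used by the server threads):
# busysleep is the pause between two invocations of a job while it still has
# work to do, idlesleep the pause while its queue is empty. The remote crawl
# loader thus polls a providing peer every 2 seconds while remote URLs are
# available.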
70_cachemanager_idlesleep=1000
