- instead of pushing URLs to other peers, the URLs are now actively pulled by the peer that wants to do a remote crawl - the remote crawl push process has been removed - a process that adds URLs from remote peers has been added - the server-side interface for providing 'limit'-URLs has existed since 0.55 and works with this version - the list-interface has been removed - servlets using the list-interface have been removed (this implementation did not properly manage double-check) - changes in the configuration file to support the new pull-process - fixed a bug in the crawl balancer (status was not saved/closed properly) - the yacy/urls-protocol was extended to support different networks/clusters - many interface adaptations to the new stack counters git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4232 6c8d7289-2bf4-0310-a012-ef5d649a1542pull/1/head
@ -1,68 +0,0 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<title>YaCy '#[clientname]#': URL Fetcher Stack Management</title>
<body id="CrawlURLFetchStack_p">
<h2>Manage stack for remote URL fetches</h2>
#(addedUrls)#::<span class="success">Added #[added]# URLs!</span>#(/addedUrls)#
<form method="post" action="CrawlURLFetchStack_p.html" enctype="multipart/form-data">
<dt>Currently stacked URLs:</dt><dd>#[urlCount]#</dd>
<dt>Totally fetched / added URLs:</dt><dd>#[totalFetched]# / #[totalAdded]#</dd>
<dt>Fetched from #[peer]#</dt><dd>#[amount]#</dd>#{/peers}#
<dt><label for="maxSize">Maximum URLs for each transfer</label>:</dt>
<input type="text" name="maxSize" id="maxSize" value="#[maxSize]#" maxlength="3" size="3" />
<input type="submit" name="setMaxSize" value="Set" />
<span class="success">Set max. size for each transfer to #[value]#</span>::
<span class="error">Setting max. size for each transfer to #[value]# was unsuccessful: may not be negative</span>#(/set)#
<fieldset><legend>Add URLs to stack</legend>
<dt><label for="shiftloc">Shift URLs from Local Crawler</label>:</dt>
<input type="text" name="shiftloc" id="shiftloc" value="#[locurlsVal]#" size="5" maxlength="5" style="text-align: right;" />
of <span class="tt">#[locurls]#</span> URLs
<input type="submit" name="shiftlcq" value="Shift" />#(shiftloc)#::
<span class="success">Shifted #[value]# URLs from Local Crawler Queue to URL Fetcher Stack (not bound: #[failed]#)</span>#(/shiftloc)#
<dt><label for="shiftrem">Shift URLs from Remote Crawler</label>:</dt>
<input type="text" name="shiftrem" id="shiftrem" value="#[remurlsVal]#" size="5" maxlength="5" style="text-align: right;" />
of <span class="tt">#[remurls]#</span> URLs
<input type="submit" name="shiftrcq" value="Shift" />#(shiftrem)#::
<span class="success">Shifted #[value]# URLs from Remote Crawler Queue to URL Fetcher Stack (not bound: #[failed]#)</span>#(/shiftrem)#
<dt><label for="upload">Upload URL-List</label>:</dt>
<input type="file" name="upload" id="upload" /> #(uploadError)#:: <span class="error">No file entered for upload</span>#(/uploadError)#<br />
<input type="radio" name="uploadType" id="plain" value="plain" checked="checked" /> <label for="plain">Plain text, line-seperated</label><br />
<input type="radio" name="uploadType" id="html" value="html" /> <label for="html">HTML file, links will be added</label><br />
<input type="checkbox" name="blacklistCheck" id="blacklistCheck" checked="checked" /> <label for="blacklistCheck">Don't add URLs matching blacklists active for crawler</label><br />
<input type="submit" name="subupload" value="Upload File" />
<span class="success">Added #[added]# and rejected #[failed]# URLs from uploaded file successfully</span>::
<span class="error">An internal error occured processing the uploaded file: #[error]#</span>#(/upload)#
@ -1,299 +0,0 @@
// CrawlURLFetchStack_p.java
// -------------------------------------
// part of YACY
// (C) 2007 by Franz Brausze
// last change: $LastChangedDate: $ by $LastChangedBy: $
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this softare or this documentation. The usage of this software
// is on your own risk. The installation and usage (starting/running) of this
// software may allow other people or application to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follows this copyright notice here, but changes must not be
// done inside the copyright notive above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.PrintWriter;
import java.io.Writer;
import java.net.MalformedURLException;
import java.util.HashMap;
import java.util.Iterator;
import de.anomic.data.URLFetcherStack;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterWriter;
import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaCrawlEntry;
import de.anomic.plasma.plasmaCrawlNURL;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.urlPattern.plasmaURLPattern;
import de.anomic.server.serverFileUtils;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacyURL;
/**
 * Request handler for the URL fetcher stack page. All state is kept in static fields.
 */
public class CrawlURLFetchStack_p {
// Map keyed by peer name; putFetched() reads the values as Integer.
public static final HashMap /* of PeerName, sent URLs */ fetchMap = new HashMap();
// Created on first use by getURLFetcherStack(); stays null if creation failed.
private static URLFetcherStack stack = null;
// Starts at 50; respond() overwrites it when a positive "maxSize" is posted together with "setMaxSize".
public static int maxURLsPerFetch = 50;
public static URLFetcherStack getURLFetcherStack(serverSwitch env) {
if (stack == null) try {
stack = new URLFetcherStack(env.getConfigPath(plasmaSwitchboard.DBPATH, plasmaSwitchboard.DBPATH_DEFAULT));
} catch (IOException e) {
serverLog.logSevere("URLFETCHER", "Couldn't initialize URL stack: " + e.getMessage());
return stack;
// Tokens of the line-based ".stream" protocol that respond() parses.
// NOTE(review): respond() also references STREAM_CMD_ADDURLSBLCHK_, which is not declared
// among the constants visible here -- confirm that it is defined.
// Command prefix; the rest of the line is parsed as the number of URL lines that follow.
public static final String STREAM_CMD_ADDURLS_ = "ADD URLS: ";
// Command line compared with equals() in respond().
public static final String STREAM_CMD_END = "END";
// Despite the OK in its name the text reads "FAILED URLS: "; the comment in respond()
// gives 'FAILED URLS: 5 of 8' as an example of the status line.
public static final String STREAM_RESP_OK_ADDURLS_ = "FAILED URLS: ";
public static final String STREAM_RESP_OK = "OK";
public static final String STREAM_RESP_FAILED = "FAILED";
/**
 * Serves the URL fetcher stack page.
 * <p>
 * Requests whose path ends in ".stream" are treated as a line-based protocol read from the
 * connection's input stream; that branch returns null. Every other request evaluates the
 * posted form action (if any) and returns the properties for the page template.
 *
 * @param header request header; also carries the request path and the raw connection streams
 * @param post   posted form values, may be null
 * @param env    server environment, cast to plasmaSwitchboard
 * @return the template properties, or null for ".stream" requests
 */
public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) {
final serverObjects prop = new serverObjects();
plasmaSwitchboard sb = (plasmaSwitchboard)env;
if (((String)header.get(httpHeader.CONNECTION_PROP_PATH)).endsWith(".stream")) {
/* =================================================================
* .stream request
* ================================================================= */
InputStream in = (InputStream)header.get(httpHeader.CONNECTION_PROP_INPUTSTREAM);
OutputStream out = (OutputStream)header.get(httpHeader.CONNECTION_PROP_OUTPUTSTREAM);
// NOTE(review): no charset is given, so the platform default is used for decoding.
BufferedReader inrb = new BufferedReader(new InputStreamReader(in));
PrintWriter outw = new PrintWriter(out);
String line;
// addurls: number of URL lines announced by the last command; cururl: how many were read so far
int addurls = 0, cururl = 0;
// status[i] is the result of addURL() for the i-th URL line of the current batch
boolean[] status = new boolean[0];
boolean blchk = false;
// local variable shadowing the static field of the same name
URLFetcherStack stack = getURLFetcherStack(env);
try {
while ((line = inrb.readLine()) != null) {
// commands
if (line.startsWith(STREAM_CMD_ADDURLS_)) {
try {
addurls = Integer.parseInt(line.substring(STREAM_CMD_ADDURLS_.length()));
status = new boolean[addurls];
cururl = 0;
blchk = false;
} catch (NumberFormatException e) {
// same as above, but the following URLs are checked against the crawler blacklist
} else if (line.startsWith(STREAM_CMD_ADDURLSBLCHK_)) {
try {
addurls = Integer.parseInt(line.substring(STREAM_CMD_ADDURLSBLCHK_.length()));
status = new boolean[addurls];
cururl = 0;
blchk = true;
} catch (NumberFormatException e) {
} else if (line.equals(STREAM_CMD_END)) {
} else {
if (cururl < addurls) // add url
status[cururl++] = addURL(line, blchk, stack);
if (cururl > 0 && cururl == addurls ) {
// done with parsing the passed URL count, now some status output: i.e. 'FAILED URLS: 5 of 8'
StringBuffer stat = new StringBuffer();
for (int i=0; i<status.length; i++)
if (!status[i]) stat.append(i).append(", ");
// NOTE(review): if no URL failed, stat is empty and substring(0, -2) throws
// StringIndexOutOfBoundsException -- confirm and guard.
outw.print(stat.substring(0, stat.length() - 2));
outw.print(" of ");
cururl = 0;
addurls = 0;
} catch (IOException e) { e.printStackTrace(); }
return null;
/* =================================================================
* 'normal' request
* ================================================================= */
if (post != null) {
if (post.containsKey("addurls")) {
prop.put("addedUrls", "1");
prop.put("addedUrls_added", addURLs(post, post.getInt("addurls", -1), getURLFetcherStack(env)));
else if (post.containsKey("setMaxSize")) {
final int count = post.getInt("maxSize", maxURLsPerFetch);
// only positive values are accepted; 0 and negative values go to the error case
if (count > 0) {
maxURLsPerFetch = count;
prop.put("set", "1");
prop.put("set_value", maxURLsPerFetch);
} else {
prop.put("set", "2");
prop.put("set_value", count);
else if (post.containsKey("shiftlcq")) {
// the requested amount is limited to what the core stack currently holds
final int count = Math.min(post.getInt("shiftloc", 0), sb.crawlQueues.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE));
final int failed = shiftFromNotice(sb.crawlQueues.noticeURL, plasmaCrawlNURL.STACK_TYPE_CORE, getURLFetcherStack(env), count);
prop.put("shiftloc", "1");
prop.put("shiftloc_value", count - failed);
prop.put("shiftloc_failed", failed);
else if (post.containsKey("shiftrcq")) {
// NOTE(review): unlike the shiftlcq branch, the amount is not limited to the size
// of the limit stack here -- confirm whether that is intended.
final int count = post.getInt("shiftrem", 0);
final int failed = shiftFromNotice(sb.crawlQueues.noticeURL, plasmaCrawlNURL.STACK_TYPE_LIMIT, getURLFetcherStack(env), count);
prop.put("shiftrem", "1");
prop.put("shiftrem_value", count - failed);
prop.put("shiftrem_failed", failed);
else if (post.containsKey("subupload")) {
if (post.get("upload", "").length() == 0) {
prop.put("uploadError", "1");
} else {
final File file = new File(post.get("upload", ""));
// NOTE(review): the uploaded bytes are decoded with the platform default charset.
final String content = new String((byte[])post.get("upload$file"));
final String type = post.get("uploadType", "");
final boolean blCheck = post.containsKey("blacklistCheck");
if (type.equals("plain")) {
// one URL per line
prop.put("upload_added", addURLs(content.split("\n"), blCheck, getURLFetcherStack(env)));
prop.put("upload_failed", "0");
prop.put("upload", "1");
} else if (type.equals("html")) {
try {
// run the uploaded HTML through the scraper and iterate over the anchors it collected
final htmlFilterContentScraper scraper = new htmlFilterContentScraper(new yacyURL(file));
final Writer writer = new htmlFilterWriter(null, null, scraper, null, false);
serverFileUtils.write(content, writer);
final Iterator it = ((HashMap)scraper.getAnchors()).keySet().iterator();
int added = 0, failed = 0;
yacyURL url;
while (it.hasNext()) try {
url = new yacyURL((String) it.next(), null);
if (blCheck && plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_CRAWLER, url)) {
} catch (MalformedURLException e) { failed++; }
prop.put("upload", "1");
prop.put("upload_added", added);
prop.put("upload_failed", failed);
} catch (Exception e) {
prop.put("upload", "2");
prop.putHTML("upload_error", e.getMessage());
// values shown on the page regardless of the posted action
prop.put("urlCount", getURLFetcherStack(env).size());
prop.put("totalFetched", getURLFetcherStack(env).getPopped());
prop.put("totalAdded", getURLFetcherStack(env).getPushed());
prop.put("maxSize", maxURLsPerFetch);
prop.put("locurls", sb.crawlQueues.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE));
prop.put("remurls", sb.crawlQueues.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT));
// pre-filled shift amounts: the current stack size, but at most 500
prop.put("locurlsVal", Math.min(sb.crawlQueues.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE), 500));
prop.put("remurlsVal", Math.min(sb.crawlQueues.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT), 500));
return prop;
private static void putFetched(serverObjects prop) {
Iterator it = fetchMap.keySet().iterator();
int count = 0;
while (it.hasNext()) {
String key = (String)it.next();
prop.putHTML("peers_" + count + "_peer", key);
prop.put("peers_" + count + "_amount", ((Integer)fetchMap.get(key)).intValue());
prop.put("peers", count);
private static int addURLs(String[] urls, boolean blCheck, URLFetcherStack stack) {
int count = -1;
for (int i=0; i<urls.length; i++)
if (addURL(urls[i], blCheck, stack)) count++;
return count;
private static boolean addURL(String url, boolean blCheck, URLFetcherStack stack) {
try {
if (url == null || url.length() == 0) return false;
yacyURL u = new yacyURL(url, null);
if (blCheck && plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_CRAWLER, u)) return false;
return true;
} catch (MalformedURLException e) { return false; }
/**
 * Pops up to count entries from the given notice-URL stack type and returns how many
 * of those pops failed with an IOException.
 * NOTE(review): in the code visible here the popped entry is assigned but never used and
 * the stack parameter is never touched, so nothing is transferred -- confirm whether a
 * push of the entry's URL onto stack is missing.
 */
private static int shiftFromNotice(plasmaCrawlNURL nurl, int fromStackType, URLFetcherStack stack, int count) {
plasmaCrawlEntry entry;
int failed = 0;
for (int i=0; i<count; i++) try {
entry = nurl.pop(fromStackType, false);
} catch (IOException e) { failed++; }
return failed;
private static int addURLs(serverObjects post, int amount, URLFetcherStack stack) {
int count = 0;
String url;
for (int i=0; i<amount; i++) {
url = post.get("url" + i, null);
if (url == null || url.length() == 0) continue;
try {
stack.push(new yacyURL(url, null));
} catch (MalformedURLException e) {
serverLog.logInfo("URLFETCHER", "retrieved invalid url for adding to the stack: " + url);
return count;
@ -1,107 +0,0 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<title>YaCy '#[clientname]#': URL Fetcher Management</title>
<body id="CrawlURLFetch_p">
<form method="post" action="CrawlURLFetch_p.html" enctype="multipart/form-data">
<fieldset><legend>Fetch new URLs to crawl</legend>
The newly added URLs will be crawled without any filter restricions except of the <em>static</em> stop-words.
The Re-Crawl option isn't used and the sites won't be stored in the Proxy Cache. Text and media types will be indexed.
Since these URLs will be requested explicitely from another peer, they won't be distributed for remote indexing.
<dt><label for="url">Fetch from URL</label>:</dt>
<input type="radio" name="source" value="url" id="url" checked="checked" />
<input type="text" id="host" name="host" size="60" value="#[host]#" />
#(hostError)#:: <span class="error">Malformed URL</span>#(/hostError)#
<dt><label for="savedURL">Or select previously entered URL</label>:</dt>
<input type="radio" name="source" id="savedURL" value="saved" />
<select name="saved">#{urls}#
<dt><label for="peer">Fetch from Peer</label>:</dt>
<input type="radio" name="source" value="peer" id="peer" />
<select name="peerhash">
<option value="random" selected="selected">Choose a random peer</option>#{peers}#
<option value="#[hash]#">#[name]#</option>#{/peers}#
<input type="submit" name="checkPeerURLCount" value="Check URL count" />
<label for="amount">Amount of URLs to request</label>:
<input type="text" name="amount" id="amount" value="50" maxlength="3" size="3" />
<span class="error">Error fetching URL-list from <span class="tt">#[hash]#:#[name]#</span></span>::
<span class="error">Peer with hash <span class="tt">#[hash]#</span> doesn't seem to be online anymore</span>#(/peerError)#
<input type="radio" name="reg" value="once" id="once" checked="checked" /> <label for="once">Fetch only once</label><br />
<input type="radio" name="reg" value="self_det" id="self_det" disabled="disabled"/> <label for="self_det">Fetch when queue is empty</label><br />
<input type="radio" name="reg" value="delay" id="delay" /> <label for="delay">Fetch in a specified delay</label>:
<label for="frequency">every</label>
<input type="text" name="frequency" id="frequency" size="2" style="text-align: right;" maxlength="2"/>
<select name="freq_type">
<option value="days">Days</option>
<option value="hours" selected="selected">Hours</option>
<option value="minutes">Minutes</option>
#(freqError)#:: <span class="error">Invalid period, fetching only once</span>#(/freqError)#
<dt><input type="submit" name="start" value="Fetch URLs" /></dt>
<span class="error">Error on stopping thread, it isn't alive anymore</span>::
<span class="error">Error on restarting thread, it isn't alive anymore</span>#(/threadError)#
<form method="post" action="CrawlURLFetch_p.html" enctype="multipart/form-data">
<fieldset><legend>Thread to fetch URLs is #(status)#running::stopped::paused#(/status)#</legend>
<dt>Total runs:</dt><dd>#[totalRuns]#</dd>
<dt>Total fetched URLs:</dt><dd>#[totalFetchedURLs]#</dd>
<dt>Total failed URLs:</dt><dd>#[totalFailedURLs]#</dd>
<dt>Last run duration:</dt><dd>#[lastRun]# ms</dd>
<dt>Last server response:</dt><dd>#[lastServerResponse]#</dd>
<dt>Last fetched URLs:</dt><dd>#[lastFetchedURLs]#</dd>
<dt>Last failed URLs:</dt>
<li><span class="error">#[reason]#</span>: <a href="#[url]#">#[url]#</a></li>#{/error}#
<dt><label for="newDelay">Re-set delay</label>:</dt>
<input type="text" name="newDelay" id="newDelay" maxlength="2" size="6" value="#[curDelay]#" style="text-align: right;" /> minutes
<input type="submit" name="resetDelay" value="Set new delay" />
<input type="submit" name="stop" value="Stop Thread" />::
<input type="submit" name="restart" value="Restart Thread" />::
<input type="submit" name="stop" value="Stop Thread" />
<input type="submit" name="restart" value="Restart Thread" />#(/status)#
@ -1,543 +0,0 @@
// CrawlURLFetch_p.java
// -------------------------------------
// part of YACY
// (C) 2007 by Franz Brausze
// last change: $LastChangedDate: $ by $LastChangedBy: $
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this softare or this documentation. The usage of this software
// is on your own risk. The installation and usage (starting/running) of this
// software may allow other people or application to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follows this copyright notice here, but changes must not be
// done inside the copyright notive above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Random;
import java.util.TreeMap;
import de.anomic.plasma.plasmaCrawlProfile;
import de.anomic.plasma.plasmaCrawlZURL;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverByteBuffer;
import de.anomic.server.serverSwitch;
import de.anomic.http.httpHeader;
import de.anomic.http.httpRemoteProxyConfig;
import de.anomic.http.httpc;
import de.anomic.server.serverObjects;
import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacySeed;
import de.anomic.yacy.yacyURL;
import de.anomic.yacy.yacyVersion;
/**
 * Request handler for the URL fetcher page. It owns a single static URLFetcher thread
 * and the list of URLs entered so far.
 */
public class CrawlURLFetch_p {
// Values written by respond() into the template switches "freqError", "hostError",
// "peerError" and "threadError".
private static final long ERR_DATE = 1;
private static final long ERR_HOST_MALFORMED_URL = 1;
private static final long ERR_PEER_GENERAL_CONN = 1;
private static final long ERR_PEER_OFFLINE = 2;
private static final long ERR_THREAD_STOP = 1;
private static final long ERR_THREAD_RESUME = 2;
// Thread states reported by respond().
private static final long STAT_THREAD_ALIVE = 0;
private static final long STAT_THREAD_STOPPED = 1;
private static final long STAT_THREAD_PAUSED = 2;
// The one fetcher thread managed by this page; null until a fetch was started.
private static URLFetcher fetcher = null;
// Created on first use by getCrawlProfile().
private static plasmaCrawlProfile.entry profile = null;
// Normal forms of the URLs entered on the page; filled by respond(), listed by listURLs().
private static ArrayList savedURLs = new ArrayList();
public static plasmaCrawlProfile.entry getCrawlProfile(serverSwitch env) {
if (profile == null) {
profile = ((plasmaSwitchboard)env).profilesActiveCrawls.newEntry(
"URLFetcher", // Name
null, // URL
".*", ".*", // General / specific filter
0, 0, // General / specific depth
-1, -1, -1, // Recrawl / Dom-filter depth / Dom-max-pages
true, // Crawl query
true, true, // Index text / media
false, true, // Store in HT- / TX-Cache
false, // Remote indexing
true, false, false); // Exclude static / dynamic / parent stopwords
return profile;
/**
 * Serves the URL fetcher page: evaluates the posted action ("start", "stop", "restart",
 * "resetDelay") against the static fetcher thread and fills the status properties.
 *
 * @param header request header
 * @param post   posted form values, may be null
 * @param env    server environment
 * @return the template properties
 */
public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) {
serverObjects prop = new serverObjects();
prop.put("host", "");
// List previously saved URLs for easy selection
// List known hosts
// NOTE(review): the following line is an argument fragment; the call it belongs to is
// not visible here.
post != null && post.containsKey("checkPeerURLCount"),
if (post != null) {
if (post.containsKey("start")) {
// default: a single fetch
long frequency = URLFetcher.DELAY_ONCE;
if (post.containsKey("reg")) {
if (post.get("reg", "").equals("self_det")) {
frequency = URLFetcher.DELAY_SELF_DET;
} else if (post.get("reg", "").equals("delay")) {
// getDate() returns -1 for an invalid period, which equals DELAY_ONCE
frequency = getDate(post.get("frequency", ""), post.get("freq_type", ""));
if (frequency == -1)
prop.put("freqError", ERR_DATE);
// number of URLs to request: default 50, capped at 999
int count = 50;
if (post.get("amount", "").matches("\\d+")) {
count = Integer.parseInt(post.get("amount", ""));
if (count > 999) count = 999;
// a previously started fetcher is interrupted before a new one is created
if (fetcher != null) fetcher.interrupt();
fetcher = null;
if (post.get("source", "").equals("peer") &&
post.get("peerhash", "").equals("random")) {
fetcher = new URLFetcher(
} else {
yacyURL url = null;
if (post.get("source", "").equals("url")) {
try {
url = new yacyURL(post.get("host", null), null);
// remember the normal form once, for the "previously entered URL" selection
if (!savedURLs.contains(url.toNormalform(true, true)))
savedURLs.add(url.toNormalform(true, true));
prop.put("host", post.get("host", url.toString()));
} catch (MalformedURLException e) {
prop.put("host", post.get("host", ""));
prop.put("hostError", ERR_HOST_MALFORMED_URL);
// NOTE(review): the radio button in CrawlURLFetch_p.html has id "savedURL" but submits
// value "saved"; this comparison with "savedURL" may therefore never match -- confirm.
} else if (post.get("source", "").equals("savedURL")) {
try {
url = new yacyURL(post.get("saved", ""), null);
} catch (MalformedURLException e) {
/* should never appear, except for invalid input, see above */
} else if (post.get("source", "").equals("peer")) {
yacySeed ys = null;
ys = yacyCore.seedDB.get(post.get("peerhash", null));
if (ys != null) {
if ((url = URLFetcher.getListServletURL(
yacyCore.seedDB.mySeed().hash)) == null) {
prop.put("peerError", ERR_PEER_GENERAL_CONN);
prop.put("peerError_hash", post.get("peerhash", ""));
prop.put("peerError_name", ys.getName());
} else {
// no seed found for the posted hash
prop.put("peerError", ERR_PEER_OFFLINE);
prop.put("peerError_hash", post.get("peerhash", ""));
if (url != null) {
fetcher = new URLFetcher(
if (fetcher != null) fetcher.start();
else if (post.containsKey("stop")) {
if (fetcher != null) {
} else {
prop.put("threadError", ERR_THREAD_STOP);
else if (post.containsKey("restart")) {
if (fetcher != null) {
// a fetcher without a fixed URL is one that picks a random peer
if (fetcher.url == null) {
fetcher = new URLFetcher(
} else {
fetcher = new URLFetcher(
} else {
prop.put("threadError", ERR_THREAD_RESUME);
else if (post.containsKey("resetDelay")) {
final long frequency = getDate(post.get("newDelay", ""), "minutes");
if (frequency == -1) {
prop.put("freqError", ERR_DATE);
} else {
// NOTE(review): fetcher is dereferenced without a null check here, unlike in the
// stop/restart branches -- confirm it cannot be null at this point.
fetcher.delay = frequency;
prop.put("LOCATION", "/CrawlURLFetch_p.html");
// status section, only filled once a fetcher exists
if (fetcher != null) {
prop.put("runs", "1");
((fetcher.paused && fetcher.isAlive()) ? STAT_THREAD_PAUSED :
prop.putNum("runs_totalRuns", URLFetcher.totalRuns);
prop.putNum("runs_totalFetchedURLs", URLFetcher.totalFetchedURLs);
prop.putNum("runs_totalFailedURLs", URLFetcher.totalFailed);
prop.putNum("runs_lastRun", fetcher.lastRun);
prop.putNum("runs_lastFetchedURLs", fetcher.lastFetchedURLs);
prop.put("runs_lastServerResponse", (fetcher.lastServerResponse == null)
? "" : fetcher.lastServerResponse);
// delay is kept in milliseconds, the page shows minutes
prop.putNum("runs_curDelay", (int)(fetcher.delay / 60000));
// one row per URL that could not be stacked
Iterator it = fetcher.failed.keySet().iterator();
int i = 0;
Object key;
while (it.hasNext()) {
key = it.next();
prop.put("runs_error_" + i + "_reason", fetcher.failed.get(key));
// NOTE(review): stackURLs() stores yacyURL objects as keys of fetcher.failed, so
// this String cast looks unsafe -- confirm.
prop.put("runs_error_" + i + "_url", (String)key);
prop.put("runs_error", i);
return prop;
private static int listURLs(serverObjects prop) {
if (savedURLs.size() == 0) return 0;
prop.put("saved", "1");
for (int i=0; i<savedURLs.size(); i++)
prop.put("saved_urls_" + i + "_url", savedURLs.get(i));
prop.putNum("saved_urls", savedURLs.size());
return savedURLs.size();
private static int listPeers(serverObjects prop, boolean checkURLCount, httpRemoteProxyConfig theRemoteProxyConfig) {
int peerCount = 0;
TreeMap hostList = new TreeMap();
String peername;
if (yacyCore.seedDB != null && yacyCore.seedDB.sizeConnected() > 0) {
final Iterator e = yacyCore.seedDB.seedsConnected(true, false, null, yacyVersion.YACY_PROVIDES_CRAWLS_VIA_LIST_HTML);
int dbsize;
while (e.hasNext()) {
yacySeed seed = (yacySeed) e.next();
if (seed != null && !seed.hash.equals(yacyCore.seedDB.mySeed().hash)) {
peername = seed.get(yacySeed.NAME, "nameless");
if (checkURLCount && (dbsize = getURLs2Fetch(seed, theRemoteProxyConfig)) > 0) {
hostList.put(peername + " (" + dbsize + ")", seed.hash);
} else {
hostList.put(peername, seed.hash);
if (hostList.size() > 0) {
while (!hostList.isEmpty() && (peername = (String) hostList.firstKey()) != null) {
final String hash = (String) hostList.get(peername);
prop.put("peersKnown_peers_" + peerCount + "_hash", hash);
prop.put("peersKnown_peers_" + peerCount + "_name", peername);
prop.put("peersKnown_peers", peerCount);
prop.put("peersKnown", "1");
} else {
prop.put("peersKnown", "0");
return peerCount;
/**
 * Asks a peer how many URLs it offers by downloading its list servlet in MODE_COUNT.
 *
 * Returns the number from the answer, -1 if the answer is not a plain number,
 * -2 on an IOException and -3 if the servlet URL is malformed.
 */
private static int getURLs2Fetch(yacySeed seed, httpRemoteProxyConfig theRemoteProxyConfig) {
try {
// NOTE(review): the response bytes are decoded with the platform default charset.
String answer = new String(httpc.wget(
URLFetcher.getListServletURL(seed.getPublicAddress(), URLFetcher.MODE_COUNT, 0, null),
null, null,
if (answer.matches("\\d+"))
return Integer.parseInt(answer);
else {
serverLog.logFine("URLFETCHER", "Retrieved invalid answer from " + seed.getName() + ": '" + answer + "'");
return -1;
} catch (MalformedURLException e) {
/* should not happen */
return -3;
} catch (IOException e) {
return -2;
private static long getDate(String count, String type) {
long r = 0;
if (count != null && count.matches("\\d+")) r = Long.parseLong(count);
if (r < 1) return -1;
r *= 60000;
if (type.equals("days")) return r * 60 * 24;
else if (type.equals("hours")) return r * 60;
else if (type.equals("minutes")) return r;
else return -1;
/**
 * Thread that downloads a URL list, either from a fixed URL or from a randomly chosen
 * peer, and passes the URLs to the switchboard's crawl stacker.
 */
public static class URLFetcher extends Thread {
// Special delay values: a negative delay is tested separately in run(); with a delay
// of 0 run() pauses and waits until the paused flag is cleared.
public static final long DELAY_ONCE = -1;
public static final long DELAY_SELF_DET = 0;
// Modes of the list servlet, see getListServletURL(): URL list or URL count.
public static final int MODE_LIST = 0;
public static final int MODE_COUNT = 1;
// Counters shared by all fetcher instances.
// NOTE(review): they are plain static ints and are displayed by respond() -- confirm
// that unsynchronized access is acceptable.
public static int totalRuns = 0;
public static int totalFetchedURLs = 0;
public static int totalFailed = 0;
// URLs that could not be stacked, mapped to the reason; filled by stackURLs().
public final HashMap failed = new HashMap();
// Results of the most recent pass.
public int lastFetchedURLs = 0;
// Duration of the most recent pass in milliseconds.
public long lastRun = 0;
// Status code and text of the most recent list download, set by getURLs().
public String lastServerResponse = null;
public int lastFailed = 0;
// Fixed list URL, or null to pick a random peer on each pass (see getDLURL()).
public final yacyURL url;
// Number of URLs to request per pass.
public final int count;
// Delay in milliseconds, or one of the DELAY_* values; may be changed by respond().
public long delay;
public final plasmaSwitchboard sb;
public final plasmaCrawlProfile.entry profile;
// Set by run() while it waits between passes.
public boolean paused = false;
public static yacyURL getListServletURL(String host, int mode, int count, String peerHash) {
String r = "http://" + host + "/yacy/list.html?list=queueUrls&display=";
switch (mode) {
case MODE_LIST: r += "list"; break;
case MODE_COUNT: r += "count"; break;
if (count > 0) r += "&count=" + count;
if (peerHash != null && peerHash.length() > 0) {
r += "&iam=" + peerHash;
} else if (mode == MODE_LIST) {
r += "&iam=" + yacyCore.seedDB.mySeed().hash;
try {
return new yacyURL(r, null);
} catch (MalformedURLException e) {
return null;
public URLFetcher(
serverSwitch env,
plasmaCrawlProfile.entry profile,
yacyURL url,
int count,
long delayMs) {
if (env == null || profile == null || url == null)
throw new NullPointerException("env, profile or url must not be null");
this.sb = (plasmaSwitchboard)env;
this.profile = profile;
this.url = url;
this.count = count;
this.delay = delayMs;
public URLFetcher(
serverSwitch env,
plasmaCrawlProfile.entry profile,
int count,
long delayMs) {
if (env == null || profile == null)
throw new NullPointerException("env or profile must not be null");
this.sb = (plasmaSwitchboard)env;
this.profile = profile;
this.url = null;
this.count = count;
this.delay = delayMs;
/**
 * Thread body: until interrupted, determines the list URL, downloads the list, stacks
 * the URLs and records the duration of the pass; what happens afterwards depends on delay.
 * An InterruptedException ends the thread.
 */
public void run() {
this.paused = false;
long start;
// local variable shadowing the field of the same name
yacyURL url;
while (!isInterrupted()) {
try {
start = System.currentTimeMillis();
url = getDLURL();
if (url == null) {
serverLog.logSevere(this.getName(), "canceled because no valid URL for the URL-list could be determinded");
// NOTE(review): in the code visible here execution continues after the log call even
// when url is null (getURLs(null) returns null and stackURLs(null) returns 0) --
// confirm whether the thread is meant to stop at this point.
totalFetchedURLs += stackURLs(getURLs(url));
this.lastRun = System.currentTimeMillis() - start;
serverLog.logInfo(this.getName(), "Loaded " + this.lastFetchedURLs + " URLs from " + url + " in " + this.lastRun + " ms into stackcrawler.");
// a negative delay (DELAY_ONCE) or an interrupt is handled separately from waiting
if (this.delay < 0 || isInterrupted()) {
} else synchronized (this) {
if (this.delay == 0) {
// DELAY_SELF_DET: stay paused until another thread clears the flag and notifies
this.paused = true;
while (this.paused) this.wait();
} else {
this.paused = true;
this.paused = false;
} catch (InterruptedException e) { return; }
private yacyURL getDLURL() {
if (this.url != null) return this.url;
// choose random seed
yacySeed ys = null;
Iterator e = yacyCore.seedDB.seedsConnected(true, false, null, yacyVersion.YACY_PROVIDES_CRAWLS_VIA_LIST_HTML);
int num = new Random().nextInt(yacyCore.seedDB.sizeConnected()) + 1;
Object o;
for (int i=0; i<num && e.hasNext(); i++) {
o = e.next();
if (o != null) ys = (yacySeed)o;
if (ys == null) return null;
return getListServletURL(ys.getPublicAddress(), MODE_LIST, this.count, yacyCore.seedDB.mySeed().hash);
/**
 * Passes the given URLs to the crawl stacker one by one, stopping early when the thread
 * is interrupted. A null reason from the stacker is treated as success; otherwise the
 * URL and the reason are recorded in the failed map.
 * Resets lastFailed and lastFetchedURLs at the start; returns lastFetchedURLs, or 0 for
 * a null list.
 */
private int stackURLs(ArrayList /*of yacyURL*/ urls) {
this.lastFailed = 0;
this.lastFetchedURLs = 0;
if (urls == null) return 0;
String reason;
// local variable shadowing the field of the same name
yacyURL url;
for (int i = 0; i < urls.size() && !isInterrupted(); i++) {
url = (yacyURL) urls.get(i);
// NOTE(review): most arguments of this call are not visible here.
reason = this.sb.crawlStacker.stackCrawl(
new Date(),
if (reason == null) {
serverLog.logFine(this.getName(), "stacked " + url);
} else {
serverLog.logFine(this.getName(), "error on stacking " + url + ": " + reason);
// the key is the yacyURL object itself, not its string form
this.failed.put(url, reason);
plasmaCrawlZURL.Entry ee = this.sb.crawlQueues.errorURL.newEntry(
return this.lastFetchedURLs;
/**
 * Downloads the list at the given URL and parses it into yacyURL objects, one per line.
 * Lines that are not valid URLs are skipped. Returns null for a null URL; otherwise the
 * list of parsed URLs, which is empty when the response status is not 2xx or when an
 * IOException occurred.
 */
private ArrayList /*of yacyURL */ getURLs(yacyURL url) {
if (url == null) return null;
ArrayList a = new ArrayList();
try {
// NOTE(review): the leading constructor arguments are not visible here.
httpc con = new httpc(
plasmaSwitchboard.getSwitchboard().remoteProxyConfig, null, null);
httpHeader header = new httpHeader();
// NOTE(review): "US-ASCII" is a charset name, while this header field is named
// ACCEPT_ENCODING -- confirm this is the intended header.
header.put(httpHeader.ACCEPT_ENCODING, "US-ASCII");
header.put(httpHeader.HOST, url.getHost());
httpc.response res = con.GET(url.getPath() + "?" + url.getQuery(), header);
serverLog.logFine(this.getName(), "downloaded URL-list from " + url + " (" + res.statusCode + ")");
this.lastServerResponse = res.statusCode + " (" + res.statusText + ")";
// only successful (2xx) responses are parsed
if (res.status.startsWith("2")) {
serverByteBuffer sbb = new serverByteBuffer();
//byte[] cbs = res.writeContent();
res.writeContent(sbb, null);
// decode with the charset announced by the server, falling back to US-ASCII
String encoding = res.responseHeader.getCharacterEncoding();
if (encoding == null) encoding = "US-ASCII";
String[] s = (new String(sbb.getBytes(), encoding)).split("\n");
for (int i = 0; i < s.length; i++) {
try {
a.add(new yacyURL(s[i], null));
} catch (MalformedURLException e) {}
// NOTE(review): I/O errors are swallowed silently; the caller only sees an empty or
// partial list.
} catch (IOException e) { }
return a;
@ -1,7 +0,0 @@
<div class="SubMenu">
<h3>URL Fetcher Menu</h3>
<ul class="SubMenu">
<li><a href="/CrawlURLFetch_p.html" class="MenuItemLink lock">URL Fetcher</a></li>
<li><a href="/CrawlURLFetchStack_p.html" class="MenuItemLink lock">URL Stack</a></li>
@ -0,0 +1,30 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<title>YaCy '#[clientname]#': Index Control</title>
<body id="IndexControl">
<h2>remote crawl fetch test</h2>
<form name="selection" action="rct_p.html" method="post" enctype="multipart/form-data">
<fieldset><legend>Retrieve remote crawl url list</legend>
<dt class="TableCellDark">Target Peer:</dt>
<dd>select <select name="peer">
<option value="#[hosthash]#">#[hostname]#</option>
<dt class="TableCellLight"></dt>
<dd><input type="submit" name="retrieve" value="retrieve" />
@ -0,0 +1,124 @@
// rct_p.java
// -----------------------
// (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 28.11.2007 on http://yacy.net
// This is a part of YaCy, a peer-to-peer based web search engine
// $LastChangedDate: 2007-11-14 01:15:28 +0000 (Mi, 14 Nov 2007) $
// $LastChangedRevision: 4216 $
// $LastChangedBy: orbiter $
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import java.net.MalformedURLException;
import java.text.ParseException;
import java.util.Date;
import java.util.Iterator;
import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverDate;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.xml.rssReader;
import de.anomic.yacy.yacyClient;
import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacySeed;
import de.anomic.yacy.yacyURL;
/**
 * Servlet class for the "remote crawl fetch test" page (rct_p.html).
 * On request it pulls crawl URLs from a selected connected peer and puts
 * them on the local crawl stack, and it fills the peer selection list of
 * the page.
 */
public class rct_p {
/**
 * Handles a request for the page.
 * When the posted form contains the key "retrieve", the seed whose hash was
 * posted as "peer" is looked up among the connected seeds, remote crawl URLs
 * are queried from it and every returned item is offered to the crawl stacker.
 *
 * @param header the request header (not read in the visible code)
 * @param post   the posted form values; may be null
 * @param env    the server switch; it is cast to plasmaSwitchboard
 * @return the template properties for the page
 */
public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) {
// return variable that accumulates replacements
plasmaSwitchboard sb = (plasmaSwitchboard) env;
serverObjects prop = new serverObjects();
if (post != null) {
if (post.containsKey("retrieve")) {
String peerhash = post.get("peer", null);
// every step yields null when its input is missing, so an unknown peer just skips the retrieval
yacySeed seed = (peerhash == null) ? null : yacyCore.seedDB.getConnected(peerhash);
// presumably the second argument (10) is the maximum number of URLs requested - confirm in yacyClient
rssReader reader = (seed == null) ? null : yacyClient.queryRemoteCrawlURLs(seed, 10);
if (reader != null) {
rssReader.Item item;
for (int i = 0; i < reader.items(); i++) {
item = reader.getItem(i);
//System.out.println("URL=" + item.getLink() + ", desc=" + item.getDescription() + ", pubDate=" + item.getPubDate());
// put url on remote crawl stack
yacyURL url;
try {
url = new yacyURL(item.getLink(), null);
} catch (MalformedURLException e) {
// NOTE(review): url stays null for a malformed link, but it is still passed to
// sb.acceptURL(url) below and dereferenced via url.toNormalform(...) in the
// else branch - that branch throws a NullPointerException if acceptURL(null)
// returns false; verify and skip such items if necessary
url = null;
Date loaddate;
try {
loaddate = serverDate.parseShortSecondTime(item.getPubDate());
} catch (ParseException e) {
// an unparsable publication date falls back to the current time
loaddate = new Date();
yacyURL referrer = null; // referrer needed!
if (sb.acceptURL(url)) {
// stack url
sb.getLog().logFinest("crawlOrder: stack: url='" + url + "'");
// the hash of the queried peer is passed as third argument, with the default remote profile
String reasonString = sb.crawlStacker.stackCrawl(url, referrer, peerhash, "REMOTE-CRAWLING", loaddate, 0, sb.defaultRemoteProfile);
// a null reason means the URL was stacked; every other outcome is only logged
if (reasonString == null) {
// done
env.getLog().logInfo("crawlOrder: added remote crawl url: " + url.toNormalform(true, false));
} else if (reasonString.startsWith("double")) {
// case where we have already the url loaded;
env.getLog().logInfo("crawlOrder: ignored double remote crawl url: " + url.toNormalform(true, false));
} else {
env.getLog().logInfo("crawlOrder: ignored [" + reasonString + "] remote crawl url: " + url.toNormalform(true, false));
} else {
env.getLog().logWarning("crawlOrder: Received URL outside of our domain: " + url.toNormalform(true, false));
// return rewrite properties
return prop;
/**
 * Writes the peers returned by yacyCore.dhtAgent.getProvidesRemoteCrawlURLs()
 * into the template properties as "hosts_N_hosthash" / "hosts_N_hostname"
 * entries and stores the number of entries under "hosts".
 * If there is no seed database or no connected seed, "hosts" is set to "0".
 *
 * @param prop the template properties to fill
 */
private static void listHosts(serverObjects prop) {
// list known hosts
yacySeed seed;
int hc = 0;
if (yacyCore.seedDB != null && yacyCore.seedDB.sizeConnected() > 0) {
Iterator e = yacyCore.dhtAgent.getProvidesRemoteCrawlURLs();
while (e.hasNext()) {
seed = (yacySeed) e.next();
if (seed != null) {
prop.put("hosts_" + hc + "_hosthash", seed.hash);
// display text: hash, peer name ("nameless" if unset) and the RCOUNT value in parentheses
prop.putHTML("hosts_" + hc + "_hostname", seed.hash + " " + seed.get(yacySeed.NAME, "nameless") + " (" + seed.getLong(yacySeed.RCOUNT, 0) + ")");
// NOTE(review): no increment of hc is visible in this excerpt - without one every
// seed overwrites entry 0 and "hosts" is always 0; confirm that hc++ follows here
prop.put("hosts", hc);
} else {
prop.put("hosts", "0");
@ -1 +0,0 @@
@ -1,152 +0,0 @@
// list.java
// -----------------------
// part of YaCy
// (C) by Michael Peter Christen; mc@anomic.de
// first published on http://www.anomic.de
// Frankfurt, Germany, 2004
// This File is contributed by Alexander Schier
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this softare or this documentation. The usage of this software
// is on your own risk. The installation and usage (starting/running) of this
// software may allow other people or application to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follows this copyright notice here, but changes must not be
// done inside the copyright notive above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.
// You must compile this file with
// javac -classpath .:../../classes list.java
// if the shell's current path is HTROOT
// contains contributions by [FB] to support listing URLs for URL Fetcher
import java.io.File;
import de.anomic.data.URLFetcherStack;
import de.anomic.data.htmlTools;
import de.anomic.data.listManager;
import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverCore;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacyNetwork;
import de.anomic.yacy.yacySeed;
import de.anomic.yacy.yacyURL;
/**
 * Servlet class that answers list requests of other peers.
 * Two kinds of requests are handled in the visible code: col=black returns
 * the content of the shared blacklist files, and list=queueUrls (with an
 * empty col) either pops URLs from the URL fetcher stack or reports its size.
 */
public final class list {
/**
 * Handles a list request.
 *
 * @param header the request header; the client IP is read from it when the
 *               requesting peer cannot be identified by name
 * @param post   the request parameters; must not be null
 * @param env    the server switch; must not be null, it is cast to
 *               plasmaSwitchboard
 * @return the response properties; they are returned without list content
 *         when the request is not authenticated, and null is returned when
 *         this peer is in robinson mode and the requester is not in its cluster
 * @throws NullPointerException if post or env is null
 */
public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) {
if (post == null || env == null)
throw new NullPointerException("post: " + post + ", sb: " + env);
plasmaSwitchboard sb = (plasmaSwitchboard) env;
// return variable that accumulates replacements
final serverObjects prop = new serverObjects();
// NOTE(review): this condition can never be true here, the same check above already throws
if ((post == null) || (env == null)) return prop;
if (!yacyNetwork.authentifyRequest(post, env)) return prop;
final String col = post.get("col", "");
final File listsPath = env.getConfigPath(plasmaSwitchboard.LISTS_PATH, plasmaSwitchboard.LISTS_PATH_DEFAULT);
// identify the requester: the name of the seed given as "iam", else the client IP
String otherPeerName = null;
if (post.containsKey("iam")) {
yacySeed bla = yacyCore.seedDB.get(post.get("iam", ""));
if (bla != null) otherPeerName = bla.getName();
if (otherPeerName == null) otherPeerName = (String)header.get(httpHeader.CONNECTION_PROP_CLIENTIP);
if ((sb.isRobinsonMode()) && (!sb.isInMyCluster(otherPeerName))) {
// if we are a robinson cluster, answer only if this client is known by our network definition
return null;
if (col.equals("black")) {
final StringBuffer out = new StringBuffer();
// "BlackLists.Shared" holds a comma-separated list of file names below listsPath
final String filenames=env.getConfig("BlackLists.Shared", "");
final String[] filenamesarray = filenames.split(",");
if(filenamesarray.length > 0){
for(int i = 0;i < filenamesarray.length; i++){
String filename = filenamesarray[i];
File fileObj = new File(listsPath,filename);
out.append(listManager.getListString(fileObj, false))
} // if filenamesarray.length > 0
// start contrib by [FB]
else if (col.length() == 0 && post.get("list", "").equals("queueUrls")) {
final URLFetcherStack db = CrawlURLFetchStack_p.getURLFetcherStack(env);
final String display = post.get("display", "list");
if (display.equals("list")) {
// list urls from remote crawler queue for other peers
// the requested count (default 50) is capped by CrawlURLFetchStack_p.maxURLsPerFetch
final int count = Math.min(post.getInt("count", 50), CrawlURLFetchStack_p.maxURLsPerFetch);
if (count > 0 && db.size() > 0) {
final StringBuffer b = new StringBuffer();
yacyURL url;
int cnt = 0;
for (int i=0; i<count; i++) {
// pop() removes the URL from the stack; null results are skipped
if ((url = db.pop()) == null) continue;
b.append(htmlTools.decodeHtml2Unicode(url.toNormalform(false, true))).append("\n");
// NOTE(review): no increment of cnt is visible in this excerpt - presumably it is
// increased once per appended URL; confirm
prop.put("list", b.toString());
// NOTE(review): the statement the next three lines belong to starts on a line that
// is not visible in this excerpt; the visible expression adds cnt to the value
// stored in CrawlURLFetchStack_p.fetchMap for otherPeerName (0 if there is none)
new Integer(((CrawlURLFetchStack_p.fetchMap.get(otherPeerName) == null)
? 0
: ((Integer)CrawlURLFetchStack_p.fetchMap.get(otherPeerName)).intValue()) + cnt));
serverLog.logInfo("URLFETCHER", "sent " + cnt + " URLs to " + otherPeerName);
} else {
prop.put("list", "");
serverLog.logInfo("URLFETCHER", "couldn't satisfy URL request from " + otherPeerName + ": stack is empty");
} else if (display.equals("count")) {
// only report how many URLs are currently on the stack
prop.put("list", db.size());
// end contrib by [FB]
} else {
return prop;
Reference in new issue