- instead of pushing URLs to other peers, the URLs are actively pulled by the peer that wants to do a remote crawl
- the remote crawl push process has been removed
- a process that adds URLs from remote peers has been added
- the server-side interface for providing 'limit' URLs has existed since 0.55 and works with this version
- the list interface has been removed
- servlets using the list interface have been removed (this implementation did not properly manage the double-check)
- changes in the configuration file to support the new pull process
- fixed a bug in the crawl balancer (status was not saved/closed properly)
- the yacy/urls protocol was extended to support different networks/clusters
- many interface adaptations to the new stack counters

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4232 6c8d7289-2bf4-0310-a012-ef5d649a1542
parent 69521d92e5
commit 89b9b2b02a
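The heart of this commit is the direction reversal described above: remote-crawl URLs are now pulled, not pushed. Condensed from the new rct_p.java further down, the pull step amounts to the sketch below; the wrapper class, the pull helper and its parameters are illustrative only, and error handling is trimmed to the essentials.

    import java.net.MalformedURLException;
    import java.util.Date;

    import de.anomic.plasma.plasmaSwitchboard;
    import de.anomic.xml.rssReader;
    import de.anomic.yacy.yacyClient;
    import de.anomic.yacy.yacyCore;
    import de.anomic.yacy.yacySeed;
    import de.anomic.yacy.yacyURL;

    public class PullSketch {
        // Pull up to `count` crawl URLs from the peer with the given hash and
        // hand them to the local crawl stacker (condensed from rct_p.java below).
        static void pull(plasmaSwitchboard sb, String peerhash, int count) {
            yacySeed seed = yacyCore.seedDB.getConnected(peerhash);
            rssReader reader = (seed == null) ? null : yacyClient.queryRemoteCrawlURLs(seed, count);
            if (reader == null) return;
            for (int i = 0; i < reader.items(); i++) {
                try {
                    yacyURL url = new yacyURL(reader.getItem(i).getLink(), null);
                    // depth 0 + default remote profile: the pulled URL itself is
                    // crawled locally, its links are not followed
                    sb.crawlStacker.stackCrawl(url, null, peerhash, "REMOTE-CRAWLING",
                            new Date(), 0, sb.defaultRemoteProfile);
                } catch (MalformedURLException e) { /* skip invalid entries */ }
            }
        }
    }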
@@ -1,68 +0,0 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>YaCy '#[clientname]#': URL Fetcher Stack Management</title>
#%env/templates/metas.template%#
</head>
<body id="CrawlURLFetchStack_p">
#%env/templates/header.template%#
#%env/templates/submenuCrawlURLFetch.template%#
<h2>Manage stack for remote URL fetches</h2>

#(addedUrls)#::<span class="success">Added #[added]# URLs!</span>#(/addedUrls)#
<form method="post" action="CrawlURLFetchStack_p.html" enctype="multipart/form-data">
<fieldset><legend>Statistics</legend>
<dl>
<dt>Currently stacked URLs:</dt><dd>#[urlCount]#</dd>
<dt>Total fetched / added URLs:</dt><dd>#[totalFetched]# / #[totalAdded]#</dd>
#{peers}#
<dt>Fetched from #[peer]#</dt><dd>#[amount]#</dd>#{/peers}#
</dl>
</fieldset>

<fieldset><legend>Settings</legend>
<dl>
<dt><label for="maxSize">Maximum URLs for each transfer</label>:</dt>
<dd>
<input type="text" name="maxSize" id="maxSize" value="#[maxSize]#" maxlength="3" size="3" />
<input type="submit" name="setMaxSize" value="Set" />
#(set)#::
<span class="success">Set max. size for each transfer to #[value]#</span>::
<span class="error">Setting max. size for each transfer to #[value]# was unsuccessful: may not be negative</span>#(/set)#
</dd>
</dl>
</fieldset>

<fieldset><legend>Add URLs to stack</legend>
<dl>
<dt><label for="shiftloc">Shift URLs from Local Crawler</label>:</dt>
<dd>
<input type="text" name="shiftloc" id="shiftloc" value="#[locurlsVal]#" size="5" maxlength="5" style="text-align: right;" />
of <span class="tt">#[locurls]#</span> URLs
<input type="submit" name="shiftlcq" value="Shift" />#(shiftloc)#::
<span class="success">Shifted #[value]# URLs from Local Crawler Queue to URL Fetcher Stack (not bound: #[failed]#)</span>#(/shiftloc)#
</dd>
<dt><label for="shiftrem">Shift URLs from Remote Crawler</label>:</dt>
<dd>
<input type="text" name="shiftrem" id="shiftrem" value="#[remurlsVal]#" size="5" maxlength="5" style="text-align: right;" />
of <span class="tt">#[remurls]#</span> URLs
<input type="submit" name="shiftrcq" value="Shift" />#(shiftrem)#::
<span class="success">Shifted #[value]# URLs from Remote Crawler Queue to URL Fetcher Stack (not bound: #[failed]#)</span>#(/shiftrem)#
</dd>
<dt><label for="upload">Upload URL-List</label>:</dt>
<dd>
<input type="file" name="upload" id="upload" /> #(uploadError)#:: <span class="error">No file entered for upload</span>#(/uploadError)#<br />
<input type="radio" name="uploadType" id="plain" value="plain" checked="checked" /> <label for="plain">Plain text, line-separated</label><br />
<input type="radio" name="uploadType" id="html" value="html" /> <label for="html">HTML file, links will be added</label><br />
<input type="checkbox" name="blacklistCheck" id="blacklistCheck" checked="checked" /> <label for="blacklistCheck">Don't add URLs matching blacklists active for crawler</label><br />
<input type="submit" name="subupload" value="Upload File" />
#(upload)#::
<span class="success">Added #[added]# and rejected #[failed]# URLs from uploaded file successfully</span>::
<span class="error">An internal error occurred processing the uploaded file: #[error]#</span>#(/upload)#
</dd>
</dl>
</fieldset>
</form>
#%env/templates/footer.template%#
</body>
</html>
@@ -1,299 +0,0 @@
// CrawlURLFetchStack_p.java
// -------------------------------------
// part of YACY
//
// (C) 2007 by Franz Brausze
//
// last change: $LastChangedDate: $ by $LastChangedBy: $
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this software or this documentation. The usage of this software
// is on your own risk. The installation and usage (starting/running) of this
// software may allow other people or application to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
//
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follow this copyright notice here, but changes must not be
// done inside the copyright notice above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.PrintWriter;
import java.io.Writer;
import java.net.MalformedURLException;
import java.util.HashMap;
import java.util.Iterator;

import de.anomic.data.URLFetcherStack;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterWriter;
import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaCrawlEntry;
import de.anomic.plasma.plasmaCrawlNURL;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.urlPattern.plasmaURLPattern;
import de.anomic.server.serverFileUtils;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacyURL;

public class CrawlURLFetchStack_p {

    public static final HashMap /* of PeerName, sent URLs */ fetchMap = new HashMap();
    private static URLFetcherStack stack = null;
    public static int maxURLsPerFetch = 50;

    public static URLFetcherStack getURLFetcherStack(serverSwitch env) {
        if (stack == null) try {
            stack = new URLFetcherStack(env.getConfigPath(plasmaSwitchboard.DBPATH, plasmaSwitchboard.DBPATH_DEFAULT));
        } catch (IOException e) {
            serverLog.logSevere("URLFETCHER", "Couldn't initialize URL stack: " + e.getMessage());
        }
        return stack;
    }

    public static final String STREAM_CMD_ADDURLS_ = "ADD URLS: ";
    public static final String STREAM_CMD_ADDURLSBLCHK_ = "ADD URLS CHECK BLACKLIST: ";
    public static final String STREAM_CMD_END = "END";
    public static final String STREAM_RESP_OK_ADDURLS_ = "FAILED URLS: ";
    public static final String STREAM_RESP_OK = "OK";
    public static final String STREAM_RESP_FAILED = "FAILED";

    public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) {
        final serverObjects prop = new serverObjects();
        plasmaSwitchboard sb = (plasmaSwitchboard) env;

        if (((String) header.get(httpHeader.CONNECTION_PROP_PATH)).endsWith(".stream")) {
            /* =================================================================
             * .stream request
             * ================================================================= */
            InputStream in = (InputStream) header.get(httpHeader.CONNECTION_PROP_INPUTSTREAM);
            OutputStream out = (OutputStream) header.get(httpHeader.CONNECTION_PROP_OUTPUTSTREAM);
            BufferedReader inrb = new BufferedReader(new InputStreamReader(in));
            PrintWriter outw = new PrintWriter(out);

            String line;
            int addurls = 0, cururl = 0;
            boolean[] status = new boolean[0];
            boolean blchk = false;
            URLFetcherStack stack = getURLFetcherStack(env);
            try {
                while ((line = inrb.readLine()) != null) {
                    // commands
                    if (line.startsWith(STREAM_CMD_ADDURLS_)) {
                        try {
                            addurls = Integer.parseInt(line.substring(STREAM_CMD_ADDURLS_.length()));
                            status = new boolean[addurls];
                            cururl = 0;
                            blchk = false;
                            outw.println(STREAM_RESP_OK);
                        } catch (NumberFormatException e) {
                            outw.println(STREAM_RESP_FAILED);
                        }
                    } else if (line.startsWith(STREAM_CMD_ADDURLSBLCHK_)) {
                        try {
                            addurls = Integer.parseInt(line.substring(STREAM_CMD_ADDURLSBLCHK_.length()));
                            status = new boolean[addurls];
                            cururl = 0;
                            blchk = true;
                            outw.println(STREAM_RESP_OK);
                        } catch (NumberFormatException e) {
                            outw.println(STREAM_RESP_FAILED);
                        }
                    } else if (line.equals(STREAM_CMD_END)) {
                        break;
                    } else {
                        if (cururl < addurls) // add url
                            status[cururl++] = addURL(line, blchk, stack);

                        if (cururl > 0 && cururl == addurls) {
                            // batch complete, report the indices of rejected URLs, e.g. 'FAILED URLS: 1, 3 of 8'
                            outw.print(STREAM_RESP_OK_ADDURLS_);
                            StringBuffer stat = new StringBuffer();
                            for (int i = 0; i < status.length; i++)
                                if (!status[i]) stat.append(i).append(", ");
                            // only strip the trailing ", " if at least one URL was rejected
                            if (stat.length() >= 2) outw.print(stat.substring(0, stat.length() - 2));
                            outw.print(" of ");
                            outw.println(status.length);
                            cururl = 0;
                            addurls = 0;
                        }
                    }
                }
            } catch (IOException e) { e.printStackTrace(); }
            outw.flush();
            return null;
        }
        /* =================================================================
         * 'normal' request
         * ================================================================= */
        if (post != null) {
            if (post.containsKey("addurls")) {
                prop.put("addedUrls", "1");
                prop.put("addedUrls_added", addURLs(post, post.getInt("addurls", -1), getURLFetcherStack(env)));
            }
            else if (post.containsKey("setMaxSize")) {
                final int count = post.getInt("maxSize", maxURLsPerFetch);
                if (count > 0) {
                    maxURLsPerFetch = count;
                    prop.put("set", "1");
                    prop.put("set_value", maxURLsPerFetch);
                } else {
                    prop.put("set", "2");
                    prop.put("set_value", count);
                }
            }
            else if (post.containsKey("shiftlcq")) {
                final int count = Math.min(post.getInt("shiftloc", 0), sb.crawlQueues.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE));
                final int failed = shiftFromNotice(sb.crawlQueues.noticeURL, plasmaCrawlNURL.STACK_TYPE_CORE, getURLFetcherStack(env), count);
                prop.put("shiftloc", "1");
                prop.put("shiftloc_value", count - failed);
                prop.put("shiftloc_failed", failed);
            }
            else if (post.containsKey("shiftrcq")) {
                final int count = post.getInt("shiftrem", 0);
                final int failed = shiftFromNotice(sb.crawlQueues.noticeURL, plasmaCrawlNURL.STACK_TYPE_LIMIT, getURLFetcherStack(env), count);
                prop.put("shiftrem", "1");
                prop.put("shiftrem_value", count - failed);
                prop.put("shiftrem_failed", failed);
            }
            else if (post.containsKey("subupload")) {
                if (post.get("upload", "").length() == 0) {
                    prop.put("uploadError", "1");
                } else {
                    final File file = new File(post.get("upload", ""));
                    final String content = new String((byte[]) post.get("upload$file"));

                    final String type = post.get("uploadType", "");
                    final boolean blCheck = post.containsKey("blacklistCheck");
                    if (type.equals("plain")) {
                        prop.put("upload_added", addURLs(content.split("\n"), blCheck, getURLFetcherStack(env)));
                        prop.put("upload_failed", "0");
                        prop.put("upload", "1");
                    } else if (type.equals("html")) {
                        try {
                            final htmlFilterContentScraper scraper = new htmlFilterContentScraper(new yacyURL(file));
                            final Writer writer = new htmlFilterWriter(null, null, scraper, null, false);
                            serverFileUtils.write(content, writer);
                            writer.close();

                            final Iterator it = ((HashMap) scraper.getAnchors()).keySet().iterator();
                            int added = 0, failed = 0;
                            yacyURL url;
                            while (it.hasNext()) try {
                                url = new yacyURL((String) it.next(), null);
                                if (blCheck && plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_CRAWLER, url)) {
                                    failed++;
                                    continue;
                                }
                                getURLFetcherStack(env).push(url);
                                added++;
                            } catch (MalformedURLException e) { failed++; }
                            prop.put("upload", "1");
                            prop.put("upload_added", added);
                            prop.put("upload_failed", failed);
                        } catch (Exception e) {
                            e.printStackTrace();
                            prop.put("upload", "2");
                            prop.putHTML("upload_error", e.getMessage());
                        }
                    }
                }
            }
        }

        putFetched(prop);
        prop.put("urlCount", getURLFetcherStack(env).size());
        prop.put("totalFetched", getURLFetcherStack(env).getPopped());
        prop.put("totalAdded", getURLFetcherStack(env).getPushed());
        prop.put("maxSize", maxURLsPerFetch);
        prop.put("locurls", sb.crawlQueues.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE));
        prop.put("remurls", sb.crawlQueues.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT));
        prop.put("locurlsVal", Math.min(sb.crawlQueues.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE), 500));
        prop.put("remurlsVal", Math.min(sb.crawlQueues.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT), 500));
        return prop;
    }

    private static void putFetched(serverObjects prop) {
        Iterator it = fetchMap.keySet().iterator();
        int count = 0;
        while (it.hasNext()) {
            String key = (String) it.next();
            prop.putHTML("peers_" + count + "_peer", key);
            prop.put("peers_" + count + "_amount", ((Integer) fetchMap.get(key)).intValue());
            count++;
        }
        prop.put("peers", count);
    }

    private static int addURLs(String[] urls, boolean blCheck, URLFetcherStack stack) {
        int count = 0; // number of successfully added URLs
        for (int i = 0; i < urls.length; i++)
            if (addURL(urls[i], blCheck, stack)) count++;
        return count;
    }

    private static boolean addURL(String url, boolean blCheck, URLFetcherStack stack) {
        try {
            if (url == null || url.length() == 0) return false;
            yacyURL u = new yacyURL(url, null);
            if (blCheck && plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_CRAWLER, u)) return false;
            stack.push(u);
            return true;
        } catch (MalformedURLException e) { return false; }
    }

    private static int shiftFromNotice(plasmaCrawlNURL nurl, int fromStackType, URLFetcherStack stack, int count) {
        plasmaCrawlEntry entry;
        int failed = 0;
        for (int i = 0; i < count; i++) try {
            entry = nurl.pop(fromStackType, false);
            stack.push(entry.url());
        } catch (IOException e) { failed++; }
        return failed;
    }

    private static int addURLs(serverObjects post, int amount, URLFetcherStack stack) {
        int count = 0;
        String url;
        for (int i = 0; i < amount; i++) {
            url = post.get("url" + i, null);
            if (url == null || url.length() == 0) continue;
            try {
                stack.push(new yacyURL(url, null));
                count++;
            } catch (MalformedURLException e) {
                serverLog.logInfo("URLFETCHER", "retrieved invalid url for adding to the stack: " + url);
            }
        }
        return count;
    }
}
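Before it was removed, the .stream branch above spoke a small line protocol: a client announces a batch with "ADD URLS: <n>" (or "ADD URLS CHECK BLACKLIST: <n>" to apply the crawler blacklist), streams the URLs one per line, and closes with "END"; the server answers OK/FAILED per command and, after the batch, reports the indices of rejected URLs. A hedged sketch of one session, assuming out and in are already wired to the endpoint's raw streams (the HTTP framing that produces them is omitted) and out was constructed with auto-flush:

    import java.io.BufferedReader;
    import java.io.PrintWriter;

    public class StreamProtocolSketch {
        // One session of the removed CrawlURLFetchStack_p.stream line protocol.
        static void addTwoURLs(PrintWriter out, BufferedReader in) throws Exception {
            out.println("ADD URLS: 2");                // STREAM_CMD_ADDURLS_: announce two URLs
            System.out.println(in.readLine());         // server: "OK" (or "FAILED" on a bad count)
            out.println("http://example.net/a.html");  // accepted
            out.println("not-a-url");                  // malformed, will be rejected
            System.out.println(in.readLine());         // server: "FAILED URLS: 1 of 2" (indices of rejects)
            out.println("END");                        // STREAM_CMD_END terminates the session
        }
    }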
@@ -1,107 +0,0 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>YaCy '#[clientname]#': URL Fetcher Management</title>
#%env/templates/metas.template%#
</head>
<body id="CrawlURLFetch_p">
#%env/templates/header.template%#
#%env/templates/submenuCrawlURLFetch.template%#
<h2>URL-Fetcher</h2>
<form method="post" action="CrawlURLFetch_p.html" enctype="multipart/form-data">
<fieldset><legend>Fetch new URLs to crawl</legend>
<p>
The newly added URLs will be crawled without any filter restrictions except the <em>static</em> stop-words.
The Re-Crawl option isn't used and the sites won't be stored in the Proxy Cache. Text and media types will be indexed.
Since these URLs will be requested explicitly from another peer, they won't be distributed for remote indexing.
</p>
<dl>
<dt><label for="url">Fetch from URL</label>:</dt>
<dd>
<input type="radio" name="source" value="url" id="url" checked="checked" />
<input type="text" id="host" name="host" size="60" value="#[host]#" />
#(hostError)#:: <span class="error">Malformed URL</span>#(/hostError)#
#(saved)#::
</dd>
<dt><label for="savedURL">Or select previously entered URL</label>:</dt>
<dd>
<input type="radio" name="source" id="savedURL" value="saved" />
<select name="saved">#{urls}#
<option>#[url]#</option>#{/urls}#
</select>#(/saved)#
</dd>

#(peersKnown)#::
<dt><label for="peer">Fetch from Peer</label>:</dt>
<dd>
<input type="radio" name="source" value="peer" id="peer" />
<select name="peerhash">
<option value="random" selected="selected">Choose a random peer</option>#{peers}#
<option value="#[hash]#">#[name]#</option>#{/peers}#
</select>
<input type="submit" name="checkPeerURLCount" value="Check URL count" />
<label for="amount">Number of URLs to request</label>:
<input type="text" name="amount" id="amount" value="50" maxlength="3" size="3" />
#(peerError)#::
<span class="error">Error fetching URL-list from <span class="tt">#[hash]#:#[name]#</span></span>::
<span class="error">Peer with hash <span class="tt">#[hash]#</span> doesn't seem to be online anymore</span>#(/peerError)#
</dd>#(/peersKnown)#

<dt>Frequency:</dt>
<dd>
<input type="radio" name="reg" value="once" id="once" checked="checked" /> <label for="once">Fetch only once</label><br />
<input type="radio" name="reg" value="self_det" id="self_det" disabled="disabled" /> <label for="self_det">Fetch when queue is empty</label><br />
<input type="radio" name="reg" value="delay" id="delay" /> <label for="delay">Fetch with a specified delay</label>:
<label for="frequency">every</label>
<input type="text" name="frequency" id="frequency" size="2" style="text-align: right;" maxlength="2" />
<select name="freq_type">
<option value="days">Days</option>
<option value="hours" selected="selected">Hours</option>
<option value="minutes">Minutes</option>
</select>
#(freqError)#:: <span class="error">Invalid period, fetching only once</span>#(/freqError)#
</dd>
<dt><input type="submit" name="start" value="Fetch URLs" /></dt>
</dl>
</fieldset>
</form>

#(threadError)#::
<span class="error">Error stopping the thread, it isn't alive anymore</span>::
<span class="error">Error restarting the thread, it isn't alive anymore</span>#(/threadError)#

#(runs)#::
<form method="post" action="CrawlURLFetch_p.html" enctype="multipart/form-data">
<fieldset><legend>Thread to fetch URLs is #(status)#running::stopped::paused#(/status)#</legend>
<dl>
<dt>Total runs:</dt><dd>#[totalRuns]#</dd>
<dt>Total fetched URLs:</dt><dd>#[totalFetchedURLs]#</dd>
<dt>Total failed URLs:</dt><dd>#[totalFailedURLs]#</dd>
<dt>Last run duration:</dt><dd>#[lastRun]# ms</dd>
<dt>Last server response:</dt><dd>#[lastServerResponse]#</dd>
<dt>Last fetched URLs:</dt><dd>#[lastFetchedURLs]#</dd>
<dt>Last failed URLs:</dt>
<dd>
#[error]#
<ul>#{error}#
<li><span class="error">#[reason]#</span>: <a href="#[url]#">#[url]#</a></li>#{/error}#
</ul>
</dd>
<dt><label for="newDelay">Reset delay</label>:</dt>
<dd>
<input type="text" name="newDelay" id="newDelay" maxlength="2" size="6" value="#[curDelay]#" style="text-align: right;" /> minutes
<input type="submit" name="resetDelay" value="Set new delay" />
</dd>
<dt>#(status)#
<input type="submit" name="stop" value="Stop Thread" />::
<input type="submit" name="restart" value="Restart Thread" />::
<input type="submit" name="stop" value="Stop Thread" />
<input type="submit" name="restart" value="Restart Thread" />#(/status)#
</dt>
</dl>
</fieldset>
</form>
#(/runs)#
#%env/templates/footer.template%#
</body>
</html>
@@ -1,543 +0,0 @@
// CrawlURLFetch_p.java
// -------------------------------------
// part of YACY
//
// (C) 2007 by Franz Brausze
//
// last change: $LastChangedDate: $ by $LastChangedBy: $
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this software or this documentation. The usage of this software
// is on your own risk. The installation and usage (starting/running) of this
// software may allow other people or application to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
//
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follow this copyright notice here, but changes must not be
// done inside the copyright notice above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.

import java.io.IOException;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Random;
import java.util.TreeMap;

import de.anomic.plasma.plasmaCrawlProfile;
import de.anomic.plasma.plasmaCrawlZURL;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverByteBuffer;
import de.anomic.server.serverSwitch;
import de.anomic.http.httpHeader;
import de.anomic.http.httpRemoteProxyConfig;
import de.anomic.http.httpc;
import de.anomic.server.serverObjects;
import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacySeed;
import de.anomic.yacy.yacyURL;
import de.anomic.yacy.yacyVersion;

public class CrawlURLFetch_p {

    private static final long ERR_DATE = 1;
    private static final long ERR_HOST_MALFORMED_URL = 1;
    private static final long ERR_PEER_GENERAL_CONN = 1;
    private static final long ERR_PEER_OFFLINE = 2;
    private static final long ERR_THREAD_STOP = 1;
    private static final long ERR_THREAD_RESUME = 2;

    private static final long STAT_THREAD_ALIVE = 0;
    private static final long STAT_THREAD_STOPPED = 1;
    private static final long STAT_THREAD_PAUSED = 2;

    private static URLFetcher fetcher = null;
    private static plasmaCrawlProfile.entry profile = null;
    private static ArrayList savedURLs = new ArrayList();

    public static plasmaCrawlProfile.entry getCrawlProfile(serverSwitch env) {
        if (profile == null) {
            profile = ((plasmaSwitchboard) env).profilesActiveCrawls.newEntry(
                    "URLFetcher",        // Name
                    null,                // URL
                    ".*", ".*",          // General / specific filter
                    0, 0,                // General / specific depth
                    -1, -1, -1,          // Recrawl / Dom-filter depth / Dom-max-pages
                    true,                // Crawl query
                    true, true,          // Index text / media
                    false, true,         // Store in HT- / TX-Cache
                    false,               // Remote indexing
                    true, false, false); // Exclude static / dynamic / parent stopwords
        }
        return profile;
    }

    public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) {
        serverObjects prop = new serverObjects();
        prop.put("host", "");

        // list previously saved URLs for easy selection
        listURLs(prop);

        // list known hosts
        listPeers(prop,
                post != null && post.containsKey("checkPeerURLCount"),
                ((plasmaSwitchboard) env).remoteProxyConfig);

        if (post != null) {
            if (post.containsKey("start")) {
                long frequency = URLFetcher.DELAY_ONCE;
                if (post.containsKey("reg")) {
                    if (post.get("reg", "").equals("self_det")) {
                        frequency = URLFetcher.DELAY_SELF_DET;
                    } else if (post.get("reg", "").equals("delay")) {
                        frequency = getDate(post.get("frequency", ""), post.get("freq_type", ""));
                        if (frequency == -1)
                            prop.put("freqError", ERR_DATE);
                    }
                }

                int count = 50;
                if (post.get("amount", "").matches("\\d+")) {
                    count = Integer.parseInt(post.get("amount", ""));
                    if (count > 999) count = 999;
                }

                if (fetcher != null) fetcher.interrupt();
                fetcher = null;
                if (post.get("source", "").equals("peer") &&
                    post.get("peerhash", "").equals("random")) {
                    fetcher = new URLFetcher(
                            env,
                            getCrawlProfile(env),
                            count,
                            frequency);
                } else {
                    yacyURL url = null;
                    if (post.get("source", "").equals("url")) {
                        try {
                            url = new yacyURL(post.get("host", null), null);
                            if (!savedURLs.contains(url.toNormalform(true, true)))
                                savedURLs.add(url.toNormalform(true, true));
                            prop.put("host", post.get("host", url.toString()));
                        } catch (MalformedURLException e) {
                            prop.put("host", post.get("host", ""));
                            prop.put("hostError", ERR_HOST_MALFORMED_URL);
                        }
                    } else if (post.get("source", "").equals("savedURL")) {
                        try {
                            url = new yacyURL(post.get("saved", ""), null);
                        } catch (MalformedURLException e) {
                            /* should never happen, except for invalid input, see above */
                        }
                    } else if (post.get("source", "").equals("peer")) {
                        yacySeed ys = null;
                        ys = yacyCore.seedDB.get(post.get("peerhash", null));
                        if (ys != null) {
                            if ((url = URLFetcher.getListServletURL(
                                    ys.getPublicAddress(),
                                    URLFetcher.MODE_LIST,
                                    count,
                                    yacyCore.seedDB.mySeed().hash)) == null) {
                                prop.put("peerError", ERR_PEER_GENERAL_CONN);
                                prop.put("peerError_hash", post.get("peerhash", ""));
                                prop.put("peerError_name", ys.getName());
                            }
                        } else {
                            prop.put("peerError", ERR_PEER_OFFLINE);
                            prop.put("peerError_hash", post.get("peerhash", ""));
                        }
                    }

                    if (url != null) {
                        fetcher = new URLFetcher(
                                env,
                                getCrawlProfile(env),
                                url,
                                count,
                                frequency);
                    }
                }
                if (fetcher != null) fetcher.start();
            }
            else if (post.containsKey("stop")) {
                if (fetcher != null) {
                    fetcher.interrupt();
                } else {
                    prop.put("threadError", ERR_THREAD_STOP);
                }
            }
            else if (post.containsKey("restart")) {
                if (fetcher != null) {
                    fetcher.interrupt();
                    if (fetcher.url == null) {
                        fetcher = new URLFetcher(
                                env,
                                getCrawlProfile(env),
                                fetcher.count,
                                fetcher.delay);
                    } else {
                        fetcher = new URLFetcher(
                                env,
                                getCrawlProfile(env),
                                fetcher.url,
                                fetcher.count,
                                fetcher.delay);
                    }
                    fetcher.start();
                } else {
                    prop.put("threadError", ERR_THREAD_RESUME);
                }
            }
            else if (post.containsKey("resetDelay")) {
                final long frequency = getDate(post.get("newDelay", ""), "minutes");
                if (frequency == -1) {
                    prop.put("freqError", ERR_DATE);
                } else if (fetcher != null) { // the thread may not have been started yet
                    fetcher.delay = frequency;
                }
            }
            prop.put("LOCATION", "/CrawlURLFetch_p.html");
        }

        if (fetcher != null) {
            prop.put("runs", "1");
            prop.put("runs_status",
                    ((fetcher.paused && fetcher.isAlive()) ? STAT_THREAD_PAUSED :
                    (fetcher.isAlive()) ? STAT_THREAD_ALIVE : STAT_THREAD_STOPPED));
            prop.putNum("runs_totalRuns", URLFetcher.totalRuns);
            prop.putNum("runs_totalFetchedURLs", URLFetcher.totalFetchedURLs);
            prop.putNum("runs_totalFailedURLs", URLFetcher.totalFailed);
            prop.putNum("runs_lastRun", fetcher.lastRun);
            prop.putNum("runs_lastFetchedURLs", fetcher.lastFetchedURLs);
            prop.put("runs_lastServerResponse", (fetcher.lastServerResponse == null)
                    ? "" : fetcher.lastServerResponse);
            prop.putNum("runs_curDelay", (int) (fetcher.delay / 60000));

            Iterator it = fetcher.failed.keySet().iterator();
            int i = 0;
            Object key;
            while (it.hasNext()) {
                key = it.next();
                prop.put("runs_error_" + i + "_reason", fetcher.failed.get(key));
                prop.put("runs_error_" + i + "_url", key.toString()); // keys are yacyURL instances
                i++;
            }
            prop.put("runs_error", i);
        }

        return prop;
    }

    private static int listURLs(serverObjects prop) {
        if (savedURLs.size() == 0) return 0;
        prop.put("saved", "1");
        for (int i = 0; i < savedURLs.size(); i++)
            prop.put("saved_urls_" + i + "_url", savedURLs.get(i));
        prop.putNum("saved_urls", savedURLs.size());
        return savedURLs.size();
    }

    private static int listPeers(serverObjects prop, boolean checkURLCount, httpRemoteProxyConfig theRemoteProxyConfig) {
        int peerCount = 0;
        TreeMap hostList = new TreeMap();
        String peername;
        if (yacyCore.seedDB != null && yacyCore.seedDB.sizeConnected() > 0) {
            final Iterator e = yacyCore.seedDB.seedsConnected(true, false, null, yacyVersion.YACY_PROVIDES_CRAWLS_VIA_LIST_HTML);
            int dbsize;
            while (e.hasNext()) {
                yacySeed seed = (yacySeed) e.next();
                if (seed != null && !seed.hash.equals(yacyCore.seedDB.mySeed().hash)) {
                    peername = seed.get(yacySeed.NAME, "nameless");
                    if (checkURLCount && (dbsize = getURLs2Fetch(seed, theRemoteProxyConfig)) > 0) {
                        hostList.put(peername + " (" + dbsize + ")", seed.hash);
                    } else {
                        hostList.put(peername, seed.hash);
                    }
                }
            }
        }

        if (hostList.size() > 0) {
            while (!hostList.isEmpty() && (peername = (String) hostList.firstKey()) != null) {
                final String hash = (String) hostList.get(peername);
                prop.put("peersKnown_peers_" + peerCount + "_hash", hash);
                prop.put("peersKnown_peers_" + peerCount + "_name", peername);
                hostList.remove(peername);
                peerCount++;
            }
            prop.put("peersKnown_peers", peerCount);
            prop.put("peersKnown", "1");
        } else {
            prop.put("peersKnown", "0");
        }
        return peerCount;
    }

    private static int getURLs2Fetch(yacySeed seed, httpRemoteProxyConfig theRemoteProxyConfig) {
        try {
            String answer = new String(httpc.wget(
                    URLFetcher.getListServletURL(seed.getPublicAddress(), URLFetcher.MODE_COUNT, 0, null),
                    seed.getIP(),
                    5000,
                    null, null,
                    theRemoteProxyConfig,
                    null,
                    null));
            if (answer.matches("\\d+"))
                return Integer.parseInt(answer);
            else {
                serverLog.logFine("URLFETCHER", "Retrieved invalid answer from " + seed.getName() + ": '" + answer + "'");
                return -1;
            }
        } catch (MalformedURLException e) {
            /* should not happen */
            return -3;
        } catch (IOException e) {
            return -2;
        }
    }

    private static long getDate(String count, String type) {
        long r = 0;
        if (count != null && count.matches("\\d+")) r = Long.parseLong(count);
        if (r < 1) return -1;

        r *= 60000; // value is given in minutes, convert to milliseconds
        if (type.equals("days")) return r * 60 * 24;
        else if (type.equals("hours")) return r * 60;
        else if (type.equals("minutes")) return r;
        else return -1;
    }

    public static class URLFetcher extends Thread {

        public static final long DELAY_ONCE = -1;
        public static final long DELAY_SELF_DET = 0;

        public static final int MODE_LIST = 0;
        public static final int MODE_COUNT = 1;

        public static int totalRuns = 0;
        public static int totalFetchedURLs = 0;
        public static int totalFailed = 0;

        public final HashMap failed = new HashMap();

        public int lastFetchedURLs = 0;
        public long lastRun = 0;
        public String lastServerResponse = null;
        public int lastFailed = 0;

        public final yacyURL url;
        public final int count;
        public long delay;
        public final plasmaSwitchboard sb;
        public final plasmaCrawlProfile.entry profile;

        public boolean paused = false;

        public static yacyURL getListServletURL(String host, int mode, int count, String peerHash) {
            String r = "http://" + host + "/yacy/list.html?list=queueUrls&display=";

            switch (mode) {
                case MODE_LIST: r += "list"; break;
                case MODE_COUNT: r += "count"; break;
            }

            if (count > 0) r += "&count=" + count;

            if (peerHash != null && peerHash.length() > 0) {
                r += "&iam=" + peerHash;
            } else if (mode == MODE_LIST) {
                r += "&iam=" + yacyCore.seedDB.mySeed().hash;
            }

            try {
                return new yacyURL(r, null);
            } catch (MalformedURLException e) {
                return null;
            }
        }

        public URLFetcher(
                serverSwitch env,
                plasmaCrawlProfile.entry profile,
                yacyURL url,
                int count,
                long delayMs) {
            if (env == null || profile == null || url == null)
                throw new NullPointerException("env, profile or url must not be null");
            this.sb = (plasmaSwitchboard) env;
            this.profile = profile;
            this.url = url;
            this.count = count;
            this.delay = delayMs;
            this.setName("URLFetcher");
        }

        public URLFetcher(
                serverSwitch env,
                plasmaCrawlProfile.entry profile,
                int count,
                long delayMs) {
            if (env == null || profile == null)
                throw new NullPointerException("env or profile must not be null");
            this.sb = (plasmaSwitchboard) env;
            this.profile = profile;
            this.url = null;
            this.count = count;
            this.delay = delayMs;
            this.setName("URLFetcher");
        }

        public void run() {
            this.paused = false;
            long start;
            yacyURL url;
            while (!isInterrupted()) {
                try {
                    start = System.currentTimeMillis();
                    url = getDLURL();
                    if (url == null) {
                        serverLog.logSevere(this.getName(), "canceled because no valid URL for the URL-list could be determined");
                        return;
                    }
                    totalFetchedURLs += stackURLs(getURLs(url));
                    this.lastRun = System.currentTimeMillis() - start;
                    totalRuns++;
                    serverLog.logInfo(this.getName(), "Loaded " + this.lastFetchedURLs + " URLs from " + url + " in " + this.lastRun + " ms into stackcrawler.");
                    if (this.delay < 0 || isInterrupted()) {
                        return;
                    } else synchronized (this) {
                        if (this.delay == 0) {
                            this.paused = true;
                            while (this.paused) this.wait();
                        } else {
                            this.paused = true;
                            this.wait(this.delay);
                        }
                    }
                    this.paused = false;
                } catch (InterruptedException e) { return; }
            }
        }

        private yacyURL getDLURL() {
            if (this.url != null) return this.url;

            // choose a random seed
            yacySeed ys = null;
            Iterator e = yacyCore.seedDB.seedsConnected(true, false, null, yacyVersion.YACY_PROVIDES_CRAWLS_VIA_LIST_HTML);
            int num = new Random().nextInt(yacyCore.seedDB.sizeConnected()) + 1;
            Object o;
            for (int i = 0; i < num && e.hasNext(); i++) {
                o = e.next();
                if (o != null) ys = (yacySeed) o;
            }
            if (ys == null) return null;

            return getListServletURL(ys.getPublicAddress(), MODE_LIST, this.count, yacyCore.seedDB.mySeed().hash);
        }

        private int stackURLs(ArrayList /* of yacyURL */ urls) {
            this.lastFailed = 0;
            this.lastFetchedURLs = 0;
            this.failed.clear();

            if (urls == null) return 0;
            String reason;
            yacyURL url;
            for (int i = 0; i < urls.size() && !isInterrupted(); i++) {
                url = (yacyURL) urls.get(i);
                reason = this.sb.crawlStacker.stackCrawl(
                        url,
                        null,
                        yacyCore.seedDB.mySeed().hash,
                        null,
                        new Date(),
                        this.profile.generalDepth(),
                        this.profile);
                if (reason == null) {
                    serverLog.logFine(this.getName(), "stacked " + url);
                    this.lastFetchedURLs++;
                } else {
                    serverLog.logFine(this.getName(), "error on stacking " + url + ": " + reason);
                    this.lastFailed++;
                    totalFailed++;
                    this.failed.put(url, reason);
                    plasmaCrawlZURL.Entry ee = this.sb.crawlQueues.errorURL.newEntry(
                            url,
                            reason);
                    ee.store();
                    this.sb.crawlQueues.errorURL.push(ee);
                }
            }
            return this.lastFetchedURLs;
        }

        private ArrayList /* of yacyURL */ getURLs(yacyURL url) {
            if (url == null) return null;
            ArrayList a = new ArrayList();
            try {
                httpc con = new httpc(
                        url.getHost(),
                        url.getHost(),
                        url.getPort(),
                        15000,
                        url.getProtocol().equals("https"),
                        plasmaSwitchboard.getSwitchboard().remoteProxyConfig, null, null);

                httpHeader header = new httpHeader();
                header.put(httpHeader.ACCEPT_ENCODING, "US-ASCII");
                header.put(httpHeader.HOST, url.getHost());

                httpc.response res = con.GET(url.getPath() + "?" + url.getQuery(), header);
                serverLog.logFine(this.getName(), "downloaded URL-list from " + url + " (" + res.statusCode + ")");
                this.lastServerResponse = res.statusCode + " (" + res.statusText + ")";
                if (res.status.startsWith("2")) {
                    serverByteBuffer sbb = new serverByteBuffer();
                    res.writeContent(sbb, null);
                    String encoding = res.responseHeader.getCharacterEncoding();

                    if (encoding == null) encoding = "US-ASCII";
                    String[] s = (new String(sbb.getBytes(), encoding)).split("\n");
                    for (int i = 0; i < s.length; i++) {
                        try {
                            a.add(new yacyURL(s[i], null));
                        } catch (MalformedURLException e) { /* skip invalid lines */ }
                    }
                }
                con.close();
            } catch (IOException e) { /* return what was collected so far */ }
            return a;
        }

    }
}
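getListServletURL above fixed the URL shape of the list-based transfer that this commit retires. For a peer whose public address and hash are the placeholders below, the two modes yield the following requests; this is a standalone, illustrative snippet, not project code:

    // URL shapes produced by URLFetcher.getListServletURL (values are made up).
    public class ListURLSketch {
        public static void main(String[] args) {
            String host = "peer.example.net:8080"; // public address of the remote peer (assumption)
            String iam  = "myPeerHash";            // requesting peer's hash (assumption)

            // MODE_LIST: pop up to 50 URLs from the remote fetcher stack
            System.out.println("http://" + host + "/yacy/list.html?list=queueUrls&display=list&count=50&iam=" + iam);

            // MODE_COUNT: only report the current stack size
            System.out.println("http://" + host + "/yacy/list.html?list=queueUrls&display=count");
        }
    }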
@@ -1,7 +0,0 @@
<div class="SubMenu">
<h3>URL Fetcher Menu</h3>
<ul class="SubMenu">
<li><a href="/CrawlURLFetch_p.html" class="MenuItemLink lock">URL Fetcher</a></li>
<li><a href="/CrawlURLFetchStack_p.html" class="MenuItemLink lock">URL Stack</a></li>
</ul>
</div>
@@ -0,0 +1,30 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>YaCy '#[clientname]#': Index Control</title>
#%env/templates/metas.template%#
</head>
<body id="IndexControl">
#%env/templates/header.template%#
<h2>remote crawl fetch test</h2>

<form name="selection" action="rct_p.html" method="post" enctype="multipart/form-data">
<fieldset><legend>Retrieve remote crawl URL list</legend>
<dl>
<dt class="TableCellDark">Target Peer:</dt>
<dd>select <select name="peer">
#{hosts}#
<option value="#[hosthash]#">#[hostname]#</option>
#{/hosts}#
</select>
</dd>
<dt class="TableCellLight"></dt>
<dd><input type="submit" name="retrieve" value="retrieve" />
</dd>
</dl>
</fieldset>
</form>

#%env/templates/footer.template%#
</body>
</html>
@@ -0,0 +1,124 @@
// rct_p.java
// -----------------------
// (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 28.11.2007 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2007-11-14 01:15:28 +0000 (Mi, 14 Nov 2007) $
// $LastChangedRevision: 4216 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

import java.net.MalformedURLException;
import java.text.ParseException;
import java.util.Date;
import java.util.Iterator;

import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverDate;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.xml.rssReader;
import de.anomic.yacy.yacyClient;
import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacySeed;
import de.anomic.yacy.yacyURL;

public class rct_p {

    public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) {
        // return variable that accumulates replacements
        plasmaSwitchboard sb = (plasmaSwitchboard) env;
        serverObjects prop = new serverObjects();

        if (post != null) {
            if (post.containsKey("retrieve")) {
                String peerhash = post.get("peer", null);
                yacySeed seed = (peerhash == null) ? null : yacyCore.seedDB.getConnected(peerhash);
                rssReader reader = (seed == null) ? null : yacyClient.queryRemoteCrawlURLs(seed, 10);
                if (reader != null) {
                    rssReader.Item item;
                    for (int i = 0; i < reader.items(); i++) {
                        item = reader.getItem(i);

                        // put the url on the remote crawl stack
                        yacyURL url;
                        try {
                            url = new yacyURL(item.getLink(), null);
                        } catch (MalformedURLException e) {
                            url = null;
                        }
                        if (url == null) continue; // skip items without a parseable link
                        Date loaddate;
                        try {
                            loaddate = serverDate.parseShortSecondTime(item.getPubDate());
                        } catch (ParseException e) {
                            loaddate = new Date();
                        }
                        yacyURL referrer = null; // referrer needed!
                        if (sb.acceptURL(url)) {
                            // stack url
                            sb.getLog().logFinest("crawlOrder: stack: url='" + url + "'");
                            String reasonString = sb.crawlStacker.stackCrawl(url, referrer, peerhash, "REMOTE-CRAWLING", loaddate, 0, sb.defaultRemoteProfile);

                            if (reasonString == null) {
                                // done
                                env.getLog().logInfo("crawlOrder: added remote crawl url: " + url.toNormalform(true, false));
                            } else if (reasonString.startsWith("double")) {
                                // case where we already have the url loaded
                                env.getLog().logInfo("crawlOrder: ignored double remote crawl url: " + url.toNormalform(true, false));
                            } else {
                                env.getLog().logInfo("crawlOrder: ignored [" + reasonString + "] remote crawl url: " + url.toNormalform(true, false));
                            }
                        } else {
                            env.getLog().logWarning("crawlOrder: Received URL outside of our domain: " + url.toNormalform(true, false));
                        }
                    }
                }
            }
        }

        listHosts(prop);

        // return rewrite properties
        return prop;
    }

    private static void listHosts(serverObjects prop) {
        // list known hosts
        yacySeed seed;
        int hc = 0;
        if (yacyCore.seedDB != null && yacyCore.seedDB.sizeConnected() > 0) {
            Iterator e = yacyCore.dhtAgent.getProvidesRemoteCrawlURLs();
            while (e.hasNext()) {
                seed = (yacySeed) e.next();
                if (seed != null) {
                    prop.put("hosts_" + hc + "_hosthash", seed.hash);
                    prop.putHTML("hosts_" + hc + "_hostname", seed.hash + " " + seed.get(yacySeed.NAME, "nameless") + " (" + seed.getLong(yacySeed.RCOUNT, 0) + ")");
                    hc++;
                }
            }
            prop.put("hosts", hc);
        } else {
            prop.put("hosts", "0");
        }
    }

}
@@ -1 +0,0 @@
#[list]#
@@ -1,152 +0,0 @@
// list.java
// -----------------------
// part of YaCy
// (C) by Michael Peter Christen; mc@anomic.de
// first published on http://www.anomic.de
// Frankfurt, Germany, 2004
//
// This file is contributed by Alexander Schier
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this software or this documentation. The usage of this software
// is on your own risk. The installation and usage (starting/running) of this
// software may allow other people or application to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
//
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follow this copyright notice here, but changes must not be
// done inside the copyright notice above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.

// You must compile this file with
// javac -classpath .:../../classes list.java
// if the shell's current path is HTROOT

// contains contributions by [FB] to support listing URLs for the URL Fetcher

import java.io.File;

import de.anomic.data.URLFetcherStack;
import de.anomic.data.htmlTools;
import de.anomic.data.listManager;
import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverCore;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacyNetwork;
import de.anomic.yacy.yacySeed;
import de.anomic.yacy.yacyURL;

public final class list {

    public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) {
        if (post == null || env == null)
            throw new NullPointerException("post: " + post + ", sb: " + env);
        plasmaSwitchboard sb = (plasmaSwitchboard) env;

        // return variable that accumulates replacements
        final serverObjects prop = new serverObjects();
        if (!yacyNetwork.authentifyRequest(post, env)) return prop;

        final String col = post.get("col", "");
        final File listsPath = env.getConfigPath(plasmaSwitchboard.LISTS_PATH, plasmaSwitchboard.LISTS_PATH_DEFAULT);

        String otherPeerName = null;
        if (post.containsKey("iam")) {
            yacySeed bla = yacyCore.seedDB.get(post.get("iam", ""));
            if (bla != null) otherPeerName = bla.getName();
        }
        if (otherPeerName == null) otherPeerName = (String) header.get(httpHeader.CONNECTION_PROP_CLIENTIP);

        if ((sb.isRobinsonMode()) && (!sb.isInMyCluster(otherPeerName))) {
            // if we are a robinson cluster, answer only if this client is known by our network definition
            return null;
        }

        if (col.equals("black")) {
            final StringBuffer out = new StringBuffer();

            final String filenames = env.getConfig("BlackLists.Shared", "");
            final String[] filenamesarray = filenames.split(",");

            if (filenamesarray.length > 0) {
                for (int i = 0; i < filenamesarray.length; i++) {
                    String filename = filenamesarray[i];
                    File fileObj = new File(listsPath, filename);
                    out.append(listManager.getListString(fileObj, false))
                       .append(serverCore.crlfString);
                }
            } // if filenamesarray.length > 0

            prop.put("list", out.toString());
        }
        // start contrib by [FB]
        else if (col.length() == 0 && post.get("list", "").equals("queueUrls")) {
            final URLFetcherStack db = CrawlURLFetchStack_p.getURLFetcherStack(env);
            final String display = post.get("display", "list");
            if (display.equals("list")) {
                // list urls from the remote crawler queue for other peers
                final int count = Math.min(post.getInt("count", 50), CrawlURLFetchStack_p.maxURLsPerFetch);

                if (count > 0 && db.size() > 0) {
                    final StringBuffer b = new StringBuffer();

                    yacyURL url;
                    int cnt = 0;
                    for (int i = 0; i < count; i++) {
                        if ((url = db.pop()) == null) continue;
                        b.append(htmlTools.decodeHtml2Unicode(url.toNormalform(false, true))).append("\n");
                        cnt++;
                    }
                    prop.put("list", b.toString());
                    CrawlURLFetchStack_p.fetchMap.put(
                            otherPeerName,
                            new Integer(((CrawlURLFetchStack_p.fetchMap.get(otherPeerName) == null)
                                    ? 0
                                    : ((Integer) CrawlURLFetchStack_p.fetchMap.get(otherPeerName)).intValue()) + cnt));
                    serverLog.logInfo("URLFETCHER", "sent " + cnt + " URLs to " + otherPeerName);
                } else {
                    prop.put("list", "");
                    serverLog.logInfo("URLFETCHER", "couldn't satisfy URL request from " + otherPeerName + ": stack is empty");
                }
            } else if (display.equals("count")) {
                prop.put("list", db.size());
            }
            // end contrib by [FB]
        } else {
            prop.put("list", "");
        }

        return prop;
    }
}
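Since the deleted list.html template is nothing but the #[list]# placeholder, the servlet's "list" property is the entire response body: newline-separated URLs for display=list, a bare number for display=count. Note that display=list pops URLs off the stack, so every pull is destructive. A minimal consumer sketch; host, port and peer hash are placeholders:

    import java.io.BufferedReader;
    import java.io.InputStreamReader;
    import java.net.URL;

    // Read the plain-text body produced by list.java plus the #[list]# template.
    public class ListClientSketch {
        public static void main(String[] args) throws Exception {
            URL u = new URL("http://peer.example.net:8080/yacy/list.html?list=queueUrls&display=list&count=10&iam=myPeerHash");
            BufferedReader in = new BufferedReader(new InputStreamReader(u.openStream(), "US-ASCII"));
            String line;
            while ((line = in.readLine()) != null) {
                if (line.length() > 0) System.out.println("fetched: " + line); // one URL per line
            }
            in.close();
        }
    }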