- added experimental CrawlURLFetch_p-Servlet to fetch new URLs from a specified location (\n-separated list). Requested by Theli.
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3361 6c8d7289-2bf4-0310-a012-ef5d649a1542pull/1/head
parent
6c6375577e
commit
50b59e312f
@ -0,0 +1,69 @@
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
|
||||
<html xmlns="http://www.w3.org/1999/xhtml">
|
||||
<head>
|
||||
<title>YaCy '#[clientname]#': URL-Fetcher</title>
|
||||
#%env/templates/metas.template%#
|
||||
</head>
|
||||
<body id="CrawlURLFetch_p">
|
||||
#%env/templates/header.template%#
|
||||
<h2>URL-Fetcher</h2>
|
||||
<p><!-- desc --></p>
|
||||
<form method="post" action="CrawlURLFetch_p.html" enctype="multipart/form-data">
|
||||
<p>
|
||||
All newly added URLs will be crawled using the <span class="tt">Proxy</span> Crawl Profile.
|
||||
</p>
|
||||
<fieldset><legend>Fetch new URLs to crawl</legend>
|
||||
<dl>
|
||||
<dt><label for="url">Fetch from URL</label>:</dt>
|
||||
<dd>
|
||||
<input type="radio" name="source" value="url" id="url" checked="checked" />
|
||||
<input type="text" id="host" name="host" size="60" value="#[host]#" />
|
||||
#(hostError)#::<span class="error">Malformed URL</span>#(/hostError)#
|
||||
</dd>
|
||||
#(peersKnown)#::
|
||||
<dt><label for="peer">Fetch from Peer</label>:</dt>
|
||||
<dd>
|
||||
<input type="radio" name="source" value="peer" id="peer" disabled="disabled" />
|
||||
<select name="peerhash" disabled="disabled">#{peers}#
|
||||
<option value="#[hash]#">#[name]#</option>#{/peers}#
|
||||
</select>
|
||||
#(peerError)#::<span class="error">Error fetching URL-list from peer</span>#(/peerError)#
|
||||
</dd>#(/peersKnown)#
|
||||
<dt><label for="type">List-type</label>:</dt>
|
||||
<dd>
|
||||
<select name="type" id="type">
|
||||
<option value="text">Text</option>
|
||||
<option value="xml" disabled="disabled">XML</option>
|
||||
</select>
|
||||
</dd>
|
||||
<dt><label for="regularly">Run regularly</label>:</dt>
|
||||
<dd>
|
||||
<input type="checkbox" name="regularly" id="regularly" disabled="disabled" />,
|
||||
<label for="frequency">every</label>
|
||||
<input type="text" name="frequency" id="frequency" style="text-align: left;" size="5" disabled="disabled" />
|
||||
<select name="freq_type" disabled="disabled">
|
||||
<option value="weeks">Weeks</option>
|
||||
<option value="days" selected="selected">Days</option>
|
||||
<option value="hours">Hours</option>
|
||||
</select>
|
||||
</dd>
|
||||
<dt><input type="submit" name="start" value="Fetch URLs" /></dt>
|
||||
</dl>
|
||||
</fieldset>
|
||||
#(runs)#::
|
||||
<fieldset><legend>Thread to fetch URLs is #(status)#running::stopped::paused#(/status)#</legend>
|
||||
<dl>
|
||||
<dt>Total runs:</dt><dd>#[totalRuns]#</dd>
|
||||
<dt>Last run duration:</dt><dd>#[lastRun]#</dd>
|
||||
<dt>Last server response:</dt><dd>#[lastServerResponse]#</dd>
|
||||
<dt>Total fetched URLs:</dt><dd>#[totalFetchedURLs]#</dd>
|
||||
<dt>Total failed URLs:</dt><dd>#[totalFailedURLs]#</dd>
|
||||
<dt>Last fetched URLs:</dt><dd>#[lastFetchedURLs]#</dd>
|
||||
<dt><input type="submit" name="stop" value="Stop Thread" /></dt>
|
||||
</dl>
|
||||
</fieldset>
|
||||
#(/runs)#
|
||||
</form>
|
||||
#%env/templates/footer.template%#
|
||||
</body>
|
||||
</html>
|
@ -0,0 +1,252 @@
|
||||
// CrawlURLFetch_p.java
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.MalformedURLException;
|
||||
import java.util.Date;
|
||||
import java.util.Enumeration;
|
||||
import java.util.TreeMap;
|
||||
|
||||
import de.anomic.net.URL;
|
||||
import de.anomic.plasma.plasmaCrawlProfile;
|
||||
import de.anomic.plasma.plasmaSwitchboard;
|
||||
import de.anomic.server.serverSwitch;
|
||||
import de.anomic.http.httpHeader;
|
||||
import de.anomic.http.httpc;
|
||||
import de.anomic.server.serverObjects;
|
||||
import de.anomic.yacy.yacyCore;
|
||||
import de.anomic.yacy.yacySeed;
|
||||
|
||||
public class CrawlURLFetch_p {
|
||||
|
||||
private static URLFetcher fetcher = null;
|
||||
|
||||
public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) {
|
||||
serverObjects prop = new serverObjects();
|
||||
|
||||
prop.put("host", "");
|
||||
|
||||
// List known hosts for message sending
|
||||
if (yacyCore.seedDB != null && yacyCore.seedDB.sizeConnected() > 0) {
|
||||
prop.put("peersKnown", 1);
|
||||
int peerCount = 0;
|
||||
try {
|
||||
TreeMap hostList = new TreeMap();
|
||||
final Enumeration e = yacyCore.seedDB.seedsConnected(true, false, null, (float) 0.0);
|
||||
while (e.hasMoreElements()) {
|
||||
yacySeed seed = (yacySeed) e.nextElement();
|
||||
if (seed != null) hostList.put(seed.get(yacySeed.NAME, "nameless"),seed.hash);
|
||||
}
|
||||
|
||||
String peername;
|
||||
while ((peername = (String) hostList.firstKey()) != null) {
|
||||
final String Hash = (String) hostList.get(peername);
|
||||
prop.put("peersKnown_peers_" + peerCount + "_hash", Hash);
|
||||
prop.put("peersKnown_peers_" + peerCount + "_name", peername);
|
||||
hostList.remove(peername);
|
||||
peerCount++;
|
||||
}
|
||||
} catch (Exception e) { /* no comment :P */ }
|
||||
prop.put("peersKnown_peers", peerCount);
|
||||
} else {
|
||||
prop.put("peersKnown", 0);
|
||||
}
|
||||
|
||||
if (post != null) {
|
||||
if (post.containsKey("start")) {
|
||||
try {
|
||||
|
||||
long frequency = -1;
|
||||
if (post.containsKey("regularly"))
|
||||
frequency = getDate(post.get("frequency", ""), post.get("freq_type", ""));
|
||||
|
||||
String t = post.get("type", "text");
|
||||
int type = -1;
|
||||
if (t.equals("text")) {
|
||||
type = URLFetcher.TYPE_TEXT;
|
||||
} else if (t.equals("xml")) {
|
||||
type = URLFetcher.TYPE_XML;
|
||||
}
|
||||
|
||||
URL url = new URL(post.get("host", null));
|
||||
prop.put("host", post.get("host", ""));
|
||||
|
||||
if (type > -1) {
|
||||
if (frequency > -1) {
|
||||
fetcher = new URLFetcher(
|
||||
env,
|
||||
((plasmaSwitchboard)env).defaultProxyProfile,
|
||||
url,
|
||||
frequency,
|
||||
type);
|
||||
} else { // only fetch once
|
||||
fetcher = new URLFetcher(
|
||||
env,
|
||||
((plasmaSwitchboard)env).defaultProxyProfile,
|
||||
url,
|
||||
type);
|
||||
}
|
||||
fetcher.start();
|
||||
}
|
||||
} catch (MalformedURLException e) {
|
||||
prop.put("host", post.get("host", ""));
|
||||
prop.put("hostError", 1);
|
||||
}
|
||||
} else if (post.containsKey("stop")) {
|
||||
fetcher.interrupt();
|
||||
}
|
||||
}
|
||||
|
||||
if (fetcher != null) {
|
||||
prop.put("runs", 1);
|
||||
prop.put("runs_status", (fetcher.isRunning()) ? 0 : (fetcher.isPaused()) ? 2 : 1);
|
||||
prop.put("runs_totalRuns", URLFetcher.totalRuns);
|
||||
prop.put("runs_totalFetchedURLs", URLFetcher.totalFetchedURLs);
|
||||
prop.put("runs_totalFailedURLs", URLFetcher.totalFailed);
|
||||
prop.put("runs_lastRun", URLFetcher.lastRun);
|
||||
prop.put("runs_lastFetchedURLs", URLFetcher.lastFetchedURLs);
|
||||
prop.put("runs_lastServerResponse", (URLFetcher.lastServerResponse == null)
|
||||
? "" : URLFetcher.lastServerResponse);
|
||||
}
|
||||
|
||||
return prop;
|
||||
}
|
||||
|
||||
private static long getDate(String count, String type) {
|
||||
long r = 0;
|
||||
if (count != null && count.matches("\\d+")) r = Long.parseLong(count);
|
||||
if (r < 1) return -1;
|
||||
|
||||
r *= 3600 * 24;
|
||||
if (type.equals("weeks")) return r * 24 * 7;
|
||||
else if (type.equals("days")) return r * 24;
|
||||
else if (type.equals("hours")) return r;
|
||||
else return -1;
|
||||
}
|
||||
|
||||
public static class URLFetcher extends Thread {
|
||||
|
||||
public static final int TYPE_TEXT = 0;
|
||||
public static final int TYPE_XML = 1;
|
||||
|
||||
public static int lastFetchedURLs = 0;
|
||||
public static long lastRun = 0;
|
||||
public static String lastServerResponse = null;
|
||||
public static int lastFailed = 0;
|
||||
public static int totalRuns = 0;
|
||||
public static int totalFetchedURLs = 0;
|
||||
public static int totalFailed = 0;
|
||||
|
||||
private final URL url;
|
||||
private final long delay;
|
||||
private final int type;
|
||||
private final plasmaSwitchboard sb;
|
||||
private final plasmaCrawlProfile.entry profile;
|
||||
|
||||
private boolean running = false;
|
||||
private boolean paused = false;
|
||||
|
||||
public URLFetcher(
|
||||
serverSwitch env,
|
||||
plasmaCrawlProfile.entry profile,
|
||||
URL url,
|
||||
int type) {
|
||||
this.sb = (plasmaSwitchboard)env;
|
||||
this.profile = profile;
|
||||
this.url = url;
|
||||
this.type = type;
|
||||
this.delay = 0;
|
||||
this.setName("URL-Fetcher");
|
||||
}
|
||||
|
||||
public URLFetcher(
|
||||
serverSwitch env,
|
||||
plasmaCrawlProfile.entry profile,
|
||||
URL url,
|
||||
long delayMs,
|
||||
int type) {
|
||||
this.sb = (plasmaSwitchboard)env;
|
||||
this.profile = profile;
|
||||
this.url = url;
|
||||
this.delay = delayMs;
|
||||
this.type = type;
|
||||
this.setName("URL-Fetcher");
|
||||
}
|
||||
|
||||
public boolean isRunning() { return this.running; }
|
||||
public boolean isPaused() { return this.paused; }
|
||||
|
||||
public void run() {
|
||||
this.running = true;
|
||||
this.paused = false;
|
||||
long start;
|
||||
while (!isInterrupted() && this.delay > 0) {
|
||||
try {
|
||||
start = System.currentTimeMillis();
|
||||
totalFetchedURLs += addURLs();
|
||||
lastRun = System.currentTimeMillis() - start;
|
||||
totalRuns++;
|
||||
this.paused = true;
|
||||
this.wait(this.delay);
|
||||
this.paused = false;
|
||||
} catch (InterruptedException e) { break; }
|
||||
}
|
||||
this.running = false;
|
||||
}
|
||||
|
||||
private int addURLs() throws InterruptedException {
|
||||
String[] urls = getURLs();
|
||||
lastFailed = 0;
|
||||
if (urls == null) return 0;
|
||||
String reason;
|
||||
for (int i=0; i<urls.length; i++) {
|
||||
reason = this.sb.sbStackCrawlThread.stackCrawl(
|
||||
urls[i],
|
||||
null,
|
||||
yacyCore.seedDB.mySeed.hash,
|
||||
"PROXY",
|
||||
new Date(),
|
||||
this.profile.generalDepth(),
|
||||
this.profile);
|
||||
if (reason != null) lastFailed++;
|
||||
}
|
||||
return urls.length;
|
||||
}
|
||||
|
||||
private String[] getURLs() {
|
||||
String[] r = null;
|
||||
try {
|
||||
httpc con = httpc.getInstance(
|
||||
this.url.getHost(),
|
||||
this.url.getHost(),
|
||||
this.url.getPort(),
|
||||
15000,
|
||||
this.url.getProtocol().equals("https"));
|
||||
|
||||
httpHeader header = new httpHeader();
|
||||
header.put(httpHeader.ACCEPT_ENCODING, "utf-8");
|
||||
header.put(httpHeader.HOST, this.url.getHost());
|
||||
|
||||
httpc.response res = con.GET(this.url.getPath(), header);
|
||||
lastServerResponse = res.statusCode + " (" + res.statusText + ")";
|
||||
System.err.println("LAST RESPONSE: " + lastServerResponse);
|
||||
if (res.status.startsWith("2")) {
|
||||
byte[] cbs = res.writeContent();
|
||||
String encoding = res.responseHeader.getCharacterEncoding();
|
||||
|
||||
if (encoding == null) encoding = "ASCII";
|
||||
switch (this.type) {
|
||||
case TYPE_TEXT: r = parseText(new String(cbs, encoding)); break;
|
||||
// case TYPE_XML: r = parseXML(new String(cbs, encoding));
|
||||
}
|
||||
}
|
||||
con.close();
|
||||
httpc.returnInstance(con);
|
||||
} catch (IOException e) { }
|
||||
return r;
|
||||
}
|
||||
|
||||
private static String[] parseText(String text) {
|
||||
return text.split("\n");
|
||||
}
|
||||
}
|
||||
}
|
Loading…
Reference in new issue