- added experimental CrawlURLFetch_p-Servlet to fetch new URLs from a specified location (\n-separated list). Requested by Theli.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3361 6c8d7289-2bf4-0310-a012-ef5d649a1542
karlchenofhell 18 years ago
parent 6c6375577e
commit 50b59e312f
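
The servlet expects the fetched resource to be a plain-text list with one URL per line, matching the \n-separated format named in the commit message. A hypothetical example of such a list:

    http://www.example.net/some/page.html
    http://www.example.org/index.html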

@@ -0,0 +1,69 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>YaCy '#[clientname]#': URL Fetcher</title>
#%env/templates/metas.template%#
</head>
<body id="CrawlURLFetch_p">
#%env/templates/header.template%#
<h2>URL-Fetcher</h2>
<p><!-- desc --></p>
<form method="post" action="CrawlURLFetch_p.html" enctype="multipart/form-data">
<p>
All newly added URLs will be crawled using the <span class="tt">Proxy</span> Crawl Profile.
</p>
<fieldset><legend>Fetch new URLs to crawl</legend>
<dl>
<dt><label for="url">Fetch from URL</label>:</dt>
<dd>
<input type="radio" name="source" value="url" id="url" checked="checked" />
<input type="text" id="host" name="host" size="60" value="#[host]#" />
#(hostError)#::<span class="error">Malformed URL</span>#(/hostError)#
</dd>
#(peersKnown)#::
<dt><label for="peer">Fetch from Peer</label>:</dt>
<dd>
<input type="radio" name="source" value="peer" id="peer" disabled="disabled" />
<select name="peerhash" disabled="disabled">#{peers}#
<option value="#[hash]#">#[name]#</option>#{/peers}#
</select>
#(peerError)#::<span class="error">Error fetching URL-list from peer</span>#(/peerError)#
</dd>#(/peersKnown)#
<dt><label for="type">List-type</label>:</dt>
<dd>
<select name="type" id="type">
<option value="text">Text</option>
<option value="xml" disabled="disabled">XML</option>
</select>
</dd>
<dt><label for="regularly">Run regularly</label>:</dt>
<dd>
<input type="checkbox" name="regularly" id="regularly" disabled="disabled" />,
<label for="frequency">every</label>
<input type="text" name="frequency" id="frequency" style="text-align: left;" size="5" disabled="disabled" />
<select name="freq_type" disabled="disabled">
<option value="weeks">Weeks</option>
<option value="days" selected="selected">Days</option>
<option value="hours">Hours</option>
</select>
</dd>
<dt><input type="submit" name="start" value="Fetch URLs" /></dt>
</dl>
</fieldset>
#(runs)#::
<fieldset><legend>Thread to fetch URLs is #(status)#running::stopped::paused#(/status)#</legend>
<dl>
<dt>Total runs:</dt><dd>#[totalRuns]#</dd>
<dt>Last run duration:</dt><dd>#[lastRun]#</dd>
<dt>Last server response:</dt><dd>#[lastServerResponse]#</dd>
<dt>Total fetched URLs:</dt><dd>#[totalFetchedURLs]#</dd>
<dt>Total failed URLs:</dt><dd>#[totalFailedURLs]#</dd>
<dt>Last fetched URLs:</dt><dd>#[lastFetchedURLs]#</dd>
<dt><input type="submit" name="stop" value="Stop Thread" /></dt>
</dl>
</fieldset>
#(/runs)#
</form>
#%env/templates/footer.template%#
</body>
</html>
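
The #[...]#, #(...)# and #{...}# markers above are YaCy's servlet-template placeholders: #[key]# is substituted with a value, #(key)#a::b#(/key)# selects an alternative by number, and #{key}#...#{/key}# repeats once per numbered entry. A minimal sketch of how the servlet below fills them, using hypothetical values:

    serverObjects prop = new serverObjects();
    prop.put("host", "http://example.net/urls.txt");  // substituted into #[host]#
    prop.put("peersKnown", 1);                        // selects the second alternative of #(peersKnown)#::...#(/peersKnown)#
    prop.put("peersKnown_peers_0_hash", "someHash");  // first iteration of the #{peers}# loop
    prop.put("peersKnown_peers_0_name", "somePeer");
    prop.put("peersKnown_peers", 1);                  // number of loop iterations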

@@ -0,0 +1,252 @@
// CrawlURLFetch_p.java
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.Date;
import java.util.Enumeration;
import java.util.TreeMap;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlProfile;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverSwitch;
import de.anomic.http.httpHeader;
import de.anomic.http.httpc;
import de.anomic.server.serverObjects;
import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacySeed;
public class CrawlURLFetch_p {
private static URLFetcher fetcher = null;
public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) {
serverObjects prop = new serverObjects();
prop.put("host", "");
// List known hosts for message sending
if (yacyCore.seedDB != null && yacyCore.seedDB.sizeConnected() > 0) {
prop.put("peersKnown", 1);
int peerCount = 0;
try {
final TreeMap hostList = new TreeMap();
final Enumeration e = yacyCore.seedDB.seedsConnected(true, false, null, (float) 0.0);
while (e.hasMoreElements()) {
final yacySeed seed = (yacySeed) e.nextElement();
if (seed != null) hostList.put(seed.get(yacySeed.NAME, "nameless"), seed.hash);
}
// drain the TreeMap in sorted order; firstKey() throws on an empty map, so test isEmpty() instead of comparing against null
while (!hostList.isEmpty()) {
final String peername = (String) hostList.firstKey();
final String hash = (String) hostList.get(peername);
prop.put("peersKnown_peers_" + peerCount + "_hash", hash);
prop.put("peersKnown_peers_" + peerCount + "_name", peername);
hostList.remove(peername);
peerCount++;
}
} catch (Exception e) {
// enumerating the seed DB may fail while peers are updated concurrently; the list simply stays incomplete
}
prop.put("peersKnown_peers", peerCount);
} else {
prop.put("peersKnown", 0);
}
if (post != null) {
if (post.containsKey("start")) {
try {
long frequency = -1;
if (post.containsKey("regularly"))
frequency = getDelayMs(post.get("frequency", ""), post.get("freq_type", ""));
String t = post.get("type", "text");
int type = -1;
if (t.equals("text")) {
type = URLFetcher.TYPE_TEXT;
} else if (t.equals("xml")) {
type = URLFetcher.TYPE_XML;
}
URL url = new URL(post.get("host", null));
prop.put("host", post.get("host", ""));
if (type > -1) {
if (frequency > -1) {
fetcher = new URLFetcher(
env,
((plasmaSwitchboard)env).defaultProxyProfile,
url,
frequency,
type);
} else { // only fetch once
fetcher = new URLFetcher(
env,
((plasmaSwitchboard)env).defaultProxyProfile,
url,
type);
}
fetcher.start();
}
} catch (MalformedURLException e) {
prop.put("host", post.get("host", ""));
prop.put("hostError", 1);
}
} else if (post.containsKey("stop")) {
// guard against a "stop" request arriving before any fetcher was started
if (fetcher != null) fetcher.interrupt();
}
}
if (fetcher != null) {
prop.put("runs", 1);
prop.put("runs_status", (fetcher.isRunning()) ? 0 : (fetcher.isPaused()) ? 2 : 1);
prop.put("runs_totalRuns", URLFetcher.totalRuns);
prop.put("runs_totalFetchedURLs", URLFetcher.totalFetchedURLs);
prop.put("runs_totalFailedURLs", URLFetcher.totalFailed);
prop.put("runs_lastRun", URLFetcher.lastRun);
prop.put("runs_lastFetchedURLs", URLFetcher.lastFetchedURLs);
prop.put("runs_lastServerResponse", (URLFetcher.lastServerResponse == null)
? "" : URLFetcher.lastServerResponse);
}
return prop;
}
/** Converts a count and a unit ("weeks"/"days"/"hours") into a delay in milliseconds, or -1 for invalid input. */
private static long getDelayMs(String count, String type) {
long r = 0;
if (count != null && count.matches("\\d+")) r = Long.parseLong(count);
if (r < 1) return -1;
r *= 3600 * 1000; // milliseconds per hour
if (type.equals("weeks")) return r * 24 * 7;
else if (type.equals("days")) return r * 24;
else if (type.equals("hours")) return r;
else return -1;
}
public static class URLFetcher extends Thread {
public static final int TYPE_TEXT = 0;
public static final int TYPE_XML = 1;
public static int lastFetchedURLs = 0;
public static long lastRun = 0;
public static String lastServerResponse = null;
public static int lastFailed = 0;
public static int totalRuns = 0;
public static int totalFetchedURLs = 0;
public static int totalFailed = 0;
private final URL url;
private final long delay;
private final int type;
private final plasmaSwitchboard sb;
private final plasmaCrawlProfile.entry profile;
private boolean running = false;
private boolean paused = false;
public URLFetcher(
serverSwitch env,
plasmaCrawlProfile.entry profile,
URL url,
int type) {
this.sb = (plasmaSwitchboard)env;
this.profile = profile;
this.url = url;
this.type = type;
this.delay = 0;
this.setName("URL-Fetcher");
}
public URLFetcher(
serverSwitch env,
plasmaCrawlProfile.entry profile,
URL url,
long delayMs,
int type) {
this.sb = (plasmaSwitchboard)env;
this.profile = profile;
this.url = url;
this.delay = delayMs;
this.type = type;
this.setName("URL-Fetcher");
}
public boolean isRunning() { return this.running; }
public boolean isPaused() { return this.paused; }
public void run() {
this.running = true;
this.paused = false;
long start;
// run at least once; repeat only when a refresh delay is configured
do {
try {
start = System.currentTimeMillis();
totalFetchedURLs += addURLs();
lastRun = System.currentTimeMillis() - start;
totalRuns++;
if (this.delay > 0) {
this.paused = true;
Thread.sleep(this.delay); // pause between runs; interrupt() wakes the thread via InterruptedException
this.paused = false;
}
} catch (InterruptedException e) { break; }
} while (!isInterrupted() && this.delay > 0);
this.running = false;
}
private int addURLs() throws InterruptedException {
final String[] urls = getURLs();
lastFailed = 0;
if (urls == null) return 0;
String reason;
for (int i = 0; i < urls.length; i++) {
reason = this.sb.sbStackCrawlThread.stackCrawl(
urls[i],
null,
yacyCore.seedDB.mySeed.hash,
"PROXY",
new Date(),
this.profile.generalDepth(),
this.profile);
if (reason != null) lastFailed++; // stackCrawl returns an error description, or null on success
}
totalFailed += lastFailed;
lastFetchedURLs = urls.length - lastFailed;
return urls.length;
}
private String[] getURLs() {
String[] r = null;
try {
httpc con = httpc.getInstance(
this.url.getHost(),
this.url.getHost(),
this.url.getPort(),
15000,
this.url.getProtocol().equals("https"));
httpHeader header = new httpHeader();
header.put(httpHeader.ACCEPT_CHARSET, "utf-8"); // Accept-Charset asks for a UTF-8 body; Accept-Encoding would negotiate compression
header.put(httpHeader.HOST, this.url.getHost());
httpc.response res = con.GET(this.url.getPath(), header);
lastServerResponse = res.statusCode + " (" + res.statusText + ")";
System.err.println("LAST RESPONSE: " + lastServerResponse);
if (res.status.startsWith("2")) {
byte[] cbs = res.writeContent();
String encoding = res.responseHeader.getCharacterEncoding();
if (encoding == null) encoding = "ASCII";
switch (this.type) {
case TYPE_TEXT: r = parseText(new String(cbs, encoding)); break;
// case TYPE_XML: r = parseXML(new String(cbs, encoding));
}
}
con.close();
httpc.returnInstance(con);
} catch (IOException e) {
// network failure: r stays null, which the caller treats as "no URLs fetched"
}
return r;
}
private static String[] parseText(String text) {
// one URL per line; tolerate CRLF as well as plain LF line endings
return text.split("\r?\n");
}
}
}
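
As a sanity check on the unit conversion in getDelayMs, a standalone sketch with hypothetical values (not part of the commit): the base factor is milliseconds per hour, scaled up for days and weeks.

    // standalone check of the conversion used by getDelayMs
    public class DelayCheck {
        public static void main(String[] args) {
            final long msPerHour = 3600L * 1000;
            System.out.println(3 * msPerHour);          // "3 hours" -> 10800000 ms
            System.out.println(2 * msPerHour * 24);     // "2 days"  -> 172800000 ms
            System.out.println(1 * msPerHour * 24 * 7); // "1 week"  -> 604800000 ms
        }
    }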