From 50b59e312fe4b2d0a2df92ad4155d3331760d2b7 Mon Sep 17 00:00:00 2001
From: karlchenofhell
Date: Sat, 10 Feb 2007 20:20:00 +0000
Subject: [PATCH] - added experimental CrawlURLFetch_p servlet to fetch new
 URLs from a specified location (\n-separated list). Requested by Theli.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3361 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
 htroot/CrawlURLFetch_p.html |  69 ++++++++++
 htroot/CrawlURLFetch_p.java | 252 ++++++++++++++++++++++++++++++++++++
 2 files changed, 321 insertions(+)
 create mode 100644 htroot/CrawlURLFetch_p.html
 create mode 100644 htroot/CrawlURLFetch_p.java
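The fetch source is expected to return a plain list of absolute URLs, one per line (\n-separated). As an illustration of that input format only - the URLs, class and variable names below are hypothetical, and java.net.URL is used instead of YaCy's de.anomic.net.URL to keep the snippet self-contained - splitting and validating such a list could look like this:

    import java.net.MalformedURLException;
    import java.net.URL;
    import java.util.ArrayList;
    import java.util.List;

    public class UrlListSketch {
        public static void main(String[] args) {
            // A fetched response body in the expected \n-separated format (example data).
            String body = "http://example.net/index.html\n"
                        + "http://example.org/links.html\n"
                        + "not-a-url";
            List<URL> urls = new ArrayList<URL>();
            int failed = 0;
            for (String line : body.split("\n")) {
                line = line.trim();
                if (line.length() == 0) continue;   // skip blank lines
                try {
                    urls.add(new URL(line));        // keep well-formed URLs
                } catch (MalformedURLException e) {
                    failed++;                       // count malformed entries
                }
            }
            System.out.println(urls.size() + " URLs parsed, " + failed + " rejected");
        }
    }

The servlet added below reports such per-run counts as fetched and failed URLs.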
diff --git a/htroot/CrawlURLFetch_p.html b/htroot/CrawlURLFetch_p.html
new file mode 100644
index 000000000..cc238d6ca
--- /dev/null
+++ b/htroot/CrawlURLFetch_p.html
@@ -0,0 +1,69 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+  <head>
+    <title>YaCy '#[clientname]#': Local Cache Management</title>
+    #%env/templates/metas.template%#
+  </head>
+  <body>
+    #%env/templates/header.template%#
+    <h2>URL-Fetcher</h2>
+
+    <p>
+      All newly added URLs will be crawled using the Proxy Crawl Profile.
+    </p>
+
+    <form action="CrawlURLFetch_p.html" method="post" enctype="multipart/form-data">
+      <fieldset>
+        <legend>Fetch new URLs to crawl</legend>
+        <dl>
+          <dt><label for="host">URL to fetch the list from</label>:</dt>
+          <dd>
+            <input type="text" name="host" id="host" value="#[host]#" size="60" />
+            #(hostError)#::Malformed URL#(/hostError)#
+          </dd>
+          #(peersKnown)#::
+          <dt><label for="peer">Peer to fetch the list from</label>:</dt>
+          <dd>
+            <select name="peer" id="peer">
+              #{peers}#<option value="#[hash]#">#[name]#</option>#{/peers}#
+            </select>
+            #(peerError)#::Error fetching URL-list from peer#(/peerError)#
+          </dd>
+          #(/peersKnown)#
+          <dt><label for="type">Type of the URL-list</label>:</dt>
+          <dd>
+            <select name="type" id="type">
+              <option value="text">text</option>
+              <option value="xml">xml</option>
+            </select>
+          </dd>
+          <dt><label for="regularly">Fetch regularly</label>:</dt>
+          <dd>
+            <input type="checkbox" name="regularly" id="regularly" />
+            every <input type="text" name="frequency" id="frequency" size="4" />
+            <select name="freq_type">
+              <option value="weeks">weeks</option>
+              <option value="days">days</option>
+              <option value="hours">hours</option>
+            </select>
+          </dd>
+        </dl>
+        <input type="submit" name="start" value="Fetch URLs" />
+        <input type="submit" name="stop" value="Stop fetching" />
+      </fieldset>
+    </form>
+
+    #(runs)#::
+    <p>Thread to fetch URLs is #(status)#running::stopped::paused#(/status)#</p>
+    <table border="0">
+      <tr><td>Total runs:</td><td>#[totalRuns]#</td></tr>
+      <tr><td>Last run duration:</td><td>#[lastRun]#</td></tr>
+      <tr><td>Last server response:</td><td>#[lastServerResponse]#</td></tr>
+      <tr><td>Total fetched URLs:</td><td>#[totalFetchedURLs]#</td></tr>
+      <tr><td>Total failed URLs:</td><td>#[totalFailedURLs]#</td></tr>
+      <tr><td>Last fetched URLs:</td><td>#[lastFetchedURLs]#</td></tr>
+    </table>
+    #(/runs)#
+    #%env/templates/footer.template%#
+  </body>
+</html>
\ No newline at end of file
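The form above lets the user re-fetch the list periodically (the "regularly" checkbox plus "frequency" and "freq_type"); the servlet below turns those fields into a delay for its fetcher thread via getDate(). The following standalone sketch only illustrates that idea - the class and method names are made up, the interval is converted to milliseconds for Thread.sleep, and it is not the patch's URLFetcher implementation:

    public class PeriodicFetchSketch {
        // Convert a count plus unit (the form's frequency/freq_type) into milliseconds.
        static long intervalMillis(long count, String unit) {
            long hour = 3600L * 1000L;                      // milliseconds per hour
            if (unit.equals("hours")) return count * hour;
            if (unit.equals("days"))  return count * hour * 24;
            if (unit.equals("weeks")) return count * hour * 24 * 7;
            return -1;                                      // unknown unit: do not repeat
        }

        public static void main(String[] args) throws InterruptedException {
            long delay = intervalMillis(12, "hours");       // e.g. frequency=12, freq_type=hours
            for (int run = 0; run < 3 && delay > 0; run++) {
                System.out.println("fetching URL list ..."); // fetching and enqueuing would go here
                Thread.sleep(delay);                         // pause until the next run
            }
        }
    }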
"" : URLFetcher.lastServerResponse); + } + + return prop; + } + + private static long getDate(String count, String type) { + long r = 0; + if (count != null && count.matches("\\d+")) r = Long.parseLong(count); + if (r < 1) return -1; + + r *= 3600 * 24; + if (type.equals("weeks")) return r * 24 * 7; + else if (type.equals("days")) return r * 24; + else if (type.equals("hours")) return r; + else return -1; + } + + public static class URLFetcher extends Thread { + + public static final int TYPE_TEXT = 0; + public static final int TYPE_XML = 1; + + public static int lastFetchedURLs = 0; + public static long lastRun = 0; + public static String lastServerResponse = null; + public static int lastFailed = 0; + public static int totalRuns = 0; + public static int totalFetchedURLs = 0; + public static int totalFailed = 0; + + private final URL url; + private final long delay; + private final int type; + private final plasmaSwitchboard sb; + private final plasmaCrawlProfile.entry profile; + + private boolean running = false; + private boolean paused = false; + + public URLFetcher( + serverSwitch env, + plasmaCrawlProfile.entry profile, + URL url, + int type) { + this.sb = (plasmaSwitchboard)env; + this.profile = profile; + this.url = url; + this.type = type; + this.delay = 0; + this.setName("URL-Fetcher"); + } + + public URLFetcher( + serverSwitch env, + plasmaCrawlProfile.entry profile, + URL url, + long delayMs, + int type) { + this.sb = (plasmaSwitchboard)env; + this.profile = profile; + this.url = url; + this.delay = delayMs; + this.type = type; + this.setName("URL-Fetcher"); + } + + public boolean isRunning() { return this.running; } + public boolean isPaused() { return this.paused; } + + public void run() { + this.running = true; + this.paused = false; + long start; + while (!isInterrupted() && this.delay > 0) { + try { + start = System.currentTimeMillis(); + totalFetchedURLs += addURLs(); + lastRun = System.currentTimeMillis() - start; + totalRuns++; + this.paused = true; + this.wait(this.delay); + this.paused = false; + } catch (InterruptedException e) { break; } + } + this.running = false; + } + + private int addURLs() throws InterruptedException { + String[] urls = getURLs(); + lastFailed = 0; + if (urls == null) return 0; + String reason; + for (int i=0; i