You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
yacy_search_server/htroot/CrawlURLFetch_p.java

431 lines
18 KiB

// CrawlURLFetch_p.java
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Date;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Random;
import java.util.TreeMap;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlEURL;
import de.anomic.plasma.plasmaCrawlProfile;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverSwitch;
import de.anomic.data.wikiCode;
import de.anomic.http.httpHeader;
import de.anomic.http.httpc;
import de.anomic.server.serverObjects;
import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacySeed;
public class CrawlURLFetch_p {
private static final long ERR_DATE = 1;
private static final long ERR_HOST_MALFORMED_URL = 1;
private static final long ERR_PEER_GENERAL_CONN = 1;
private static final long ERR_PEER_OFFLINE = 2;
private static final long ERR_THREAD_STOP = 1;
private static final long ERR_THREAD_RESUME = 2;
private static final long STAT_THREAD_ALIVE = 0;
private static final long STAT_THREAD_STOPPED = 1;
private static final long STAT_THREAD_PAUSED = 2;
public static final float MIN_PEER_VERSION_LIST_SERVLET = 0.504033F;
private static URLFetcher fetcher = null;
private static plasmaCrawlProfile.entry profile = null;
private static ArrayList savedURLs = new ArrayList();
public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) {
serverObjects prop = new serverObjects();
prop.put("host", "");
listURLs(prop); // List previously saved URLs for easy selection
listPeers(prop); // List known hosts
if (profile == null) {
profile = ((plasmaSwitchboard)env).profiles.newEntry(
"URLFetcher", // Name
null, // URL
".*", ".*", // General / specific filter
0, 0, // General / specific depth
-1, -1, -1, // Recrawl / Dom-filter depth / Dom-max-pages
true, // Crawl query
true, true, // Index text / media
false, true, // Store in HT- / TX-Cache
false, // Remote indexing
true, false, false); // Exclude static / dynamic / parent stopwords
}
if (post != null) {
if (post.containsKey("start")) {
long frequency = URLFetcher.DELAY_ONCE;
if (post.containsKey("reg")) {
if (post.get("reg", "").equals("self_det")) {
frequency = URLFetcher.DELAY_SELF_DET;
} else if (post.get("reg", "").equals("delay")) {
frequency = getDate(post.get("frequency", ""), post.get("freq_type", ""));
if (frequency == -1)
prop.put("freqError", ERR_DATE);
}
}
int count = 50;
if (post.get("amount", "").matches("\\d+")) {
count = Integer.parseInt(post.get("amount", ""));
if (count > 999) count = 999;
}
if (fetcher != null) fetcher.interrupt();
fetcher = null;
if (post.get("source", "").equals("peer") &&
post.get("peerhash", "").equals("random")) {
fetcher = new URLFetcher(
env,
profile,
count,
frequency);
} else {
URL url = null;
if (post.get("source", "").equals("url")) {
try {
url = new URL(post.get("host", null));
if (!savedURLs.contains(url.toNormalform()))
savedURLs.add(url.toNormalform());
prop.put("host", post.get("host", url.toString()));
} catch (MalformedURLException e) {
prop.put("host", post.get("host", ""));
prop.put("hostError", ERR_HOST_MALFORMED_URL);
}
} else if (post.get("source", "").equals("savedURL")) {
try {
url = new URL(post.get("saved", ""));
} catch (MalformedURLException e) {
/* should never appear, except for invalid input, see above */
}
} else if (post.get("source", "").equals("peer")) {
yacySeed ys = null;
try {
ys = yacyCore.seedDB.get(post.get("peerhash", ""));
if (ys != null) {
url = new URL("http://" + ys.getAddress() + URLFetcher.LIST_SERVLET);
} else {
prop.put("peerError", ERR_PEER_OFFLINE);
prop.put("peerError_hash", post.get("peerhash", ""));
}
} catch (MalformedURLException e) {
prop.put("peerError", ERR_PEER_GENERAL_CONN);
prop.put("peerError_hash", post.get("peerhash", ""));
prop.put("peerError_name", ys.getName());
}
}
if (url != null) {
fetcher = new URLFetcher(
env,
profile,
url,
count,
frequency);
}
}
if (fetcher != null)
fetcher.start();
}
else if (post.containsKey("stop")) {
if (fetcher != null) {
fetcher.interrupt();
} else {
prop.put("threadError", ERR_THREAD_STOP);
}
}
else if (post.containsKey("restart")) {
if (fetcher != null) {
fetcher.interrupt();
if (fetcher.url == null) {
fetcher = new URLFetcher(
env,
profile,
fetcher.count,
fetcher.delay);
} else {
fetcher = new URLFetcher(
env,
profile,
fetcher.url,
fetcher.count,
fetcher.delay);
}
fetcher.start();
} else {
prop.put("threadError", ERR_THREAD_RESUME);
}
}
}
if (fetcher != null) {
prop.put("runs", 1);
prop.put("runs_status",
((fetcher.paused && fetcher.isAlive()) ? STAT_THREAD_PAUSED :
(fetcher.isAlive()) ? STAT_THREAD_ALIVE : STAT_THREAD_STOPPED));
prop.put("runs_totalRuns", URLFetcher.totalRuns);
prop.put("runs_totalFetchedURLs", URLFetcher.totalFetchedURLs);
prop.put("runs_totalFailedURLs", URLFetcher.totalFailed);
prop.put("runs_lastRun", fetcher.lastRun);
prop.put("runs_lastFetchedURLs", fetcher.lastFetchedURLs);
prop.put("runs_lastServerResponse", (fetcher.lastServerResponse == null)
? "" : fetcher.lastServerResponse);
Iterator it = fetcher.failed.keySet().iterator();
int i = 0;
Object key;
while (it.hasNext()) {
key = it.next();
prop.put("runs_error_" + i + "_reason", fetcher.failed.get(key));
prop.put("runs_error_" + i + "_url", (String)key);
i++;
}
prop.put("runs_error", i);
}
return prop;
}
private static int listURLs(serverObjects prop) {
if (savedURLs.size() == 0) return 0;
prop.put("saved", 1);
for (int i=0; i<savedURLs.size(); i++)
prop.put("saved_urls_" + i + "_url", savedURLs.get(i));
prop.put("saved_urls", savedURLs.size());
return savedURLs.size();
}
private static int listPeers(serverObjects prop) {
int peerCount = 0;
if (yacyCore.seedDB != null && yacyCore.seedDB.sizeConnected() > 0) {
prop.put("peersKnown", 1);
try {
TreeMap hostList = new TreeMap();
final Enumeration e = yacyCore.seedDB.seedsConnected(true, false, null, MIN_PEER_VERSION_LIST_SERVLET);
while (e.hasMoreElements()) {
yacySeed seed = (yacySeed) e.nextElement();
if (seed != null) hostList.put(seed.get(yacySeed.NAME, "nameless"),seed.hash);
}
String peername;
while ((peername = (String) hostList.firstKey()) != null) {
final String Hash = (String) hostList.get(peername);
if (Hash.equals(yacyCore.seedDB.mySeed.hash)) continue;
prop.put("peersKnown_peers_" + peerCount + "_hash", Hash);
prop.put("peersKnown_peers_" + peerCount + "_name", peername);
hostList.remove(peername);
peerCount++;
}
} catch (Exception e) { /* no comment :P */ }
prop.put("peersKnown_peers", peerCount);
} else {
prop.put("peersKnown", 0);
}
return peerCount;
}
private static long getDate(String count, String type) {
long r = 0;
if (count != null && count.matches("\\d+")) r = Long.parseLong(count);
if (r < 1) return -1;
r *= 3600000;
if (type.equals("weeks")) return r * 24 * 7;
else if (type.equals("days")) return r * 24;
else if (type.equals("hours")) return r;
else return -1;
}
public static class URLFetcher extends Thread {
public static final long DELAY_ONCE = -1;
public static final long DELAY_SELF_DET = 0;
private static final String LIST_SERVLET = "/yacy/list.html?list=queueUrls";
public static int totalRuns = 0;
public static int totalFetchedURLs = 0;
public static int totalFailed = 0;
public final HashMap failed = new HashMap();
public int lastFetchedURLs = 0;
public long lastRun = 0;
public String lastServerResponse = null;
public int lastFailed = 0;
public final URL url;
public final int count;
public final long delay;
public final plasmaSwitchboard sb;
public final plasmaCrawlProfile.entry profile;
public boolean paused = false;
public URLFetcher(
serverSwitch env,
plasmaCrawlProfile.entry profile,
URL url,
int count,
long delayMs) {
if (env == null || profile == null || url == null)
throw new NullPointerException("env, profile or url must not be null");
this.sb = (plasmaSwitchboard)env;
this.profile = profile;
this.url = url;
this.count = count;
this.delay = delayMs;
this.setName("URLFetcher");
}
public URLFetcher(
serverSwitch env,
plasmaCrawlProfile.entry profile,
int count,
long delayMs) {
if (env == null || profile == null)
throw new NullPointerException("env or profile must not be null");
this.sb = (plasmaSwitchboard)env;
this.profile = profile;
this.url = null;
this.count = count;
this.delay = delayMs;
this.setName("URLFetcher");
}
public void run() {
this.paused = false;
long start;
URL url;
while (!isInterrupted()) {
try {
start = System.currentTimeMillis();
url = getDLURL();
if (url == null) {
serverLog.logSevere(this.getName(), "canceled because no valid URL for the URL-list could be determinded");
return;
}
totalFetchedURLs += stackURLs(getURLs(url));
this.lastRun = System.currentTimeMillis() - start;
totalRuns++;
if (this.delay < 0 || isInterrupted()) {
return;
} else synchronized (this) {
if (this.delay == 0) {
this.paused = true;
while (this.paused) this.wait();
} else {
this.paused = true;
this.wait(this.delay);
}
}
this.paused = false;
} catch (InterruptedException e) { return; }
}
}
private URL getDLURL() {
if (this.url != null) return this.url;
// choose random seed
yacySeed ys = null;
Enumeration e = yacyCore.seedDB.seedsConnected(true, false, null, MIN_PEER_VERSION_LIST_SERVLET);
int num = new Random().nextInt(yacyCore.seedDB.sizeConnected()) + 1;
Object o;
for (int i=0; i<num && e.hasMoreElements(); i++) {
o = e.nextElement();
if (o != null) ys = (yacySeed)o;
}
if (ys == null) return null;
try {
return new URL("http://" + ys.getAddress() + LIST_SERVLET + "&count=" + this.count);
} catch (MalformedURLException ee) { return null; }
}
private int stackURLs(String[] urls) throws InterruptedException {
this.lastFailed = 0;
this.lastFetchedURLs = 0;
if (urls == null) return 0;
String reason;
for (int i=0; i<urls.length && !isInterrupted(); i++) {
if (urls[i].trim().length() == 0) continue;
serverLog.logFine(this.getName(), "stacking " + urls[i]);
reason = this.sb.sbStackCrawlThread.stackCrawl(
urls[i],
null,
yacyCore.seedDB.mySeed.hash,
null,
new Date(),
this.profile.generalDepth(),
this.profile);
if (reason == null) {
this.lastFetchedURLs++;
} else {
this.lastFailed++;
totalFailed++;
this.failed.put(urls[i], reason);
try {
plasmaCrawlEURL.Entry ee = this.sb.errorURL.newEntry(
new URL(urls[i]),
null,
yacyCore.seedDB.mySeed.hash,
yacyCore.seedDB.mySeed.hash,
"",
reason,
new kelondroBitfield());
ee.store();
this.sb.errorURL.stackPushEntry(ee);
} catch (MalformedURLException e) { }
}
}
return this.lastFetchedURLs;
}
private String[] getURLs(URL url) {
if (url == null) return null;
String[] r = null;
try {
httpc con = httpc.getInstance(
url.getHost(),
url.getHost(),
url.getPort(),
15000,
url.getProtocol().equals("https"));
httpHeader header = new httpHeader();
header.put(httpHeader.ACCEPT_ENCODING, "US-ASCII");
header.put(httpHeader.HOST, url.getHost());
httpc.response res = con.GET(url.getPath() + "?" + url.getQuery(), header);
serverLog.logFine(this.getName(), "downloaded URL-list from " + url + " (" + res.statusCode + ")");
this.lastServerResponse = res.statusCode + " (" + res.statusText + ")";
if (res.status.startsWith("2")) {
byte[] cbs = res.writeContent();
String encoding = res.responseHeader.getCharacterEncoding();
if (encoding == null) encoding = "US-ASCII";
r = parseText(wikiCode.deReplaceHTMLEntities(new String(cbs, encoding)));
}
httpc.returnInstance(con);
} catch (IOException e) { }
return r;
}
private static String[] parseText(String text) {
return text.split("\n");
}
}
}