|
|
|
@ -4,7 +4,10 @@
|
|
|
|
|
// (C) by Michael Peter Christen; mc@anomic.de
|
|
|
|
|
// first published on http://www.anomic.de
|
|
|
|
|
// Frankfurt, Germany, 2004
|
|
|
|
|
// last change: 02.05.2004
|
|
|
|
|
//
|
|
|
|
|
// $LastChangedDate$
|
|
|
|
|
// $LastChangedRevision$
|
|
|
|
|
// $LastChangedBy$
|
|
|
|
|
//
|
|
|
|
|
// This program is free software; you can redistribute it and/or modify
|
|
|
|
|
// it under the terms of the GNU General Public License as published by
|
|
|
|
@ -42,15 +45,12 @@
|
|
|
|
|
// You must compile this file with
|
|
|
|
|
// javac -classpath .:../classes crawlOrder.java
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import java.util.ArrayList;
|
|
|
|
|
import java.util.Date;
|
|
|
|
|
|
|
|
|
|
import de.anomic.http.httpHeader;
|
|
|
|
|
import de.anomic.plasma.plasmaCrawlLURL;
|
|
|
|
|
import de.anomic.plasma.plasmaParser;
|
|
|
|
|
import de.anomic.plasma.plasmaSwitchboard;
|
|
|
|
|
import de.anomic.plasma.plasmaURL;
|
|
|
|
|
import de.anomic.server.serverObjects;
|
|
|
|
|
import de.anomic.server.serverSwitch;
|
|
|
|
|
import de.anomic.tools.crypt;
|
|
|
|
@ -59,7 +59,6 @@ import de.anomic.yacy.yacySeed;
|
|
|
|
|
|
|
|
|
|
public final class crawlOrder {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) {
|
|
|
|
|
// return variable that accumulates replacements
|
|
|
|
|
plasmaSwitchboard switchboard = (plasmaSwitchboard) env;
|
|
|
|
@ -155,10 +154,12 @@ public final class crawlOrder {
|
|
|
|
|
}
|
|
|
|
|
if (refencoded != null) {
|
|
|
|
|
// old method: refencoded is set, so the request carries only one encoded referrer URL
|
|
|
|
|
env.getLog().logFinest("crawlOrder: refencoded=" + refencoded + " key=" + key);
|
|
|
|
|
refv.add(crypt.simpleDecode(refencoded, key)); // the referrer url
|
|
|
|
|
} else {
|
|
|
|
|
// new method: read a vector of URLs from the parameters "ref" + refv.size(), stopping at the first missing one
|
|
|
|
|
while ((refencoded = (String) post.get("ref" + refv.size(), null)) != null) {
|
|
|
|
|
env.getLog().logFinest("crawlOrder: refencoded=" + refencoded + " key=" + key);
|
|
|
|
|
refv.add(crypt.simpleDecode(refencoded, key));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
@ -180,6 +181,8 @@ public final class crawlOrder {
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// adding URL to noticeURL Queue
|
|
|
|
|
env.getLog().logFinest("crawlOrder: a: url='" + newURL + "'");
|
|
|
|
|
|
|
|
|
|
stackresult = stack(switchboard, newURL, refURL, iam, youare);
|
|
|
|
|
response = (String) stackresult[0];
|
|
|
|
|
reason = (String) stackresult[1];
|
|
|
|
@ -191,6 +194,8 @@ public final class crawlOrder {
|
|
|
|
|
int doubleCount = 0;
|
|
|
|
|
int rejectedCount = 0;
|
|
|
|
|
for (int i = 0; i < count; i++) {
|
|
|
|
|
env.getLog().logFinest("crawlOrder: b: url='" + (String) urlv.get(i) + "'");
|
|
|
|
|
|
|
|
|
|
stackresult = stack(switchboard, (String) urlv.get(i), (String) refv.get(i), iam, youare);
|
|
|
|
|
response = (String) stackresult[0];
|
|
|
|
|
prop.put("list_" + i + "_job", (String) stackresult[0] + "," + (String) stackresult[1]);
|
|
|
|
@ -223,10 +228,10 @@ public final class crawlOrder {
|
|
|
|
|
return prop;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
private static Object[] stack(plasmaSwitchboard switchboard, String url, String referrer, String iam, String youare) {
|
|
|
|
|
String response, reason, lurl;
|
|
|
|
|
// stack url
|
|
|
|
|
switchboard.getLog().logFinest("crawlOrder: stack: url='" + url + "'");
|
|
|
|
|
String reasonString = switchboard.sbStackCrawlThread.stackCrawl(url, referrer, iam, "REMOTE-CRAWLING", new Date(), 0, switchboard.defaultRemoteProfile);
|
|
|
|
|
if (reasonString == null) {
|
|
|
|
|
// liftoff!
|
|
|
|
|