preparations for bulk remote crawls

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@408 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 20 years ago
parent 0f663bcebf
commit 9f505af7aa

@ -105,7 +105,7 @@ public class Network {
accActLinks += links;
accActWords += words;
}
try {myppm = Long.parseLong(seed.get("ISpeed", "0"));} catch (NumberFormatException e) {}
myppm = seed.getPPM();
prop.put("table_my-version", seed.get("Version", "-"));
prop.put("table_my-uptime", serverDate.intervalToString(60000 * Long.parseLong(seed.get("Uptime", ""))));
prop.put("table_my-links", groupDigits(links));

@ -4,6 +4,10 @@ response=#[response]#
reason=#[reason]#
delay=#[delay]#
depth=#[depth]#
lurl=#[lurl]#
forward=#[forward]#
key=#[key]#
key=#[key]#
lurl=#[lurl]#
#{list}#
job#[count]#=#[job]#
lurl#[count]#=#[lurl]#
#{/list}#

@ -45,6 +45,7 @@
import java.net.URL;
import java.util.Date;
import java.util.Vector;
import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaCrawlLURL;
@ -74,10 +75,8 @@ public class crawlOrder {
String youare = (String) post.get("youare", ""); // seed hash of the target peer, needed for network stability
String process = (String) post.get("process", ""); // process type
String key = (String) post.get("key", ""); // transmission key
String url = crypt.simpleDecode((String) post.get("url", ""), key); // the url string to crawl
String referrer = crypt.simpleDecode((String) post.get("referrer", ""), key); // the referrer url
int orderDepth = Integer.parseInt((String) post.get("depth", "0")); // crawl depth
// response values
/*
the result can have one of the following values:
@ -101,7 +100,8 @@ public class crawlOrder {
String lurl = "";
boolean granted = switchboard.getConfig("crawlResponse", "false").equals("true");
int acceptDepth = Integer.parseInt(switchboard.getConfig("crawlResponseDepth", "0"));
int acceptDelay = Integer.parseInt(switchboard.getConfig("crawlResponseDelay", "0"));
int ppm = yacyCore.seedDB.mySeed.getPPM();
int acceptDelay = (ppm == 0) ? 10 : (2 + 60 / yacyCore.seedDB.mySeed.getPPM());
if (orderDepth > acceptDepth) orderDepth = acceptDepth;
@ -113,7 +113,7 @@ public class crawlOrder {
delay = "3600"; // may request one hour later again
} else if (orderDepth > 0) {
response = "denied";
reason = "order must be 0";
reason = "order depth must be 0";
delay = "3600"; // may request one hour later again
} else if (!(granted)) {
response = "denied";
@ -121,8 +121,7 @@ public class crawlOrder {
delay = "3600"; // may request one hour later again
} else try {
yacySeed requester = yacyCore.seedDB.getConnected(iam);
int queuesize = switchboard.queueSize();
String urlhash = plasmaURL.urlHash(new URL(url));
int queuesize = switchboard.coreCrawlJobSize() + switchboard.limitCrawlTriggerJobSize() + switchboard.remoteTriggeredCrawlJobSize();
if (requester == null) {
response = "denied";
reason = "unknown-client";
@ -131,40 +130,65 @@ public class crawlOrder {
response = "denied";
reason = "not-qualified";
delay = "240";
} else if (queuesize > 1) {
} else if (queuesize > 100) {
response = "rejected";
reason = "busy";
delay = "" + (queuesize * acceptDelay);
delay = "" + (30 + queuesize * acceptDelay);
} else if (!(process.equals("crawl"))) {
response = "denied";
reason = "unknown-order";
delay = "9999";
} else {
// stack url
String reasonString = switchboard.stackCrawl(url, referrer, iam, "REMOTE-CRAWLING", new Date(), 0, switchboard.defaultRemoteProfile);
if (reasonString == null) {
// liftoff!
response = "stacked";
reason = "ok";
delay = "" + acceptDelay; // this value needs to be calculated individually
} else if (reasonString.equals("double_(already_loaded)")) {
// case where we have already the url loaded;
reason = reasonString;
delay = "" + (acceptDelay / 4);
// send lurl-Entry as response
plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.getEntry(plasmaCrawlLURL.urlHash(url));
if (entry != null) {
response = "double";
switchboard.urlPool.loadedURL.notifyGCrawl(entry.hash(), iam, youare);
lurl = crypt.simpleEncode(entry.toString());
delay = "1";
} else {
response = "rejected";
// read the urls/referrer-vector
Vector urlv = new Vector();
Vector refv = new Vector();
String refencoded = (String) post.get("referrer", null);
String urlencoded = (String) post.get("url", null);
if (urlencoded != null) {
// old method: only one url
urlv.add(crypt.simpleDecode(urlencoded, key)); // the url string to crawl
} else {
// new method: read a vector of urls
while ((urlencoded = (String) post.get("url" + urlv.size(), null)) != null) {
urlv.add(crypt.simpleDecode(urlencoded, key));
}
}
if (refencoded != null) {
// old method: only one url
refv.add(crypt.simpleDecode(refencoded, key)); // the referrer url
} else {
response = "rejected";
reason = reasonString;
delay = "" + (acceptDelay / 4);
// new method: read a vector of urls
while ((refencoded = (String) post.get("ref" + refv.size(), null)) != null) {
refv.add(crypt.simpleDecode(refencoded, key));
}
}
// stack the urls
Object[] stackresult;
int count = Math.min(urlv.size(), refv.size());
if (count == 1) {
// old method: only one url
stackresult = stack(switchboard, (String) urlv.elementAt(0), (String) refv.elementAt(0), iam, youare);
response = (String) stackresult[0];
reason = (String) stackresult[1];
lurl = (String) stackresult[2];
delay = (response.equals("stacked")) ? "" + (5 + acceptDelay) : "1"; // this value needs to be calculated individually
} else {
// new method: several urls
int stackCount = 0;
int doubleCount = 0;
int rejectedCount = 0;
for (int i = 0; i < count; i++) {
stackresult = stack(switchboard, (String) urlv.elementAt(i), (String) refv.elementAt(i), iam, youare);
response = (String) stackresult[0];
prop.put("list_" + i + "_job", (String) stackresult[0] + "," + (String) stackresult[1]);
prop.put("list_" + i + "_lurl", (String) stackresult[2]);
prop.put("list_" + i + "_count", i);
}
response = "enqueued";
reason = "ok";
lurl = "";
delay = "" + (stackCount * acceptDelay + 1);
}
}
} catch (Exception e) {
@ -185,5 +209,36 @@ public class crawlOrder {
// return rewrite properties
return prop;
}
/**
 * Hands a single remotely ordered url to the switchboard's crawl stack and
 * translates the outcome into the three values reported back to the requester.
 *
 * @param switchboard the switchboard whose stackCrawl and urlPool are used
 * @param url         the url string to stack
 * @param referrer    the referrer url string passed along to stackCrawl
 * @param iam         hash of the requesting peer (passed as initiator and to notifyGCrawl)
 * @param youare      hash the requester sent for this peer (passed to notifyGCrawl)
 * @return an array of three Strings: {response, reason, lurl}, where response is
 *         one of "stacked", "double" or "rejected"
 */
private static Object[] stack(plasmaSwitchboard switchboard, String url, String referrer, String iam, String youare) {
    // stackCrawl reports a failure reason as a string; null is treated as success
    final String failure = switchboard.stackCrawl(url, referrer, iam, "REMOTE-CRAWLING", new Date(), 0, switchboard.defaultRemoteProfile);

    if (failure == null) {
        // the url was accepted
        return new Object[]{"stacked", "ok", ""};
    }

    if (!failure.startsWith("double")) {
        // any reason other than a double occurrence is a plain rejection
        return new Object[]{"rejected", failure, ""};
    }

    // the url is reported as double: look it up among the loaded urls
    final plasmaCrawlLURL.Entry loaded = switchboard.urlPool.loadedURL.getEntry(plasmaCrawlLURL.urlHash(url));
    if (loaded == null) {
        // reported as double, but no loaded entry is available to send back
        return new Object[]{"rejected", failure, ""};
    }

    // register the entry for the requester and return it in encoded form
    switchboard.urlPool.loadedURL.notifyGCrawl(loaded.hash(), iam, youare);
    return new Object[]{"double", failure, crypt.simpleEncode(loaded.toString())};
}
}

@ -1145,7 +1145,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
// do the request
HashMap page = yacyClient.crawlOrder(remoteSeed, urlEntry.url(), urlPool.getURL(urlEntry.referrerHash()), 0);
HashMap page = yacyClient.crawlOrder(remoteSeed, urlEntry.url(), urlPool.getURL(urlEntry.referrerHash()));
// check success

@ -450,6 +450,7 @@ public class yacyClient {
}
}
/*
public static HashMap crawlOrder(yacySeed targetSeed, URL url, URL referrer, int depth) {
// this post a message to the remote message board
if (targetSeed == null) return null;
@ -479,6 +480,39 @@ public class yacyClient {
return null;
}
}
*/
/**
 * Sends a crawl order for one url to a remote peer by posting to the
 * peer's /yacy/crawlOrder.html page.
 *
 * @param targetSeed the peer that shall receive the order
 * @param url        the url to be crawled
 * @param referrer   the referrer of the url; null is sent as an empty string
 * @return the response as parsed by nxTools.table, or null if the target
 *         seed or our own seed is missing, the target is our own seed, the
 *         target has no address, or the request raised an exception
 */
public static HashMap crawlOrder(yacySeed targetSeed, URL url, URL referrer) {
    // no order without a target, without our own seed, or to ourselves
    if ((targetSeed == null) || (yacyCore.seedDB.mySeed == null) || (yacyCore.seedDB.mySeed == targetSeed)) {
        return null;
    }

    // assemble the request parameters
    final serverObjects request = new serverObjects();
    final String salt = crypt.randomSalt();
    request.put("key", salt);
    request.put("process", "crawl");
    request.put("iam", yacyCore.seedDB.mySeed.hash);
    request.put("youare", targetSeed.hash);
    request.put("mytime", yacyCore.universalDateShortString());
    request.put("url", crypt.simpleEncode(url.toString()));
    final String referrerString = (referrer == null) ? "" : referrer.toString();
    request.put("referrer", crypt.simpleEncode(referrerString));
    request.put("depth", "0");
    request.put("ttl", "0");

    // a peer without a usable address cannot be contacted
    final String peerAddress = targetSeed.getAddress();
    if (peerAddress == null) {
        return null;
    }

    try {
        final URL orderPage = new URL("http://" + peerAddress + "/yacy/crawlOrder.html");
        // NOTE(review): 10000 is presumably the timeout in milliseconds - confirm against httpc.wput
        return nxTools.table(httpc.wput(
                orderPage,
                10000, null, null, yacyCore.seedDB.sb.remoteProxyHost, yacyCore.seedDB.sb.remoteProxyPort, request));
    } catch (Exception e) {
        // most probably a network time-out exception
        yacyCore.log.logError("yacyClient.crawlOrder error: peer=" + targetSeed.getName() + ", error=" + e.getMessage());
        return null;
    }
}
/*
Test:

@ -203,6 +203,13 @@ public class yacySeed {
if ((ip != null) && (ip.length() >= 8) && (port != null) && (port.length() >= 2)) return ip + ":" + port; else return null;
}
/**
 * Reads this seed's "ISpeed" property as an integer.
 * NOTE(review): presumably the indexing speed in pages per minute - confirm.
 *
 * @return the parsed "ISpeed" value; 0 if the property is not set or is not
 *         a valid integer
 */
public int getPPM() {
    int ppm;
    try {
        final String ispeed = get("ISpeed", "0");
        ppm = Integer.parseInt(ispeed);
    } catch (NumberFormatException e) {
        // an unparsable value is reported as zero
        ppm = 0;
    }
    return ppm;
}
private boolean getFlag(int flag) {
String flags = get("Flags", "0000");
return (new bitfield(flags.getBytes())).get(flag);

@ -376,7 +376,6 @@ crawlOrderDepth=0
crawlOrderDelay=8
crawlResponse=true
crawlResponseDepth=0
crawlResponseDelay=30
# indexing-exclusion - rules
# These rules are important to reduce the number of words that are indexed
@ -388,7 +387,6 @@ xsstopw=true
xdstopw=true
xpstopw=true
# performance-settings
# delay-times for permanent loops (milliseconds)
# the idlesleep is the pause that a process sleeps if the last call to the

Loading…
Cancel
Save