diff --git a/htroot/Network.java b/htroot/Network.java
index 90177e27b..d5afeff1e 100644
--- a/htroot/Network.java
+++ b/htroot/Network.java
@@ -105,7 +105,7 @@ public class Network {
                             accActLinks += links;
                             accActWords += words;
                         }
-                        try {myppm = Long.parseLong(seed.get("ISpeed", "0"));} catch (NumberFormatException e) {}
+                        myppm = seed.getPPM();
                         prop.put("table_my-version", seed.get("Version", "-"));
                         prop.put("table_my-uptime", serverDate.intervalToString(60000 * Long.parseLong(seed.get("Uptime", ""))));
                         prop.put("table_my-links", groupDigits(links));
diff --git a/htroot/yacy/crawlOrder.html b/htroot/yacy/crawlOrder.html
index daf73e7de..54229c167 100644
--- a/htroot/yacy/crawlOrder.html
+++ b/htroot/yacy/crawlOrder.html
@@ -4,6 +4,10 @@ response=#[response]#
 reason=#[reason]#
 delay=#[delay]#
 depth=#[depth]#
-lurl=#[lurl]#
 forward=#[forward]#
-key=#[key]#
\ No newline at end of file
+key=#[key]#
+lurl=#[lurl]#
+#{list}#
+job#[count]#=#[job]#
+lurl#[count]#=#[lurl]#
+#{/list}#
\ No newline at end of file
diff --git a/htroot/yacy/crawlOrder.java b/htroot/yacy/crawlOrder.java
index 78429ff02..6b7e0277c 100644
--- a/htroot/yacy/crawlOrder.java
+++ b/htroot/yacy/crawlOrder.java
@@ -45,6 +45,7 @@
 
 import java.net.URL;
 import java.util.Date;
+import java.util.Vector;
 
 import de.anomic.http.httpHeader;
 import de.anomic.plasma.plasmaCrawlLURL;
@@ -74,10 +75,8 @@ public class crawlOrder {
         String youare = (String) post.get("youare", ""); // seed hash of the target peer, needed for network stability
         String process = (String) post.get("process", ""); // process type
         String key = (String) post.get("key", ""); // transmission key
-        String url = crypt.simpleDecode((String) post.get("url", ""), key); // the url string to crawl
-        String referrer = crypt.simpleDecode((String) post.get("referrer", ""), key); // the referrer url
         int orderDepth = Integer.parseInt((String) post.get("depth", "0")); // crawl depth
-
+        
         // response values
         /*
           the result can have one of the following values:
@@ -101,7 +100,8 @@ public class crawlOrder {
         String lurl = "";
         boolean granted = switchboard.getConfig("crawlResponse", "false").equals("true");
         int acceptDepth = Integer.parseInt(switchboard.getConfig("crawlResponseDepth", "0"));
-        int acceptDelay = Integer.parseInt(switchboard.getConfig("crawlResponseDelay", "0"));
+        int ppm = yacyCore.seedDB.mySeed.getPPM();
+        int acceptDelay = (ppm <= 0) ? 10 : (2 + 60 / ppm); // divide by the checked local: a second getPPM() call could yield 0
 
         if (orderDepth > acceptDepth) orderDepth = acceptDepth;
 
@@ -113,7 +113,7 @@ public class crawlOrder {
             delay = "3600"; // may request one hour later again
         } else if (orderDepth > 0) {
             response = "denied";
-            reason = "order must be 0";
+            reason = "order depth must be 0";
             delay = "3600"; // may request one hour later again
         } else if (!(granted)) {
             response = "denied";
@@ -121,8 +121,7 @@ public class crawlOrder {
             delay = "3600"; // may request one hour later again
         } else try {
             yacySeed requester = yacyCore.seedDB.getConnected(iam);
-            int queuesize = switchboard.queueSize();
-            String urlhash = plasmaURL.urlHash(new URL(url));
+            int queuesize = switchboard.coreCrawlJobSize() + switchboard.limitCrawlTriggerJobSize() + switchboard.remoteTriggeredCrawlJobSize();
             if (requester == null) {
                 response = "denied";
                 reason = "unknown-client";
@@ -131,40 +130,65 @@ public class crawlOrder {
                 response = "denied";
                 reason = "not-qualified";
                 delay = "240";
-            } else if (queuesize > 1) {
+            } else if (queuesize > 100) {
                 response = "rejected";
                 reason = "busy";
-                delay = "" + (queuesize * acceptDelay);
+                delay = "" + (30 + queuesize * acceptDelay);
             } else if (!(process.equals("crawl"))) {
                 response = "denied";
                 reason = "unknown-order";
                 delay = "9999";
             } else {
-                // stack url
-                String reasonString = switchboard.stackCrawl(url, referrer, iam, "REMOTE-CRAWLING", new Date(), 0, switchboard.defaultRemoteProfile);
-                if (reasonString == null) {
-                    // liftoff!
-                    response = "stacked";
-                    reason = "ok";
-                    delay = "" + acceptDelay; // this value needs to be calculated individually
-                } else if (reasonString.equals("double_(already_loaded)")) {
-                    // case where we have already the url loaded;
-                    reason = reasonString;
-                    delay = "" + (acceptDelay / 4);
-                    // send lurl-Entry as response
-                    plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.getEntry(plasmaCrawlLURL.urlHash(url));
-                    if (entry != null) {
-                        response = "double";
-                        switchboard.urlPool.loadedURL.notifyGCrawl(entry.hash(), iam, youare);
-                        lurl = crypt.simpleEncode(entry.toString());
-                        delay = "1";
-                    } else {
-                        response = "rejected";
+                // read the urls/referrer-vector
+                Vector urlv = new Vector();
+                Vector refv = new Vector();
+                String refencoded = (String) post.get("referrer", null);
+                String urlencoded = (String) post.get("url", null);
+                if (urlencoded != null) {
+                    // old method: only one url
+                    urlv.add(crypt.simpleDecode(urlencoded, key)); // the url string to crawl
+                } else {
+                    // new method: read a vector of urls
+                    while ((urlencoded = (String) post.get("url" + urlv.size(), null)) != null) {
+                        urlv.add(crypt.simpleDecode(urlencoded, key));
                     }
+                }
+                if (refencoded != null) {
+                    // old method: only one referrer
+                    refv.add(crypt.simpleDecode(refencoded, key)); // the referrer url
                 } else {
-                    response = "rejected";
-                    reason = reasonString;
-                    delay = "" + (acceptDelay / 4);
+                    // new method: read a vector of referrers
+                    while ((refencoded = (String) post.get("ref" + refv.size(), null)) != null) {
+                        refv.add(crypt.simpleDecode(refencoded, key));
+                    }
+                }
+                
+                // stack the urls
+                Object[] stackresult;
+                int count = Math.min(urlv.size(), refv.size());
+                if (count == 1) {
+                    // old method: only one url
+                    stackresult = stack(switchboard, (String) urlv.elementAt(0), (String) refv.elementAt(0), iam, youare);
+                    response = (String) stackresult[0];
+                    reason = (String) stackresult[1];
+                    lurl = (String) stackresult[2];
+                    delay = (response.equals("stacked")) ? "" + (5 + acceptDelay) : "1"; // this value needs to be calculated individually
+                } else {
+                    // new method: several urls
+                    int stackCount = 0; // number of urls that were actually stacked
+                    for (int i = 0; i < count; i++) {
+                        stackresult = stack(switchboard, (String) urlv.elementAt(i), (String) refv.elementAt(i), iam, youare);
+                        response = (String) stackresult[0];
+                        // count accepted urls so that the announced delay grows with the accepted work
+                        if (response.equals("stacked")) stackCount++;
+                        prop.put("list_" + i + "_job", (String) stackresult[0] + "," + (String) stackresult[1]);
+                        prop.put("list_" + i + "_lurl", (String) stackresult[2]);
+                        prop.put("list_" + i + "_count", i);
+                    }
+                    response = "enqueued";
+                    reason = "ok";
+                    lurl = "";
+                    delay = "" + (stackCount * acceptDelay + 1);
                 }
             }
         } catch (Exception e) {
@@ -185,5 +209,48 @@ public class crawlOrder {
         // return rewrite properties
         return prop;
     }
+
+    /**
+     * Stacks a single remotely ordered url and maps the stacker's answer
+     * to the response values of the crawlOrder protocol.
+     *
+     * @param switchboard the switchboard used for stacking and for the loaded-url lookup
+     * @param url         the decoded url to crawl
+     * @param referrer    the decoded referrer url
+     * @param iam         hash of the requesting peer; passed on as initiator of the crawl
+     * @param youare      seed hash of the target peer as sent by the requester
+     * @return {response, reason, lurl}: response is "stacked", "double" or "rejected";
+     *         lurl is the encoded loaded-url entry for "double" and empty otherwise
+     */
+    private static Object[] stack(plasmaSwitchboard switchboard, String url, String referrer, String iam, String youare) {
+        String response, reason, lurl;
+        // stack url
+        String reasonString = switchboard.stackCrawl(url, referrer, iam, "REMOTE-CRAWLING", new Date(), 0, switchboard.defaultRemoteProfile);
+        if (reasonString == null) {
+            // liftoff!
+            response = "stacked";
+            reason = "ok";
+            lurl = "";
+        } else if (reasonString.startsWith("double")) {
+            // case where we have already the url loaded;
+            reason = reasonString;
+            // send lurl-Entry as response
+            plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.getEntry(plasmaCrawlLURL.urlHash(url));
+            if (entry != null) {
+                response = "double";
+                switchboard.urlPool.loadedURL.notifyGCrawl(entry.hash(), iam, youare);
+                lurl = crypt.simpleEncode(entry.toString());
+            } else {
+                response = "rejected";
+                lurl = "";
+            }
+        } else {
+            response = "rejected";
+            reason = reasonString;
+            lurl = "";
+        }
+        return new Object[]{response, reason, lurl};
+    }
+    
 
 }
diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java
index 9a1800433..393b91bf3 100644
--- a/source/de/anomic/plasma/plasmaSwitchboard.java
+++ b/source/de/anomic/plasma/plasmaSwitchboard.java
@@ -1145,7 +1145,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
         }
         
         // do the request
-        HashMap page = yacyClient.crawlOrder(remoteSeed, urlEntry.url(), urlPool.getURL(urlEntry.referrerHash()), 0);
+        HashMap page = yacyClient.crawlOrder(remoteSeed, urlEntry.url(), urlPool.getURL(urlEntry.referrerHash()));
         
         
         // check success
diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java
index dfc8e29ff..d8e96540b 100644
--- a/source/de/anomic/yacy/yacyClient.java
+++ b/source/de/anomic/yacy/yacyClient.java
@@ -450,6 +450,7 @@ public class yacyClient {
         }
     }
     
+    /*
     public static HashMap crawlOrder(yacySeed targetSeed, URL url, URL referrer, int depth) {
         // this post a message to the remote message board
         if (targetSeed == null) return null;
@@ -479,6 +480,48 @@ public class yacyClient {
             return null;
         }
     }
+    */
+    
+    /**
+     * Sends a crawl order for a single url, with depth 0, to a remote peer.
+     *
+     * @param targetSeed the peer that shall crawl the url
+     * @param url        the url to be crawled
+     * @param referrer   the referrer of the url; may be null, an empty string is sent then
+     * @return the peer's response as produced by nxTools.table, or null if the order
+     *         was not sent (missing seed, own seed, missing address) or the request failed
+     */
+    public static HashMap crawlOrder(yacySeed targetSeed, URL url, URL referrer) {
+        // this posts a crawl order to the remote peer
+        if (targetSeed == null) return null;
+        if (yacyCore.seedDB.mySeed == null) return null;
+        if (yacyCore.seedDB.mySeed == targetSeed) return null;
+        
+        // construct request
+        serverObjects post = new serverObjects();
+        String key = crypt.randomSalt();
+        post.put("key", key);
+        post.put("process", "crawl");
+        post.put("iam", yacyCore.seedDB.mySeed.hash);
+        post.put("youare", targetSeed.hash);
+        post.put("mytime", yacyCore.universalDateShortString());
+        post.put("url", crypt.simpleEncode(url.toString()));
+        post.put("referrer", crypt.simpleEncode((referrer == null) ? "" : referrer.toString()));
+        post.put("depth", "0");
+        post.put("ttl", "0");
+        
+        String address = targetSeed.getAddress();
+        if (address == null) return null;
+        try {
+            return nxTools.table(httpc.wput(
+                    new URL("http://" + address + "/yacy/crawlOrder.html"),
+                    10000, null, null, yacyCore.seedDB.sb.remoteProxyHost, yacyCore.seedDB.sb.remoteProxyPort, post));
+        } catch (Exception e) {
+            // most probably a network time-out exception
+            yacyCore.log.logError("yacyClient.crawlOrder error: peer=" + targetSeed.getName() + ", error=" + e.getMessage());
+            return null;
+        }
+    }
 
     /*
      Test:
diff --git a/source/de/anomic/yacy/yacySeed.java b/source/de/anomic/yacy/yacySeed.java
index ea7b33a40..1014253a8 100644
--- a/source/de/anomic/yacy/yacySeed.java
+++ b/source/de/anomic/yacy/yacySeed.java
@@ -203,6 +203,18 @@ public class yacySeed {
         if ((ip != null) && (ip.length() >= 8) && (port != null) && (port.length() >= 2)) return ip + ":" + port; else return null;
     }
     
+    /**
+     * Reads the "ISpeed" property of this seed as an integer (the peer's ppm value).
+     *
+     * @return the parsed "ISpeed" value, or 0 if the property is missing or not a number
+     */
+    public int getPPM() {
+        try {
+            return Integer.parseInt(get("ISpeed", "0"));
+        } catch (NumberFormatException e) {
+            return 0;
+        }
+    }
     private boolean getFlag(int flag) {
         String flags = get("Flags", "0000");
         return (new bitfield(flags.getBytes())).get(flag);
diff --git a/yacy.init b/yacy.init
index 1629c184e..acdb3a8bd 100644
--- a/yacy.init
+++ b/yacy.init
@@ -376,7 +376,6 @@ crawlOrderDepth=0
 crawlOrderDelay=8
 crawlResponse=true
 crawlResponseDepth=0
-crawlResponseDelay=30
 
 # indexing-exclusion - rules
 # There rules are important to reduce the number of words that are indexed
@@ -388,7 +387,6 @@ xsstopw=true
 xdstopw=true
 xpstopw=true
 
-
 # performance-settings
 # delay-times for permanent loops (milliseconds)
 # the idlesleep is the pause that an proces sleeps if the last call to the