From 22db449f2ab8c6b0d596b6eee57fd9bc13136a2a Mon Sep 17 00:00:00 2001 From: reger Date: Tue, 5 Jul 2016 23:22:35 +0200 Subject: [PATCH 1/8] to prevent crawler to concurrently access and alter same crawl queue after restart, put hosthash in queue's filename (which is used as primary key for crawl queue. Hint: initial hosthash from url and recalculated hosthash from just hostname:port are not the same. fixes http://mantis.tokeek.de/view.php?id=668 (partially) --- source/net/yacy/crawler/HostBalancer.java | 12 +-- source/net/yacy/crawler/HostQueue.java | 96 ++++++++++++----------- 2 files changed, 57 insertions(+), 51 deletions(-) diff --git a/source/net/yacy/crawler/HostBalancer.java b/source/net/yacy/crawler/HostBalancer.java index 9ffe4cf23..b6338b6c1 100644 --- a/source/net/yacy/crawler/HostBalancer.java +++ b/source/net/yacy/crawler/HostBalancer.java @@ -103,7 +103,7 @@ public class HostBalancer implements Balancer { queue.close(); FileUtils.deletedelete(queuePath); } else { - queues.put(DigestURL.hosthash(queue.getHost(), queue.getPort()), queue); + queues.put(queue.getHostHash(), queue); } } catch (MalformedURLException | RuntimeException e) { log.warn("delete queue due to init error for " + hostsPath.getName() + " host=" + hoststr + " " + e.getLocalizedMessage()); @@ -244,11 +244,11 @@ public class HostBalancer implements Balancer { public String push(final Request entry, CrawlProfile profile, final RobotsTxt robots) throws IOException, SpaceExceededException { if (this.has(entry.url().hash())) return "double occurrence"; depthCache.put(entry.url().hash(), entry.depth()); - String hosthash = ASCII.String(entry.url().hash(), 6, 6); + String hosthash = entry.url().hosthash(); synchronized (this) { HostQueue queue = this.queues.get(hosthash); if (queue == null) { - queue = new HostQueue(this.hostsPath, entry.url().getHost(), entry.url().getPort(), this.queues.size() > this.onDemandLimit, this.exceed134217727); + queue = new HostQueue(this.hostsPath, entry.url(), 
this.queues.size() > this.onDemandLimit, this.exceed134217727); this.queues.put(hosthash, queue); // profile might be null when continue crawls after YaCy restart robots.ensureExist(entry.url(), profile == null ? ClientIdentification.yacyInternetCrawlerAgent : profile.getAgent(), true); // concurrently load all robots.txt @@ -376,7 +376,7 @@ public class HostBalancer implements Balancer { for (String h: lastEntries) this.roundRobinHostHashes.remove(h); } } - + /* // first strategy: get one entry which does not need sleep time Iterator nhhi = this.roundRobinHostHashes.iterator(); @@ -386,7 +386,7 @@ public class HostBalancer implements Balancer { if (rhq == null) { nhhi.remove(); continue nosleep; - } + } int delta = Latency.waitingRemainingGuessed(rhq.getHost(), rhh, robots, ClientIdentification.yacyInternetCrawlerAgent); if (delta <= 10 || this.roundRobinHostHashes.size() == 1 || rhq.size() == 1) { nhhi.remove(); @@ -494,7 +494,7 @@ public class HostBalancer implements Balancer { map.put(hq.getHost() + ":" + hq.getPort(), new Integer[]{hq.size(), delta}); } catch (MalformedURLException e) { ConcurrentLog.logException(e); - } + } return map; } diff --git a/source/net/yacy/crawler/HostQueue.java b/source/net/yacy/crawler/HostQueue.java index d16a65f39..3e9621a11 100644 --- a/source/net/yacy/crawler/HostQueue.java +++ b/source/net/yacy/crawler/HostQueue.java @@ -64,28 +64,48 @@ public class HostQueue implements Balancer { private static final int EcoFSBufferSize = 1000; private static final int objectIndexBufferSize = 1000; - private final File hostPath; + private final File hostPath; // path to the stack files private final String hostName; - private String hostHash; + private final String hostHash; private final int port; private final boolean exceed134217727; private final boolean onDemand; private TreeMap depthStacks; + /** + * Create or open host queue. The host part of the hostUrl parameter is used + * to calculate the stack directory name. 
+ * + * @param hostsPath + * @param hostUrl + * @param onDemand + * @param exceed134217727 + * @throws MalformedURLException + */ public HostQueue ( final File hostsPath, - final String hostName, - final int port, + final DigestURL hostUrl, // any url from that host (only host data are extracted) final boolean onDemand, final boolean exceed134217727) throws MalformedURLException { this.onDemand = onDemand; this.exceed134217727 = exceed134217727; - this.hostName = (hostName == null) ? "localhost" : hostName; // might be null (file://) but hostqueue needs a name (for queue file) - this.port = port; - this.hostPath = new File(hostsPath, this.hostName + "." + this.port); + this.hostName = (hostUrl.getHost() == null) ? "localhost" : hostUrl.getHost(); // might be null (file://) but hostqueue needs a name (for queue file) + this.port = hostUrl.getPort(); + this.hostHash = hostUrl.hosthash(); // hosthash is calculated by protocol + hostname + port + // hostName/port included just for human readability (& historically), "-#" marker used to define begin of hosthash in directoryname + this.hostPath = new File(hostsPath, this.hostName + "-#"+ this.hostHash + "." + this.port); init(); } + /** + * Initializes host queue from cache files. The internal id of the queue is + * extracted from the path name and must match the key initially generated; + * currently the hosthash is used as id. 
+ * @param hostPath path of the stack directory (containing the primary key/id of the queue) + * @param onDemand + * @param exceed134217727 + * @throws MalformedURLException + */ public HostQueue ( final File hostPath, final boolean onDemand, @@ -95,10 +115,14 @@ public class HostQueue implements Balancer { this.hostPath = hostPath; // parse the hostName and port from the file name String filename = hostPath.getName(); - int p = filename.lastIndexOf('.'); - if (p < 0) throw new RuntimeException("hostPath name must contain a dot: " + filename); - this.hostName = filename.substring(0, p); - this.port = Integer.parseInt(filename.substring(p + 1)); // consider "host.com" contains dot but no required port -> will throw exception + int pdot = filename.lastIndexOf('.'); + if (pdot < 0) throw new RuntimeException("hostPath name must contain a dot: " + filename); + this.port = Integer.parseInt(filename.substring(pdot + 1)); // consider "host.com" contains dot but no required port -> will throw exception + int p1 = filename.lastIndexOf("-#"); + if (p1 >= 0) { + this.hostName = filename.substring(0,p1); + this.hostHash = filename.substring(p1+2,pdot); + } else throw new RuntimeException("hostPath name must contain -# followd by hosthash: " + filename); init(); } @@ -107,14 +131,6 @@ public class HostQueue implements Balancer { * @throws MalformedURLException if directory for the host could not be created */ private final void init() throws MalformedURLException { - try { - if (this.hostName == null) - this.hostHash=""; - else - this.hostHash = DigestURL.hosthash(this.hostName, this.port); - } catch (MalformedURLException e) { - this.hostHash = ""; - } if (!(this.hostPath.exists())) { this.hostPath.mkdirs(); if (!this.hostPath.exists()) { // check if directory created (if not, likely a name violation) @@ -134,6 +150,15 @@ public class HostQueue implements Balancer { return this.port; } + /** + * Get the hosthash of this queue determined during init. 
+ * + * @return + */ + public String getHostHash() { + return this.hostHash; + } + private int openAllStacks() { String[] l = this.hostPath.list(); int c = 0; @@ -157,26 +182,6 @@ public class HostQueue implements Balancer { return c; } - public synchronized int getLowestStackDepth() { - while (this.depthStacks.size() > 0) { - Map.Entry entry; - synchronized (this) { - entry = this.depthStacks.firstEntry(); - } - if (entry == null) return 0; // happens only if map is empty - if (entry.getValue().size() == 0) { - entry.getValue().close(); - deletedelete(getFile(entry.getKey())); - this.depthStacks.remove(entry.getKey()); - continue; - } - return entry.getKey(); - } - // this should not happen but it happens if a deletion is done - //assert false; - return 0; - } - private Index getLowestStack() { while (this.depthStacks.size() > 0) { Map.Entry entry; @@ -196,16 +201,17 @@ public class HostQueue implements Balancer { //assert false; return null; } - + + /** + * Get existing url stack with crawl depth or create a new (empty) stack + * + * @param depth + * @return existing or new/empty stack + */ private Index getStack(int depth) { Index depthStack; - synchronized (this) { - depthStack = this.depthStacks.get(depth); - if (depthStack != null) return depthStack; - } // create a new stack synchronized (this) { - // check again depthStack = this.depthStacks.get(depth); if (depthStack != null) return depthStack; // now actually create a new stack From 7b226afc3360db06f661e0eac356f39ac859c64a Mon Sep 17 00:00:00 2001 From: reger Date: Wed, 6 Jul 2016 23:52:02 +0200 Subject: [PATCH 2/8] fix HostQueueTest - changed open parameter --- test/java/net/yacy/crawler/HostQueueTest.java | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/test/java/net/yacy/crawler/HostQueueTest.java b/test/java/net/yacy/crawler/HostQueueTest.java index 6f819b453..85a0fc4bc 100644 --- a/test/java/net/yacy/crawler/HostQueueTest.java +++ 
b/test/java/net/yacy/crawler/HostQueueTest.java @@ -28,14 +28,13 @@ public class HostQueueTest { public void testClear() throws MalformedURLException, IOException, SpaceExceededException { File stackDirFile = new File(stackDir); String hostDir = "a.com"; - int hostPort = 80; - + String urlstr = "http://" + hostDir + "/test.html"; + DigestURL url = new DigestURL(urlstr); + // open queue - HostQueue testhq = new HostQueue(stackDirFile, hostDir, hostPort, true, true); + HostQueue testhq = new HostQueue(stackDirFile, url, true, true); // add a url - String urlstr = "http://" + hostDir + "/test.html"; - DigestURL url = new DigestURL(urlstr); Request req = new Request(url, null); testhq.push(req, null, null); From 708bcbb042df055741ed08efdeb39556463acc6c Mon Sep 17 00:00:00 2001 From: reger Date: Thu, 7 Jul 2016 02:50:57 +0200 Subject: [PATCH 3/8] one more replacement to use cached hosthash vs. calculated --- source/net/yacy/crawler/HostBalancer.java | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/source/net/yacy/crawler/HostBalancer.java b/source/net/yacy/crawler/HostBalancer.java index b6338b6c1..970017c83 100644 --- a/source/net/yacy/crawler/HostBalancer.java +++ b/source/net/yacy/crawler/HostBalancer.java @@ -489,12 +489,10 @@ public class HostBalancer implements Balancer { @Override public Map getDomainStackHosts(RobotsTxt robots) { Map map = new TreeMap(); // we use a tree map to get a stable ordering - for (HostQueue hq: this.queues.values()) try { - int delta = Latency.waitingRemainingGuessed(hq.getHost(), hq.getPort(), DigestURL.hosthash(hq.getHost(), hq.getPort()), robots, ClientIdentification.yacyInternetCrawlerAgent); + for (HostQueue hq: this.queues.values()) { + int delta = Latency.waitingRemainingGuessed(hq.getHost(), hq.getPort(), hq.getHostHash(), robots, ClientIdentification.yacyInternetCrawlerAgent); map.put(hq.getHost() + ":" + hq.getPort(), new Integer[]{hq.size(), delta}); - } catch (MalformedURLException e) { - 
ConcurrentLog.logException(e); - } + } return map; } From 7ab41d4ff1f44c973972151f58e737a46606e54e Mon Sep 17 00:00:00 2001 From: reger Date: Sat, 9 Jul 2016 19:55:47 +0200 Subject: [PATCH 4/8] use directories original lastmodified date in file- & smbloader in response --- source/net/yacy/crawler/retrieval/FileLoader.java | 2 +- source/net/yacy/crawler/retrieval/SMBLoader.java | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/source/net/yacy/crawler/retrieval/FileLoader.java b/source/net/yacy/crawler/retrieval/FileLoader.java index 12e809914..c1aab3fc2 100644 --- a/source/net/yacy/crawler/retrieval/FileLoader.java +++ b/source/net/yacy/crawler/retrieval/FileLoader.java @@ -81,7 +81,7 @@ public class FileLoader { StringBuilder content = FTPClient.dirhtml(u, null, null, null, list, true); ResponseHeader responseHeader = new ResponseHeader(200); - responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date())); + responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date(url.lastModified()))); responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html"); final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(request.profileHandle())); Response response = new Response( diff --git a/source/net/yacy/crawler/retrieval/SMBLoader.java b/source/net/yacy/crawler/retrieval/SMBLoader.java index 8de78c8b0..dec2938b8 100644 --- a/source/net/yacy/crawler/retrieval/SMBLoader.java +++ b/source/net/yacy/crawler/retrieval/SMBLoader.java @@ -99,7 +99,7 @@ public class SMBLoader { StringBuilder content = FTPClient.dirhtml(u, null, null, null, list, true); ResponseHeader responseHeader = new ResponseHeader(200); - responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date())); + responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date(url.lastModified()))); responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html"); final CrawlProfile 
profile = this.sb.crawler.get(ASCII.getBytes(request.profileHandle())); Response response = new Response( From 3811184abdb0832ba7b08d7dd937917184721558 Mon Sep 17 00:00:00 2001 From: reger Date: Sat, 9 Jul 2016 23:39:43 +0200 Subject: [PATCH 5/8] fix GSA servlet clientIP retrieval --- source/net/yacy/http/servlets/GSAsearchServlet.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/net/yacy/http/servlets/GSAsearchServlet.java b/source/net/yacy/http/servlets/GSAsearchServlet.java index 89468c030..bd71b49ad 100644 --- a/source/net/yacy/http/servlets/GSAsearchServlet.java +++ b/source/net/yacy/http/servlets/GSAsearchServlet.java @@ -96,7 +96,7 @@ public class GSAsearchServlet extends HttpServlet { public static void respond(final HttpServletRequest header, final Switchboard sb, final OutputStream out) { // remember the peer contact for peer statistics - String clientip = header.getHeader(HeaderFramework.CONNECTION_PROP_CLIENTIP); + String clientip = header.getRemoteAddr(); if (clientip == null) clientip = ""; // read an artificial header addendum String userAgent = header.getHeader(HeaderFramework.USER_AGENT); if (userAgent == null) userAgent = ""; From 360b38d9b6cd8d2fc3b3b946b808a7b560392dc8 Mon Sep 17 00:00:00 2001 From: reger Date: Sun, 10 Jul 2016 05:44:56 +0200 Subject: [PATCH 6/8] fix CookieTest_p parameter from ResponseHeader to RequestHeader --- htroot/CookieTest_p.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/htroot/CookieTest_p.java b/htroot/CookieTest_p.java index 6179232f6..afb5ec832 100644 --- a/htroot/CookieTest_p.java +++ b/htroot/CookieTest_p.java @@ -29,6 +29,7 @@ import java.util.Iterator; import java.util.Map; +import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.ResponseHeader; import net.yacy.cora.util.CommonPattern; @@ -38,7 +39,7 @@ import net.yacy.server.servletProperties; public class CookieTest_p { - public static serverObjects respond(final ResponseHeader header, 
final serverObjects post, final serverSwitch env) { + public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) { // case if no values are requested From a9527877121e6445095e9a68946658cd56043bc7 Mon Sep 17 00:00:00 2001 From: reger Date: Mon, 11 Jul 2016 02:33:12 +0200 Subject: [PATCH 7/8] adjust opensearchdescription to return url with protocol it was called on fix http://mantis.tokeek.de/view.php?id=669 --- htroot/opensearchdescription.java | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/htroot/opensearchdescription.java b/htroot/opensearchdescription.java index de72ff1c1..59a594ade 100644 --- a/htroot/opensearchdescription.java +++ b/htroot/opensearchdescription.java @@ -39,9 +39,16 @@ public class opensearchdescription { String promoteSearchPageGreeting = env.getConfig(SwitchboardConstants.GREETING, ""); if (env.getConfigBool(SwitchboardConstants.GREETING_NETWORK_NAME, false)) promoteSearchPageGreeting = env.getConfig("network.unit.description", ""); - String thisaddress = header.get("Host", Domains.LOCALHOST); - if (thisaddress.indexOf(':',0) == -1) thisaddress += ":" + env.getLocalPort(); - String thisprotocol = env.getConfigBool("server.https", false) ? "https" : "http"; + String thisaddress = header.get("Host"); // returns host:port (if not default http/https ports) + String thisprotocol = "http"; + if (thisaddress == null) { + thisaddress = Domains.LOCALHOST + ":" + sb.getConfig("port", "8090"); + } else { + final String sslport = ":" + sb.getConfig("port.ssl", "8443"); + if (thisaddress.endsWith(sslport)) { // connection on ssl port, use https protocol + thisprotocol = "https"; + } + } final serverObjects prop = new serverObjects(); prop.put("compareyacy", post != null && post.getBoolean("compare_yacy") ? 
1 : 0); From 41d845285d1893d9bf2b9e0fd2268af88487be07 Mon Sep 17 00:00:00 2001 From: reger Date: Mon, 11 Jul 2016 04:08:24 +0200 Subject: [PATCH 8/8] add missing text for ConfigBasic.html to master.lng.xlf --- locales/master.lng.xlf | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/locales/master.lng.xlf b/locales/master.lng.xlf index 294c984de..20b4c1c31 100644 --- a/locales/master.lng.xlf +++ b/locales/master.lng.xlf @@ -1055,6 +1055,15 @@ please open your firewall for this port and/or set a virtual server option in your router to allow connections on this port + + Opening a router port is <i>not</i> a YaCy-specific task; + + + you can see instruction videos everywhere in the internet, just search for <a href="http://www.youtube.com/results?search_query=Open+Ports+on+a+Router">Open Ports on a &lt;our-router-type&gt; Router</a> and add your router type as search term. + + + However: if you fail to open a router port, you can nevertheless use YaCy with full functionality, the only function that is missing is on the side of the other YaCy users because they cannot see your peer. + Your peer can be reached by other peers