diff --git a/htroot/CookieTest_p.java b/htroot/CookieTest_p.java
index 6179232f6..afb5ec832 100644
--- a/htroot/CookieTest_p.java
+++ b/htroot/CookieTest_p.java
@@ -29,6 +29,7 @@
import java.util.Iterator;
import java.util.Map;
+import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.util.CommonPattern;
@@ -38,7 +39,7 @@ import net.yacy.server.servletProperties;
public class CookieTest_p {
- public static serverObjects respond(final ResponseHeader header, final serverObjects post, final serverSwitch env) {
+ public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
// case if no values are requested
diff --git a/htroot/opensearchdescription.java b/htroot/opensearchdescription.java
index de72ff1c1..59a594ade 100644
--- a/htroot/opensearchdescription.java
+++ b/htroot/opensearchdescription.java
@@ -39,9 +39,16 @@ public class opensearchdescription {
String promoteSearchPageGreeting = env.getConfig(SwitchboardConstants.GREETING, "");
if (env.getConfigBool(SwitchboardConstants.GREETING_NETWORK_NAME, false)) promoteSearchPageGreeting = env.getConfig("network.unit.description", "");
- String thisaddress = header.get("Host", Domains.LOCALHOST);
- if (thisaddress.indexOf(':',0) == -1) thisaddress += ":" + env.getLocalPort();
- String thisprotocol = env.getConfigBool("server.https", false) ? "https" : "http";
+            String thisaddress = header.get("Host"); // returns host:port (if not default http/https ports)
+ String thisprotocol = "http";
+ if (thisaddress == null) {
+ thisaddress = Domains.LOCALHOST + ":" + sb.getConfig("port", "8090");
+ } else {
+ final String sslport = ":" + sb.getConfig("port.ssl", "8443");
+ if (thisaddress.endsWith(sslport)) { // connection on ssl port, use https protocol
+ thisprotocol = "https";
+ }
+ }
final serverObjects prop = new serverObjects();
prop.put("compareyacy", post != null && post.getBoolean("compare_yacy") ? 1 : 0);
diff --git a/locales/master.lng.xlf b/locales/master.lng.xlf
index 294c984de..20b4c1c31 100644
--- a/locales/master.lng.xlf
+++ b/locales/master.lng.xlf
@@ -1055,6 +1055,15 @@
+
+
+
+
+
+
+
+
+
diff --git a/source/net/yacy/crawler/HostBalancer.java b/source/net/yacy/crawler/HostBalancer.java
index 9ffe4cf23..970017c83 100644
--- a/source/net/yacy/crawler/HostBalancer.java
+++ b/source/net/yacy/crawler/HostBalancer.java
@@ -103,7 +103,7 @@ public class HostBalancer implements Balancer {
queue.close();
FileUtils.deletedelete(queuePath);
} else {
- queues.put(DigestURL.hosthash(queue.getHost(), queue.getPort()), queue);
+ queues.put(queue.getHostHash(), queue);
}
} catch (MalformedURLException | RuntimeException e) {
log.warn("delete queue due to init error for " + hostsPath.getName() + " host=" + hoststr + " " + e.getLocalizedMessage());
@@ -244,11 +244,11 @@ public class HostBalancer implements Balancer {
public String push(final Request entry, CrawlProfile profile, final RobotsTxt robots) throws IOException, SpaceExceededException {
if (this.has(entry.url().hash())) return "double occurrence";
depthCache.put(entry.url().hash(), entry.depth());
- String hosthash = ASCII.String(entry.url().hash(), 6, 6);
+ String hosthash = entry.url().hosthash();
synchronized (this) {
HostQueue queue = this.queues.get(hosthash);
if (queue == null) {
- queue = new HostQueue(this.hostsPath, entry.url().getHost(), entry.url().getPort(), this.queues.size() > this.onDemandLimit, this.exceed134217727);
+ queue = new HostQueue(this.hostsPath, entry.url(), this.queues.size() > this.onDemandLimit, this.exceed134217727);
this.queues.put(hosthash, queue);
// profile might be null when continue crawls after YaCy restart
robots.ensureExist(entry.url(), profile == null ? ClientIdentification.yacyInternetCrawlerAgent : profile.getAgent(), true); // concurrently load all robots.txt
@@ -376,7 +376,7 @@ public class HostBalancer implements Balancer {
for (String h: lastEntries) this.roundRobinHostHashes.remove(h);
}
}
-
+
/*
// first strategy: get one entry which does not need sleep time
Iterator nhhi = this.roundRobinHostHashes.iterator();
@@ -386,7 +386,7 @@ public class HostBalancer implements Balancer {
if (rhq == null) {
nhhi.remove();
continue nosleep;
- }
+ }
int delta = Latency.waitingRemainingGuessed(rhq.getHost(), rhh, robots, ClientIdentification.yacyInternetCrawlerAgent);
if (delta <= 10 || this.roundRobinHostHashes.size() == 1 || rhq.size() == 1) {
nhhi.remove();
@@ -489,11 +489,9 @@ public class HostBalancer implements Balancer {
@Override
public Map getDomainStackHosts(RobotsTxt robots) {
Map map = new TreeMap(); // we use a tree map to get a stable ordering
- for (HostQueue hq: this.queues.values()) try {
- int delta = Latency.waitingRemainingGuessed(hq.getHost(), hq.getPort(), DigestURL.hosthash(hq.getHost(), hq.getPort()), robots, ClientIdentification.yacyInternetCrawlerAgent);
+ for (HostQueue hq: this.queues.values()) {
+ int delta = Latency.waitingRemainingGuessed(hq.getHost(), hq.getPort(), hq.getHostHash(), robots, ClientIdentification.yacyInternetCrawlerAgent);
map.put(hq.getHost() + ":" + hq.getPort(), new Integer[]{hq.size(), delta});
- } catch (MalformedURLException e) {
- ConcurrentLog.logException(e);
}
return map;
}
diff --git a/source/net/yacy/crawler/HostQueue.java b/source/net/yacy/crawler/HostQueue.java
index d16a65f39..3e9621a11 100644
--- a/source/net/yacy/crawler/HostQueue.java
+++ b/source/net/yacy/crawler/HostQueue.java
@@ -64,28 +64,48 @@ public class HostQueue implements Balancer {
private static final int EcoFSBufferSize = 1000;
private static final int objectIndexBufferSize = 1000;
- private final File hostPath;
+ private final File hostPath; // path to the stack files
private final String hostName;
- private String hostHash;
+ private final String hostHash;
private final int port;
private final boolean exceed134217727;
private final boolean onDemand;
private TreeMap depthStacks;
+ /**
+ * Create or open host queue. The host part of the hostUrl parameter is used
+ * to calculate the stack directory name.
+ *
+ * @param hostsPath
+ * @param hostUrl
+ * @param onDemand
+ * @param exceed134217727
+ * @throws MalformedURLException
+ */
public HostQueue (
final File hostsPath,
- final String hostName,
- final int port,
+ final DigestURL hostUrl, // any url from that host (only host data are extracted)
final boolean onDemand,
final boolean exceed134217727) throws MalformedURLException {
this.onDemand = onDemand;
this.exceed134217727 = exceed134217727;
- this.hostName = (hostName == null) ? "localhost" : hostName; // might be null (file://) but hostqueue needs a name (for queue file)
- this.port = port;
- this.hostPath = new File(hostsPath, this.hostName + "." + this.port);
+ this.hostName = (hostUrl.getHost() == null) ? "localhost" : hostUrl.getHost(); // might be null (file://) but hostqueue needs a name (for queue file)
+ this.port = hostUrl.getPort();
+ this.hostHash = hostUrl.hosthash(); // hosthash is calculated by protocol + hostname + port
+        // hostName/port included just for human readability (& historically), "-#" marker used to define begin of hosthash in directory name
+ this.hostPath = new File(hostsPath, this.hostName + "-#"+ this.hostHash + "." + this.port);
init();
}
+ /**
+     * Initializes host queue from cache files. The internal id of the queue is
+     * extracted from the path name and must match the key initially generated;
+     * currently the hosthash is used as id.
+ * @param hostPath path of the stack directory (containing the primary key/id of the queue)
+ * @param onDemand
+ * @param exceed134217727
+ * @throws MalformedURLException
+ */
public HostQueue (
final File hostPath,
final boolean onDemand,
@@ -95,10 +115,14 @@ public class HostQueue implements Balancer {
this.hostPath = hostPath;
// parse the hostName and port from the file name
String filename = hostPath.getName();
- int p = filename.lastIndexOf('.');
- if (p < 0) throw new RuntimeException("hostPath name must contain a dot: " + filename);
- this.hostName = filename.substring(0, p);
- this.port = Integer.parseInt(filename.substring(p + 1)); // consider "host.com" contains dot but no required port -> will throw exception
+ int pdot = filename.lastIndexOf('.');
+ if (pdot < 0) throw new RuntimeException("hostPath name must contain a dot: " + filename);
+ this.port = Integer.parseInt(filename.substring(pdot + 1)); // consider "host.com" contains dot but no required port -> will throw exception
+ int p1 = filename.lastIndexOf("-#");
+ if (p1 >= 0) {
+ this.hostName = filename.substring(0,p1);
+ this.hostHash = filename.substring(p1+2,pdot);
+        } else throw new RuntimeException("hostPath name must contain -# followed by hosthash: " + filename);
init();
}
@@ -107,14 +131,6 @@ public class HostQueue implements Balancer {
* @throws MalformedURLException if directory for the host could not be created
*/
private final void init() throws MalformedURLException {
- try {
- if (this.hostName == null)
- this.hostHash="";
- else
- this.hostHash = DigestURL.hosthash(this.hostName, this.port);
- } catch (MalformedURLException e) {
- this.hostHash = "";
- }
if (!(this.hostPath.exists())) {
this.hostPath.mkdirs();
if (!this.hostPath.exists()) { // check if directory created (if not, likely a name violation)
@@ -134,6 +150,15 @@ public class HostQueue implements Balancer {
return this.port;
}
+    /**
+     * Get the hosthash of this queue determined during init.
+     *
+     * @return the hosthash (protocol + hostname + port) identifying this queue
+     */
+ public String getHostHash() {
+ return this.hostHash;
+ }
+
private int openAllStacks() {
String[] l = this.hostPath.list();
int c = 0;
@@ -157,26 +182,6 @@ public class HostQueue implements Balancer {
return c;
}
- public synchronized int getLowestStackDepth() {
- while (this.depthStacks.size() > 0) {
- Map.Entry entry;
- synchronized (this) {
- entry = this.depthStacks.firstEntry();
- }
- if (entry == null) return 0; // happens only if map is empty
- if (entry.getValue().size() == 0) {
- entry.getValue().close();
- deletedelete(getFile(entry.getKey()));
- this.depthStacks.remove(entry.getKey());
- continue;
- }
- return entry.getKey();
- }
- // this should not happen but it happens if a deletion is done
- //assert false;
- return 0;
- }
-
private Index getLowestStack() {
while (this.depthStacks.size() > 0) {
Map.Entry entry;
@@ -196,16 +201,17 @@ public class HostQueue implements Balancer {
//assert false;
return null;
}
-
+
+ /**
+ * Get existing url stack with crawl depth or create a new (empty) stack
+ *
+ * @param depth
+ * @return existing or new/empty stack
+ */
private Index getStack(int depth) {
Index depthStack;
- synchronized (this) {
- depthStack = this.depthStacks.get(depth);
- if (depthStack != null) return depthStack;
- }
// create a new stack
synchronized (this) {
- // check again
depthStack = this.depthStacks.get(depth);
if (depthStack != null) return depthStack;
// now actually create a new stack
diff --git a/source/net/yacy/crawler/retrieval/FileLoader.java b/source/net/yacy/crawler/retrieval/FileLoader.java
index 12e809914..c1aab3fc2 100644
--- a/source/net/yacy/crawler/retrieval/FileLoader.java
+++ b/source/net/yacy/crawler/retrieval/FileLoader.java
@@ -81,7 +81,7 @@ public class FileLoader {
StringBuilder content = FTPClient.dirhtml(u, null, null, null, list, true);
ResponseHeader responseHeader = new ResponseHeader(200);
- responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date()));
+ responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date(url.lastModified())));
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html");
final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(request.profileHandle()));
Response response = new Response(
diff --git a/source/net/yacy/crawler/retrieval/SMBLoader.java b/source/net/yacy/crawler/retrieval/SMBLoader.java
index 8de78c8b0..dec2938b8 100644
--- a/source/net/yacy/crawler/retrieval/SMBLoader.java
+++ b/source/net/yacy/crawler/retrieval/SMBLoader.java
@@ -99,7 +99,7 @@ public class SMBLoader {
StringBuilder content = FTPClient.dirhtml(u, null, null, null, list, true);
ResponseHeader responseHeader = new ResponseHeader(200);
- responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date()));
+ responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date(url.lastModified())));
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html");
final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(request.profileHandle()));
Response response = new Response(
diff --git a/source/net/yacy/http/servlets/GSAsearchServlet.java b/source/net/yacy/http/servlets/GSAsearchServlet.java
index 89468c030..bd71b49ad 100644
--- a/source/net/yacy/http/servlets/GSAsearchServlet.java
+++ b/source/net/yacy/http/servlets/GSAsearchServlet.java
@@ -96,7 +96,7 @@ public class GSAsearchServlet extends HttpServlet {
public static void respond(final HttpServletRequest header, final Switchboard sb, final OutputStream out) {
// remember the peer contact for peer statistics
- String clientip = header.getHeader(HeaderFramework.CONNECTION_PROP_CLIENTIP);
+ String clientip = header.getRemoteAddr();
if (clientip == null) clientip = ""; // read an artificial header addendum
String userAgent = header.getHeader(HeaderFramework.USER_AGENT);
if (userAgent == null) userAgent = "";
diff --git a/test/java/net/yacy/crawler/HostQueueTest.java b/test/java/net/yacy/crawler/HostQueueTest.java
index 6f819b453..85a0fc4bc 100644
--- a/test/java/net/yacy/crawler/HostQueueTest.java
+++ b/test/java/net/yacy/crawler/HostQueueTest.java
@@ -28,14 +28,13 @@ public class HostQueueTest {
public void testClear() throws MalformedURLException, IOException, SpaceExceededException {
File stackDirFile = new File(stackDir);
String hostDir = "a.com";
- int hostPort = 80;
-
+ String urlstr = "http://" + hostDir + "/test.html";
+ DigestURL url = new DigestURL(urlstr);
+
// open queue
- HostQueue testhq = new HostQueue(stackDirFile, hostDir, hostPort, true, true);
+ HostQueue testhq = new HostQueue(stackDirFile, url, true, true);
// add a url
- String urlstr = "http://" + hostDir + "/test.html";
- DigestURL url = new DigestURL(urlstr);
Request req = new Request(url, null);
testhq.push(req, null, null);