diff --git a/source/de/anomic/crawler/retrieval/FTPLoader.java b/source/de/anomic/crawler/retrieval/FTPLoader.java
index acb5e5344..202c87034 100644
--- a/source/de/anomic/crawler/retrieval/FTPLoader.java
+++ b/source/de/anomic/crawler/retrieval/FTPLoader.java
@@ -134,6 +134,7 @@ public class FTPLoader {
responseHeader,
"200",
profile,
+ false,
dirList.toString().getBytes());
}
} else {
@@ -253,6 +254,7 @@ public class FTPLoader {
responseHeader,
"200",
profile,
+ false,
null);
return response;
}
@@ -268,6 +270,7 @@ public class FTPLoader {
responseHeader,
"200",
profile,
+ false,
b);
return response;
}
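
The change threaded through every loader in this patch is mechanical: the Response constructor gains a boolean fromCache argument between the CrawlProfile and the content bytes (see the Response.java hunk below), and each live-fetching call site passes false. A minimal, self-contained sketch of the pattern, with stand-in names rather than YaCy's real classes:

    // Hypothetical illustration: an immutable provenance flag is set once at
    // construction and exposed read-only, so downstream code can tell cache
    // hits from live fetches without probing the cache a second time.
    public final class ProvenanceExample {
        static final class SimpleResponse {
            private final boolean fromCache;
            private final byte[] content;
            SimpleResponse(final boolean fromCache, final byte[] content) {
                this.fromCache = fromCache;
                this.content = content;
            }
            boolean fromCache() { return this.fromCache; }
        }
        public static void main(final String[] args) {
            final SimpleResponse live = new SimpleResponse(false, "live".getBytes());
            final SimpleResponse cached = new SimpleResponse(true, "cached".getBytes());
            System.out.println(live.fromCache());   // prints false
            System.out.println(cached.fromCache()); // prints true
        }
    }
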
diff --git a/source/de/anomic/crawler/retrieval/FileLoader.java b/source/de/anomic/crawler/retrieval/FileLoader.java
index a2b9e6687..87451c169 100644
--- a/source/de/anomic/crawler/retrieval/FileLoader.java
+++ b/source/de/anomic/crawler/retrieval/FileLoader.java
@@ -11,12 +11,12 @@
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
- *
+ *
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
- *
+ *
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see &lt;http://www.gnu.org/licenses/&gt;.
@@ -30,8 +30,6 @@ import java.util.ArrayList;
import java.util.Date;
import java.util.List;
-import de.anomic.crawler.CrawlProfile;
-
import net.yacy.cora.document.Classification;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
@@ -43,6 +41,7 @@ import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.search.Switchboard;
import net.yacy.search.index.Segments;
+import de.anomic.crawler.CrawlProfile;
public class FileLoader {
@@ -53,19 +52,19 @@ public class FileLoader {
public FileLoader(final Switchboard sb, final Log log) {
this.sb = sb;
this.log = log;
- maxFileSize = (int) sb.getConfigLong("crawler.file.maxFileSize", -1l);
+ this.maxFileSize = (int) sb.getConfigLong("crawler.file.maxFileSize", -1l);
}
-
+
public Response load(final Request request, boolean acceptOnlyParseable) throws IOException {
DigestURI url = request.url();
if (!url.getProtocol().equals("file")) throw new IOException("wrong loader for FileLoader: " + url.getProtocol());
RequestHeader requestHeader = new RequestHeader();
if (request.referrerhash() != null) {
- DigestURI ur = sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash());
+ DigestURI ur = this.sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash());
if (ur != null) requestHeader.put(RequestHeader.REFERER, ur.toNormalform(true, false));
}
-
+
// process directories: transform them to html with meta robots=noindex (using the ftpc lib)
String[] l = null;
try {l = url.list();} catch (IOException e) {}
@@ -83,30 +82,31 @@ public class FileLoader {
for (String s: l) {
list.add(u + ((u.endsWith("/") || u.endsWith("\\")) ? "" : "/") + s);
}
-
+
StringBuilder content = FTPClient.dirhtml(u, null, null, null, list, true);
-
+
ResponseHeader responseHeader = new ResponseHeader();
responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date()));
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html");
- final CrawlProfile profile = sb.crawler.getActive(request.profileHandle().getBytes());
+ final CrawlProfile profile = this.sb.crawler.getActive(request.profileHandle().getBytes());
Response response = new Response(
- request,
+ request,
requestHeader,
responseHeader,
"200",
profile,
+ false,
content.toString().getBytes());
-
+
return response;
}
-
+
// create response header
String mime = Classification.ext2mime(url.getFileExtension());
ResponseHeader responseHeader = new ResponseHeader();
responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date(url.lastModified())));
responseHeader.put(HeaderFramework.CONTENT_TYPE, mime);
-
+
// check mime type and availability of parsers
// and also check resource size and limitation of the size
long size;
@@ -117,42 +117,44 @@ public class FileLoader {
}
String parserError = null;
if ((acceptOnlyParseable && (parserError = TextParser.supports(url, mime)) != null) ||
- (size > maxFileSize && maxFileSize >= 0)) {
+ (size > this.maxFileSize && this.maxFileSize >= 0)) {
// we know that we cannot process that file before loading
// only the metadata is returned
-
+
if (parserError != null) {
- log.logInfo("No parser available in File crawler: '" + parserError + "' for URL " + request.url().toString() + ": parsing only metadata");
+ this.log.logInfo("No parser available in File crawler: '" + parserError + "' for URL " + request.url().toString() + ": parsing only metadata");
} else {
- log.logInfo("Too big file in File crawler with size = " + size + " Bytes for URL " + request.url().toString() + ": parsing only metadata");
+ this.log.logInfo("Too big file in File crawler with size = " + size + " Bytes for URL " + request.url().toString() + ": parsing only metadata");
}
-
+
// create response with metadata only
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain");
- final CrawlProfile profile = sb.crawler.getActive(request.profileHandle().getBytes());
+ final CrawlProfile profile = this.sb.crawler.getActive(request.profileHandle().getBytes());
Response response = new Response(
- request,
+ request,
requestHeader,
responseHeader,
"200",
profile,
+ false,
url.toTokens().getBytes());
return response;
}
-
+
// load the resource
InputStream is = url.getInputStream(null, -1);
byte[] b = FileUtils.read(is);
is.close();
-
+
// create response with loaded content
- final CrawlProfile profile = sb.crawler.getActive(request.profileHandle().getBytes());
+ final CrawlProfile profile = this.sb.crawler.getActive(request.profileHandle().getBytes());
Response response = new Response(
- request,
+ request,
requestHeader,
responseHeader,
"200",
profile,
+ false,
b);
return response;
}
diff --git a/source/de/anomic/crawler/retrieval/HTTPLoader.java b/source/de/anomic/crawler/retrieval/HTTPLoader.java
index d68ccc743..a3547feda 100644
--- a/source/de/anomic/crawler/retrieval/HTTPLoader.java
+++ b/source/de/anomic/crawler/retrieval/HTTPLoader.java
@@ -205,6 +205,7 @@ public final class HTTPLoader {
header,
Integer.toString(code),
profile,
+ false,
responseBody
);
@@ -273,6 +274,7 @@ public final class HTTPLoader {
header,
Integer.toString(code),
null,
+ false,
responseBody
);
diff --git a/source/de/anomic/crawler/retrieval/Response.java b/source/de/anomic/crawler/retrieval/Response.java
index a3d0eaa8b..c6a854d20 100644
--- a/source/de/anomic/crawler/retrieval/Response.java
+++ b/source/de/anomic/crawler/retrieval/Response.java
@@ -66,6 +66,7 @@ public class Response {
private final CrawlProfile profile;
private byte[] content;
private int status; // tracker indexing status, see status defs below
+ private final boolean fromCache;
// doctype calculation
public static char docType(final MultiProtocolURI url) {
@@ -151,6 +152,7 @@ public class Response {
final ResponseHeader responseHeader,
final String responseStatus,
final CrawlProfile profile,
+ final boolean fromCache,
final byte[] content) {
this.request = request;
// request and response headers may be zero in case that we process surrogates
@@ -160,6 +162,7 @@ public class Response {
this.profile = profile;
this.status = QUEUE_STATE_FRESH;
this.content = content;
+ this.fromCache = fromCache;
}
/**
@@ -179,6 +182,7 @@ public class Response {
this.profile = profile;
this.status = QUEUE_STATE_FRESH;
this.content = request.name().length() > 0 ? request.name().getBytes() : request.url().toTokens().getBytes();
+ this.fromCache = true;
}
public Response(
@@ -186,8 +190,9 @@ public class Response {
final RequestHeader requestHeader,
final ResponseHeader responseHeader,
final String responseStatus,
- final CrawlProfile profile) {
- this(request, requestHeader, responseHeader, responseStatus, profile, null);
+ final CrawlProfile profile,
+ final boolean fromCache) {
+ this(request, requestHeader, responseHeader, responseStatus, profile, fromCache, null);
}
public void updateStatus(final int newStatus) {
@@ -198,6 +203,10 @@ public class Response {
return this.responseHeader;
}
+ public boolean fromCache() {
+ return this.fromCache;
+ }
+
public int getStatus() {
return this.status;
}
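
Of the three constructors, the full one now takes fromCache explicitly, the shortened overload forwards it with null content, and the constructor that builds a pseudo-response from the request alone hard-codes true, presumably because no fresh fetch is involved there. The new accessor lets consumers gate follow-up work on provenance; a hypothetical, self-contained sketch (stand-in types, not YaCy's API):

    // Hypothetical consumer-side sketch: a cache write is skipped when the
    // bytes already came from the cache, the kind of decision the new
    // fromCache() accessor enables.
    public final class CacheWriteGate {
        interface Store { void put(byte[] content); }

        static void afterLoad(final boolean fromCache, final byte[] content, final Store store) {
            if (!fromCache) store.put(content); // only live fetches are (re)written
        }

        public static void main(final String[] args) {
            final Store store = content -> System.out.println("stored " + content.length + " bytes");
            afterLoad(false, "live".getBytes(), store);  // stored 4 bytes
            afterLoad(true, "cached".getBytes(), store); // no output
        }
    }
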
diff --git a/source/de/anomic/crawler/retrieval/SMBLoader.java b/source/de/anomic/crawler/retrieval/SMBLoader.java
index d3e516b00..e968263be 100644
--- a/source/de/anomic/crawler/retrieval/SMBLoader.java
+++ b/source/de/anomic/crawler/retrieval/SMBLoader.java
@@ -9,7 +9,7 @@
// $LastChangedBy$
//
// LICENSE
-//
+//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
@@ -38,9 +38,6 @@ import java.util.List;
import jcifs.smb.SmbException;
import jcifs.smb.SmbFile;
import jcifs.smb.SmbFileInputStream;
-
-import de.anomic.crawler.CrawlProfile;
-
import net.yacy.cora.document.Classification;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.protocol.HeaderFramework;
@@ -53,11 +50,12 @@ import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.search.Switchboard;
import net.yacy.search.index.Segments;
+import de.anomic.crawler.CrawlProfile;
public class SMBLoader {
public static final long DEFAULT_MAXFILESIZE = 1024 * 1024 * 10;
-
+
private final Switchboard sb;
private final Log log;
private final long maxFileSize;
@@ -65,20 +63,20 @@ public class SMBLoader {
public SMBLoader(final Switchboard sb, final Log log) {
this.sb = sb;
this.log = log;
- maxFileSize = sb.getConfigLong("crawler.smb.maxFileSize", -1l);
+ this.maxFileSize = sb.getConfigLong("crawler.smb.maxFileSize", -1l);
}
-
-
+
+
public Response load(final Request request, boolean acceptOnlyParseable) throws IOException {
DigestURI url = request.url();
if (!url.getProtocol().equals("smb")) throw new IOException("wrong loader for SMBLoader: " + url.getProtocol());
RequestHeader requestHeader = new RequestHeader();
if (request.referrerhash() != null) {
- DigestURI ur = sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash());
+ DigestURI ur = this.sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash());
if (ur != null) requestHeader.put(RequestHeader.REFERER, ur.toNormalform(true, false));
}
-
+
// process directories: transform them to html with meta robots=noindex (using the ftpc lib)
String[] l = null;
try {l = url.list();} catch (IOException e) {}
@@ -103,30 +101,31 @@ public class SMBLoader {
}
list.add(u + s);
}
-
+
StringBuilder content = FTPClient.dirhtml(u, null, null, null, list, true);
-
+
ResponseHeader responseHeader = new ResponseHeader();
responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date()));
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html");
- final CrawlProfile profile = sb.crawler.getActive(request.profileHandle().getBytes());
+ final CrawlProfile profile = this.sb.crawler.getActive(request.profileHandle().getBytes());
Response response = new Response(
- request,
+ request,
requestHeader,
responseHeader,
"200",
profile,
+ false,
content.toString().getBytes());
-
+
return response;
}
-
+
// create response header
String mime = Classification.ext2mime(url.getFileExtension());
ResponseHeader responseHeader = new ResponseHeader();
responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date(url.lastModified())));
responseHeader.put(HeaderFramework.CONTENT_TYPE, mime);
-
+
// check mime type and availability of parsers
// and also check resource size and limitation of the size
long size;
@@ -137,46 +136,48 @@ public class SMBLoader {
}
String parserError = null;
if ((acceptOnlyParseable && (parserError = TextParser.supports(url, mime)) != null) ||
- (size > maxFileSize && maxFileSize >= 0)) {
+ (size > this.maxFileSize && this.maxFileSize >= 0)) {
// we know that we cannot process that file before loading
// only the metadata is returned
-
+
if (parserError != null) {
- log.logInfo("No parser available in SMB crawler: '" + parserError + "' for URL " + request.url().toString() + ": parsing only metadata");
+ this.log.logInfo("No parser available in SMB crawler: '" + parserError + "' for URL " + request.url().toString() + ": parsing only metadata");
} else {
- log.logInfo("Too big file in SMB crawler with size = " + size + " Bytes for URL " + request.url().toString() + ": parsing only metadata");
+ this.log.logInfo("Too big file in SMB crawler with size = " + size + " Bytes for URL " + request.url().toString() + ": parsing only metadata");
}
-
+
// create response with metadata only
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain");
- final CrawlProfile profile = sb.crawler.getActive(request.profileHandle().getBytes());
+ final CrawlProfile profile = this.sb.crawler.getActive(request.profileHandle().getBytes());
Response response = new Response(
- request,
+ request,
requestHeader,
responseHeader,
"200",
profile,
+ false,
url.toTokens().getBytes());
return response;
}
-
+
// load the resource
InputStream is = url.getInputStream(null, -1);
byte[] b = FileUtils.read(is);
is.close();
-
+
// create response with loaded content
- final CrawlProfile profile = sb.crawler.getActive(request.profileHandle().getBytes());
+ final CrawlProfile profile = this.sb.crawler.getActive(request.profileHandle().getBytes());
Response response = new Response(
- request,
+ request,
requestHeader,
responseHeader,
"200",
profile,
+ false,
b);
return response;
}
-
+
public static void main(String[] args) {
//jcifs.Config.setProperty( "jcifs.netbios.wins", "192.168.1.220" );
//NtlmPasswordAuthentication auth = new NtlmPasswordAuthentication("domain", "username", "password");
diff --git a/source/de/anomic/http/server/HTTPDProxyHandler.java b/source/de/anomic/http/server/HTTPDProxyHandler.java
index 54f83ce31..7fcc86ac9 100644
--- a/source/de/anomic/http/server/HTTPDProxyHandler.java
+++ b/source/de/anomic/http/server/HTTPDProxyHandler.java
@@ -404,7 +404,8 @@ public final class HTTPDProxyHandler {
requestHeader,
cachedResponseHeader,
"200 OK",
- sb.crawler.defaultProxyProfile
+ sb.crawler.defaultProxyProfile,
+ false
);
final byte[] cacheContent = Cache.getContent(url.hash());
if (cacheContent != null && response.isFreshForProxy()) {
@@ -548,7 +549,8 @@ public final class HTTPDProxyHandler {
requestHeader,
responseHeader,
Integer.toString(client.getHttpResponse().getStatusLine().getStatusCode()),
- sb.crawler.defaultProxyProfile
+ sb.crawler.defaultProxyProfile,
+ false
);
final String storeError = response.shallStoreCacheForProxy();
final boolean storeHTCache = response.profile().storeHTCache();
diff --git a/source/net/yacy/kelondro/data/meta/DigestURI.java b/source/net/yacy/kelondro/data/meta/DigestURI.java
index 6e2cd9491..5b2b95aab 100644
--- a/source/net/yacy/kelondro/data/meta/DigestURI.java
+++ b/source/net/yacy/kelondro/data/meta/DigestURI.java
@@ -186,12 +186,7 @@ public class DigestURI extends MultiProtocolURI implements Serializable {
*/
public final byte[] hash() {
// in case that the object was initialized without a known url hash, compute it now
- if (this.hash == null) {
- // we check the this.hash value twice to avoid synchronization where possible
- synchronized (this.protocol) {
- if (this.hash == null) this.hash = urlHashComputation();
- }
- }
+ if (this.hash == null) this.hash = urlHashComputation();
return this.hash;
}
@@ -376,11 +371,7 @@ public class DigestURI extends MultiProtocolURI implements Serializable {
@Override
public final boolean isLocal() {
if (this.isFile()) return true;
- if (this.hash == null) synchronized (this.protocol) {
- // this is synchronized because another thread may also call the same method in between
- // that is the reason that this.hash is checked again
- if (this.hash == null) this.hash = urlHashComputation();
- }
+ if (this.hash == null) this.hash = urlHashComputation();
return domDomain(this.hash) == 7;
}
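
The two DigestURI hunks drop the double-checked locking around the lazy hash computation. Assuming, as the patch apparently does, that urlHashComputation() is deterministic, concurrent callers can at worst duplicate work and assign equal values. A standalone sketch of the resulting pattern (names hypothetical):

    // Sketch of unsynchronized lazy initialization: two threads may both run
    // compute(), but compute() is deterministic, so they assign equal values
    // and the race only costs duplicate work. Strictly, the Java memory model
    // gives no publication guarantee for the non-volatile field; the patch
    // trades that theoretical risk for lock-free reads.
    public final class LazyHash {
        private byte[] hash; // lazily computed, deliberately unsynchronized
        private final String url;

        LazyHash(final String url) { this.url = url; }

        byte[] hash() {
            if (this.hash == null) this.hash = compute(); // benign race
            return this.hash;
        }

        private byte[] compute() {
            // stand-in for urlHashComputation()
            return Integer.toHexString(this.url.hashCode()).getBytes();
        }

        public static void main(final String[] args) {
            System.out.println(new String(new LazyHash("http://example.org/").hash()));
        }
    }
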
diff --git a/source/net/yacy/repository/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java
index 254f0c66a..82cef5fd4 100644
--- a/source/net/yacy/repository/LoaderDispatcher.java
+++ b/source/net/yacy/repository/LoaderDispatcher.java
@@ -218,6 +218,7 @@ public final class LoaderDispatcher {
cachedResponse,
"200",
crawlProfile,
+ true,
content);
// check which caching strategy shall be used
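
LoaderDispatcher is the only call site in this patch that passes true: the Response there is assembled from a previously cached header and body. A self-contained sketch of that cache-first split, with hypothetical names rather than YaCy's API:

    import java.util.HashMap;
    import java.util.Map;

    // Hypothetical cache-first lookup: a hit sets the provenance flag to
    // true; a miss falls through to a live fetch and sets it to false.
    public final class CacheFirstExample {
        private static final Map<String, byte[]> CACHE = new HashMap<>();

        static byte[] fetchLive(final String url) {
            return ("live content for " + url).getBytes();
        }

        public static void main(final String[] args) {
            CACHE.put("http://example.org/cached", "cached body".getBytes());
            for (final String url : new String[] {"http://example.org/cached", "http://example.org/fresh"}) {
                byte[] content = CACHE.get(url);
                final boolean fromCache = content != null; // the flag this patch threads through
                if (!fromCache) content = fetchLive(url);
                System.out.println(url + " fromCache=" + fromCache + " bytes=" + content.length);
            }
        }
    }
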
diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java
index de1ac8465..171a13905 100644
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@@ -1794,7 +1794,7 @@ public final class Switchboard extends serverSwitch
0,
0,
0);
- response = new Response(request, null, null, "200", this.crawler.defaultSurrogateProfile);
+ response = new Response(request, null, null, "200", this.crawler.defaultSurrogateProfile, false);
final indexingQueueEntry queueEntry =
new indexingQueueEntry(Segments.Process.SURROGATES, response, new Document[] {
document
diff --git a/source/net/yacy/search/snippet/TextSnippet.java b/source/net/yacy/search/snippet/TextSnippet.java
index 2e00cf792..c0d0c5fa9 100644
--- a/source/net/yacy/search/snippet/TextSnippet.java
+++ b/source/net/yacy/search/snippet/TextSnippet.java
@@ -201,12 +201,10 @@ public class TextSnippet implements Comparable&lt;TextSnippet&gt;, Comparator&lt;TextSnippet&gt;