added a "fromCache" flag in Response object to omit one cache.has()

check during snippet generation. This should cause less blockings
Michael Peter Christen 13 years ago
parent 81737dcb18
commit 7e0ddbd275
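
The idea: each loader records at construction time whether the content it returns came from the local cache, and snippet generation then asks the Response itself instead of performing a second, potentially blocking Cache.has() lookup. A minimal sketch of the pattern (simplified; the real constructor also takes request/response headers, a status string and a CrawlProfile, as the diff below shows):

    // Sketch only, not the full YaCy class: constructor reduced to the
    // two fields that matter for this commit.
    public class Response {
        private final byte[] content;
        private final boolean fromCache; // provenance, set once by the loader

        public Response(final byte[] content, final boolean fromCache) {
            this.content = content;
            this.fromCache = fromCache;
        }

        // true if this content was served from the local HTCache
        public boolean fromCache() {
            return this.fromCache;
        }
    }

    // Snippet generation before this commit (separate cache lookup):
    //     boolean isInCache = Cache.has(url.hash());
    //     ...
    //     if (!isInCache && response != null) Switchboard.getSwitchboard().toIndexer(response);
    // After (the response already knows its provenance):
    //     if (!response.fromCache()) Switchboard.getSwitchboard().toIndexer(response);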

@@ -134,6 +134,7 @@ public class FTPLoader {
                         responseHeader,
                         "200",
                         profile,
+                        false,
                         dirList.toString().getBytes());
             }
         } else {
@@ -253,6 +254,7 @@ public class FTPLoader {
                 responseHeader,
                 "200",
                 profile,
+                false,
                 null);
         return response;
     }
@@ -268,6 +270,7 @@ public class FTPLoader {
                 responseHeader,
                 "200",
                 profile,
+                false,
                 b);
         return response;
     }

@@ -11,12 +11,12 @@
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public License
  * along with this program in the file lgpl21.txt
  * If not, see <http://www.gnu.org/licenses/>.
@@ -30,8 +30,6 @@ import java.util.ArrayList;
 import java.util.Date;
 import java.util.List;
 
-import de.anomic.crawler.CrawlProfile;
-
 import net.yacy.cora.document.Classification;
 import net.yacy.cora.protocol.HeaderFramework;
 import net.yacy.cora.protocol.RequestHeader;
@@ -43,6 +41,7 @@ import net.yacy.kelondro.logging.Log;
 import net.yacy.kelondro.util.FileUtils;
 import net.yacy.search.Switchboard;
 import net.yacy.search.index.Segments;
+import de.anomic.crawler.CrawlProfile;
 
 public class FileLoader {
 
@@ -53,19 +52,19 @@ public class FileLoader {
     public FileLoader(final Switchboard sb, final Log log) {
         this.sb = sb;
         this.log = log;
-        maxFileSize = (int) sb.getConfigLong("crawler.file.maxFileSize", -1l);
+        this.maxFileSize = (int) sb.getConfigLong("crawler.file.maxFileSize", -1l);
     }
 
     public Response load(final Request request, boolean acceptOnlyParseable) throws IOException {
         DigestURI url = request.url();
         if (!url.getProtocol().equals("file")) throw new IOException("wrong loader for FileLoader: " + url.getProtocol());
 
         RequestHeader requestHeader = new RequestHeader();
         if (request.referrerhash() != null) {
-            DigestURI ur = sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash());
+            DigestURI ur = this.sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash());
             if (ur != null) requestHeader.put(RequestHeader.REFERER, ur.toNormalform(true, false));
         }
 
         // process directories: transform them to html with meta robots=noindex (using the ftpc lib)
         String[] l = null;
         try {l = url.list();} catch (IOException e) {}
@@ -83,30 +82,31 @@ public class FileLoader {
             for (String s: l) {
                 list.add(u + ((u.endsWith("/") || u.endsWith("\\")) ? "" : "/") + s);
             }
 
             StringBuilder content = FTPClient.dirhtml(u, null, null, null, list, true);
 
             ResponseHeader responseHeader = new ResponseHeader();
             responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date()));
             responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html");
-            final CrawlProfile profile = sb.crawler.getActive(request.profileHandle().getBytes());
+            final CrawlProfile profile = this.sb.crawler.getActive(request.profileHandle().getBytes());
             Response response = new Response(
                     request,
                     requestHeader,
                     responseHeader,
                     "200",
                     profile,
+                    false,
                     content.toString().getBytes());
             return response;
         }
 
         // create response header
         String mime = Classification.ext2mime(url.getFileExtension());
         ResponseHeader responseHeader = new ResponseHeader();
         responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date(url.lastModified())));
         responseHeader.put(HeaderFramework.CONTENT_TYPE, mime);
 
         // check mime type and availability of parsers
         // and also check resource size and limitation of the size
         long size;
@@ -117,42 +117,44 @@ public class FileLoader {
         }
 
         String parserError = null;
         if ((acceptOnlyParseable && (parserError = TextParser.supports(url, mime)) != null) ||
-            (size > maxFileSize && maxFileSize >= 0)) {
+            (size > this.maxFileSize && this.maxFileSize >= 0)) {
             // we know that we cannot process that file before loading
             // only the metadata is returned
 
             if (parserError != null) {
-                log.logInfo("No parser available in File crawler: '" + parserError + "' for URL " + request.url().toString() + ": parsing only metadata");
+                this.log.logInfo("No parser available in File crawler: '" + parserError + "' for URL " + request.url().toString() + ": parsing only metadata");
             } else {
-                log.logInfo("Too big file in File crawler with size = " + size + " Bytes for URL " + request.url().toString() + ": parsing only metadata");
+                this.log.logInfo("Too big file in File crawler with size = " + size + " Bytes for URL " + request.url().toString() + ": parsing only metadata");
             }
 
             // create response with metadata only
             responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain");
-            final CrawlProfile profile = sb.crawler.getActive(request.profileHandle().getBytes());
+            final CrawlProfile profile = this.sb.crawler.getActive(request.profileHandle().getBytes());
             Response response = new Response(
                     request,
                     requestHeader,
                     responseHeader,
                     "200",
                     profile,
+                    false,
                     url.toTokens().getBytes());
             return response;
         }
 
         // load the resource
         InputStream is = url.getInputStream(null, -1);
         byte[] b = FileUtils.read(is);
         is.close();
 
         // create response with loaded content
-        final CrawlProfile profile = sb.crawler.getActive(request.profileHandle().getBytes());
+        final CrawlProfile profile = this.sb.crawler.getActive(request.profileHandle().getBytes());
         Response response = new Response(
                 request,
                 requestHeader,
                 responseHeader,
                 "200",
                 profile,
+                false,
                 b);
         return response;
     }

@@ -205,6 +205,7 @@ public final class HTTPLoader {
                 header,
                 Integer.toString(code),
                 profile,
+                false,
                 responseBody
         );
 
@@ -273,6 +274,7 @@ public final class HTTPLoader {
                 header,
                 Integer.toString(code),
                 null,
+                false,
                 responseBody
         );

@@ -66,6 +66,7 @@ public class Response {
     private final CrawlProfile profile;
     private byte[] content;
     private int status; // tracker indexing status, see status defs below
+    private final boolean fromCache;
 
     // doctype calculation
     public static char docType(final MultiProtocolURI url) {
@@ -151,6 +152,7 @@ public class Response {
             final ResponseHeader responseHeader,
             final String responseStatus,
             final CrawlProfile profile,
+            final boolean fromCache,
             final byte[] content) {
         this.request = request;
         // request and response headers may be zero in case that we process surrogates
@@ -160,6 +162,7 @@ public class Response {
         this.profile = profile;
         this.status = QUEUE_STATE_FRESH;
         this.content = content;
+        this.fromCache = fromCache;
     }
 
     /**
@@ -179,6 +182,7 @@ public class Response {
         this.profile = profile;
         this.status = QUEUE_STATE_FRESH;
         this.content = request.name().length() > 0 ? request.name().getBytes() : request.url().toTokens().getBytes();
+        this.fromCache = true;
     }
 
     public Response(
@@ -186,8 +190,9 @@ public class Response {
             final RequestHeader requestHeader,
             final ResponseHeader responseHeader,
             final String responseStatus,
-            final CrawlProfile profile) {
-        this(request, requestHeader, responseHeader, responseStatus, profile, null);
+            final CrawlProfile profile,
+            final boolean fromCache) {
+        this(request, requestHeader, responseHeader, responseStatus, profile, fromCache, null);
     }
 
     public void updateStatus(final int newStatus) {
@@ -198,6 +203,10 @@ public class Response {
         return this.responseHeader;
     }
 
+    public boolean fromCache() {
+        return this.fromCache;
+    }
+
     public int getStatus() {
         return this.status;
     }

@@ -9,7 +9,7 @@
 // $LastChangedBy$
 //
 // LICENSE
 //
 // This program is free software; you can redistribute it and/or modify
 // it under the terms of the GNU General Public License as published by
 // the Free Software Foundation; either version 2 of the License, or
@@ -38,9 +38,6 @@ import java.util.List;
 import jcifs.smb.SmbException;
 import jcifs.smb.SmbFile;
 import jcifs.smb.SmbFileInputStream;
 
-import de.anomic.crawler.CrawlProfile;
-
 import net.yacy.cora.document.Classification;
 import net.yacy.cora.document.MultiProtocolURI;
 import net.yacy.cora.protocol.HeaderFramework;
@@ -53,11 +50,12 @@ import net.yacy.kelondro.logging.Log;
 import net.yacy.kelondro.util.FileUtils;
 import net.yacy.search.Switchboard;
 import net.yacy.search.index.Segments;
+import de.anomic.crawler.CrawlProfile;
 
 public class SMBLoader {
 
     public static final long DEFAULT_MAXFILESIZE = 1024 * 1024 * 10;
 
     private final Switchboard sb;
     private final Log log;
     private final long maxFileSize;
@@ -65,20 +63,20 @@ public class SMBLoader {
     public SMBLoader(final Switchboard sb, final Log log) {
         this.sb = sb;
         this.log = log;
-        maxFileSize = sb.getConfigLong("crawler.smb.maxFileSize", -1l);
+        this.maxFileSize = sb.getConfigLong("crawler.smb.maxFileSize", -1l);
     }
 
     public Response load(final Request request, boolean acceptOnlyParseable) throws IOException {
         DigestURI url = request.url();
         if (!url.getProtocol().equals("smb")) throw new IOException("wrong loader for SMBLoader: " + url.getProtocol());
 
         RequestHeader requestHeader = new RequestHeader();
         if (request.referrerhash() != null) {
-            DigestURI ur = sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash());
+            DigestURI ur = this.sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash());
             if (ur != null) requestHeader.put(RequestHeader.REFERER, ur.toNormalform(true, false));
         }
 
         // process directories: transform them to html with meta robots=noindex (using the ftpc lib)
         String[] l = null;
         try {l = url.list();} catch (IOException e) {}
@@ -103,30 +101,31 @@ public class SMBLoader {
                 }
                 list.add(u + s);
             }
 
             StringBuilder content = FTPClient.dirhtml(u, null, null, null, list, true);
 
             ResponseHeader responseHeader = new ResponseHeader();
             responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date()));
             responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html");
-            final CrawlProfile profile = sb.crawler.getActive(request.profileHandle().getBytes());
+            final CrawlProfile profile = this.sb.crawler.getActive(request.profileHandle().getBytes());
             Response response = new Response(
                     request,
                     requestHeader,
                     responseHeader,
                     "200",
                     profile,
+                    false,
                     content.toString().getBytes());
             return response;
         }
 
         // create response header
         String mime = Classification.ext2mime(url.getFileExtension());
         ResponseHeader responseHeader = new ResponseHeader();
         responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date(url.lastModified())));
         responseHeader.put(HeaderFramework.CONTENT_TYPE, mime);
 
         // check mime type and availability of parsers
         // and also check resource size and limitation of the size
         long size;
@@ -137,46 +136,48 @@ public class SMBLoader {
         }
 
         String parserError = null;
         if ((acceptOnlyParseable && (parserError = TextParser.supports(url, mime)) != null) ||
-            (size > maxFileSize && maxFileSize >= 0)) {
+            (size > this.maxFileSize && this.maxFileSize >= 0)) {
             // we know that we cannot process that file before loading
             // only the metadata is returned
 
             if (parserError != null) {
-                log.logInfo("No parser available in SMB crawler: '" + parserError + "' for URL " + request.url().toString() + ": parsing only metadata");
+                this.log.logInfo("No parser available in SMB crawler: '" + parserError + "' for URL " + request.url().toString() + ": parsing only metadata");
             } else {
-                log.logInfo("Too big file in SMB crawler with size = " + size + " Bytes for URL " + request.url().toString() + ": parsing only metadata");
+                this.log.logInfo("Too big file in SMB crawler with size = " + size + " Bytes for URL " + request.url().toString() + ": parsing only metadata");
             }
 
             // create response with metadata only
             responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain");
-            final CrawlProfile profile = sb.crawler.getActive(request.profileHandle().getBytes());
+            final CrawlProfile profile = this.sb.crawler.getActive(request.profileHandle().getBytes());
             Response response = new Response(
                     request,
                     requestHeader,
                     responseHeader,
                     "200",
                     profile,
+                    false,
                     url.toTokens().getBytes());
             return response;
         }
 
         // load the resource
         InputStream is = url.getInputStream(null, -1);
         byte[] b = FileUtils.read(is);
         is.close();
 
         // create response with loaded content
-        final CrawlProfile profile = sb.crawler.getActive(request.profileHandle().getBytes());
+        final CrawlProfile profile = this.sb.crawler.getActive(request.profileHandle().getBytes());
         Response response = new Response(
                 request,
                 requestHeader,
                 responseHeader,
                 "200",
                 profile,
+                false,
                 b);
         return response;
     }
 
     public static void main(String[] args) {
         //jcifs.Config.setProperty( "jcifs.netbios.wins", "192.168.1.220" );
         //NtlmPasswordAuthentication auth = new NtlmPasswordAuthentication("domain", "username", "password");

@@ -404,7 +404,8 @@ public final class HTTPDProxyHandler {
                     requestHeader,
                     cachedResponseHeader,
                     "200 OK",
-                    sb.crawler.defaultProxyProfile
+                    sb.crawler.defaultProxyProfile,
+                    false
             );
             final byte[] cacheContent = Cache.getContent(url.hash());
             if (cacheContent != null && response.isFreshForProxy()) {
@@ -548,7 +549,8 @@ public final class HTTPDProxyHandler {
                         requestHeader,
                         responseHeader,
                         Integer.toString(client.getHttpResponse().getStatusLine().getStatusCode()),
-                        sb.crawler.defaultProxyProfile
+                        sb.crawler.defaultProxyProfile,
+                        false
                 );
                 final String storeError = response.shallStoreCacheForProxy();
                 final boolean storeHTCache = response.profile().storeHTCache();

@@ -186,12 +186,7 @@ public class DigestURI extends MultiProtocolURI implements Serializable {
      */
     public final byte[] hash() {
         // in case that the object was initialized without a known url hash, compute it now
-        if (this.hash == null) {
-            // we check the this.hash value twice to avoid synchronization where possible
-            synchronized (this.protocol) {
-                if (this.hash == null) this.hash = urlHashComputation();
-            }
-        }
+        if (this.hash == null) this.hash = urlHashComputation();
         return this.hash;
     }
 
@@ -376,11 +371,7 @@ public class DigestURI extends MultiProtocolURI implements Serializable {
     @Override
     public final boolean isLocal() {
         if (this.isFile()) return true;
-        if (this.hash == null) synchronized (this.protocol) {
-            // this is synchronized because another thread may also call the same method in between
-            // that is the reason that this.hash is checked again
-            if (this.hash == null) this.hash = urlHashComputation();
-        }
+        if (this.hash == null) this.hash = urlHashComputation();
         return domDomain(this.hash) == 7;
     }

@@ -218,6 +218,7 @@ public final class LoaderDispatcher {
                     cachedResponse,
                     "200",
                     crawlProfile,
+                    true,
                     content);
 
             // check which caching strategy shall be used

@@ -1794,7 +1794,7 @@ public final class Switchboard extends serverSwitch
                 0,
                 0,
                 0);
-            response = new Response(request, null, null, "200", this.crawler.defaultSurrogateProfile);
+            response = new Response(request, null, null, "200", this.crawler.defaultSurrogateProfile, false);
             final indexingQueueEntry queueEntry =
                     new indexingQueueEntry(Segments.Process.SURROGATES, response, new Document[] {
                         document

@@ -201,12 +201,10 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnippet> {
         removeMatchingHashes(row.dc_subject(), remainingHashes);
         removeMatchingHashes(row.url().toNormalform(true, true).replace('-', ' '), remainingHashes);
 
-        boolean isInCache = de.anomic.crawler.Cache.has(url.hash());
-
         if (remainingHashes.size() == 0) {
             // the snippet is fully inside the metadata!
 
-            if (isInCache) {
+            if (de.anomic.crawler.Cache.has(url.hash())) {
                 // get the sentences from the cache
                 final Request request = loader.request(url, true, reindexing);
                 Response response;
@@ -261,7 +259,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnippet> {
             return;
         }
 
-        if (!isInCache && response != null) {
+        if (!response.fromCache()) {
             // place entry on indexing queue
             Switchboard.getSwitchboard().toIndexer(response);
             this.resultStatus = ResultClass.SOURCE_WEB;
