From 393a7d10beee92acd2c8b26cdb5a60508a8573d3 Mon Sep 17 00:00:00 2001 From: theli Date: Mon, 4 Sep 2006 15:03:54 +0000 Subject: [PATCH] *) setting htCache.Entry fields to private git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2484 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- source/de/anomic/http/httpdProxyHandler.java | 2 +- .../plasma/crawler/http/CrawlWorker.java | 8 +-- source/de/anomic/plasma/plasmaHTCache.java | 71 ++++++++++++++----- .../de/anomic/plasma/plasmaSnippetCache.java | 2 +- .../de/anomic/plasma/plasmaSwitchboard.java | 48 ++++++------- 5 files changed, 84 insertions(+), 47 deletions(-) diff --git a/source/de/anomic/http/httpdProxyHandler.java b/source/de/anomic/http/httpdProxyHandler.java index 211f80087..4accb1604 100644 --- a/source/de/anomic/http/httpdProxyHandler.java +++ b/source/de/anomic/http/httpdProxyHandler.java @@ -630,7 +630,7 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt String storeError = cacheEntry.shallStoreCacheForProxy(); boolean storeHTCache = cacheEntry.profile.storeHTCache(); - boolean isSupportedContent = plasmaParser.supportedContent(plasmaParser.PARSER_MODE_PROXY,cacheEntry.url,cacheEntry.responseHeader.mime()); + boolean isSupportedContent = plasmaParser.supportedContent(plasmaParser.PARSER_MODE_PROXY,cacheEntry.url(),cacheEntry.responseHeader.mime()); if ( /* * Now we store the response into the htcache directory if diff --git a/source/de/anomic/plasma/crawler/http/CrawlWorker.java b/source/de/anomic/plasma/crawler/http/CrawlWorker.java index 7fe642b65..5eaafd77b 100644 --- a/source/de/anomic/plasma/crawler/http/CrawlWorker.java +++ b/source/de/anomic/plasma/crawler/http/CrawlWorker.java @@ -200,7 +200,7 @@ public final class CrawlWorker extends AbstractCrawlWorker { htCache = createCacheEntry(requestDate, requestHeader, res); // aborting download if content is to long ... - if (htCache.cacheFile.getAbsolutePath().length() > serverSystem.maxPathLength) { + if (htCache.cacheFile().getAbsolutePath().length() > serverSystem.maxPathLength) { remote.close(); this.log.logInfo("REJECTED URL " + this.url.toString() + " because path too long '" + this.cacheManager.cachePath.getAbsolutePath() + "'"); addURLtoErrorDB(plasmaCrawlEURL.DENIED_CACHEFILE_PATH_TOO_LONG); @@ -208,11 +208,11 @@ public final class CrawlWorker extends AbstractCrawlWorker { } // reserve cache entry - if (!htCache.cacheFile.getCanonicalPath().startsWith(this.cacheManager.cachePath.getCanonicalPath())) { + if (!htCache.cacheFile().getCanonicalPath().startsWith(this.cacheManager.cachePath.getCanonicalPath())) { // if the response has not the right file type then reject file remote.close(); this.log.logInfo("REJECTED URL " + this.url.toString() + " because of an invalid file path ('" + - htCache.cacheFile.getCanonicalPath() + "' does not start with '" + + htCache.cacheFile().getCanonicalPath() + "' does not start with '" + this.cacheManager.cachePath.getAbsolutePath() + "')."); addURLtoErrorDB(plasmaCrawlEURL.DENIED_INVALID_CACHEFILE_PATH); return (htCache = null); @@ -231,7 +231,7 @@ public final class CrawlWorker extends AbstractCrawlWorker { try { fos = new FileOutputStream(cacheFile); res.writeContent(fos); // superfluous write to array - htCache.cacheArray = null; + htCache.setCacheArray(null); this.cacheManager.writeFileAnnouncement(cacheFile); //htCache.cacheArray = res.writeContent(fos); // writes in cacheArray and cache file } finally { diff --git a/source/de/anomic/plasma/plasmaHTCache.java b/source/de/anomic/plasma/plasmaHTCache.java index 8fe6b0693..a1f79c61a 100644 --- a/source/de/anomic/plasma/plasmaHTCache.java +++ b/source/de/anomic/plasma/plasmaHTCache.java @@ -701,23 +701,23 @@ public final class plasmaHTCache { public final class Entry { // the class objects - public Date initDate; // the date when the request happened; will be used as a key - public int depth; // the depth of prefetching - public httpHeader requestHeader; // we carry also the header to prevent too many file system access - public String responseStatus; - public httpHeader responseHeader; // we carry also the header to prevent too many file system access - public File cacheFile; // the cache file - public byte[] cacheArray; // or the cache as byte-array - public URL url; - public String name; // the name of the link, read as anchor from an -tag - public String nomalizedURLHash; - public String nomalizedURLString; - public int status; // cache load/hit/stale etc status - public Date lastModified; - public char doctype; - public String language; - public plasmaCrawlProfile.entry profile; - private String initiator; + private Date initDate; // the date when the request happened; will be used as a key + private int depth; // the depth of prefetching + private httpHeader requestHeader; // we carry also the header to prevent too many file system access + private String responseStatus; + private httpHeader responseHeader; // we carry also the header to prevent too many file system access + private File cacheFile; // the cache file + private byte[] cacheArray; // or the cache as byte-array + private URL url; + private String name; // the name of the link, read as anchor from an -tag + private String nomalizedURLHash; + private String nomalizedURLString; + private int status; // cache load/hit/stale etc status + private Date lastModified; + private char doctype; + private String language; + private plasmaCrawlProfile.entry profile; + private String initiator; protected Object clone() throws CloneNotSupportedException { return new Entry( @@ -793,6 +793,19 @@ public final class plasmaHTCache { public String name() { return this.name; } + + public URL url() { + return this.url; + } + + public String urlHash() { + return this.nomalizedURLHash; + } + + public plasmaCrawlProfile.entry profile() { + return this.profile; + } + public String initiator() { return this.initiator; } @@ -804,6 +817,10 @@ public final class plasmaHTCache { return this.cacheArray.length; } + public int depth() { + return this.depth; + } + public URL referrerURL() { if (this.requestHeader == null) return null; try { @@ -813,6 +830,26 @@ public final class plasmaHTCache { } } + public File cacheFile() { + return this.cacheFile; + } + + public void setCacheArray(byte[] data) { + this.cacheArray = data; + } + + public byte[] cacheArray() { + return this.cacheArray; + } + + public httpHeader requestHeader() { + return this.requestHeader; + } + + public httpHeader responseHeader() { + return this.responseHeader; + } + /* public boolean update() { return ((status == CACHE_FILL) || (status == CACHE_STALE_RELOAD_GOOD)); diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java index 2ebdc5213..676f3e047 100644 --- a/source/de/anomic/plasma/plasmaSnippetCache.java +++ b/source/de/anomic/plasma/plasmaSnippetCache.java @@ -173,7 +173,7 @@ public class plasmaSnippetCache { if ((fetchOnline) && (resource == null)) { plasmaHTCache.Entry entry = loadResourceFromWeb(url, 5000); if (entry != null) { - header = entry.responseHeader; + header = entry.responseHeader(); } resource = cacheManager.loadResource(url); source = SOURCE_WEB; diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index ae8f929c1..9a13401c0 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -814,7 +814,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser * Testing if the content type is supported by the available parsers * ========================================================================= */ boolean isSupportedContent = (entry.responseHeader != null) && - plasmaParser.supportedContent(entry.url,entry.responseHeader.mime()); + plasmaParser.supportedContent(entry.url(),entry.responseHeader.mime()); /* ========================================================================= * INDEX CONTROL HEADER @@ -823,10 +823,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser * yacy to index the response returned as answer to a request * ========================================================================= */ boolean doIndexing = true; - if (entry.requestHeader != null) { + if (entry.requestHeader() != null) { if ( - (entry.requestHeader.containsKey(httpHeader.X_YACY_INDEX_CONTROL)) && - (((String) entry.requestHeader.get(httpHeader.X_YACY_INDEX_CONTROL)).toUpperCase().equals("NO-INDEX")) + (entry.requestHeader().containsKey(httpHeader.X_YACY_INDEX_CONTROL)) && + (((String) entry.requestHeader().get(httpHeader.X_YACY_INDEX_CONTROL)).toUpperCase().equals("NO-INDEX")) ) { doIndexing = false; } @@ -837,17 +837,17 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser * * check if ip is local ip address * ========================================================================= */ - InetAddress hostAddress = httpc.dnsResolve(entry.url.getHost()); + InetAddress hostAddress = httpc.dnsResolve(entry.url().getHost()); if (hostAddress == null) { if (this.remoteProxyConfig == null || !this.remoteProxyConfig.useProxy()) { - this.log.logFine("Unknown host in URL '" + entry.url + "'. Will not be indexed."); + this.log.logFine("Unknown host in URL '" + entry.url() + "'. Will not be indexed."); doIndexing = false; } } else if (hostAddress.isSiteLocalAddress()) { - this.log.logFine("Host in URL '" + entry.url + "' has private ip address. Will not be indexed."); + this.log.logFine("Host in URL '" + entry.url() + "' has private ip address. Will not be indexed."); doIndexing = false; } else if (hostAddress.isLoopbackAddress()) { - this.log.logFine("Host in URL '" + entry.url + "' has loopback ip address. Will not be indexed."); + this.log.logFine("Host in URL '" + entry.url() + "' has loopback ip address. Will not be indexed."); doIndexing = false; } @@ -859,25 +859,25 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser * b) the content should be indexed * ========================================================================= */ if ( - (entry.profile.storeHTCache()) || + (entry.profile().storeHTCache()) || (doIndexing && isSupportedContent) ) { // store response header if (entry.responseHeader != null) { - this.cacheManager.storeHeader(entry.nomalizedURLHash, entry.responseHeader); - this.log.logInfo("WROTE HEADER for " + entry.cacheFile); + this.cacheManager.storeHeader(entry.urlHash(), entry.responseHeader); + this.log.logInfo("WROTE HEADER for " + entry.cacheFile()); } // work off unwritten files - if (entry.cacheArray == null) { + if (entry.cacheArray() == null) { //this.log.logFine("EXISTING FILE (" + entry.cacheFile.length() + " bytes) for " + entry.cacheFile); } else { String error = entry.shallStoreCacheForProxy(); if (error == null) { - this.cacheManager.writeFile(entry.url, entry.cacheArray); - this.log.logFine("WROTE FILE (" + entry.cacheArray.length + " bytes) for " + entry.cacheFile); + this.cacheManager.writeFile(entry.url(), entry.cacheArray()); + this.log.logFine("WROTE FILE (" + entry.cacheArray().length + " bytes) for " + entry.cacheFile()); } else { - this.log.logFine("WRITE OF FILE " + entry.cacheFile + " FORBIDDEN: " + error); + this.log.logFine("WRITE OF FILE " + entry.cacheFile() + " FORBIDDEN: " + error); } } } @@ -888,24 +888,24 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser if (doIndexing && isSupportedContent){ // registering the cachefile as in use - if (entry.cacheFile.exists()) { - plasmaHTCache.filesInUse.add(entry.cacheFile); + if (entry.cacheFile().exists()) { + plasmaHTCache.filesInUse.add(entry.cacheFile()); } // enqueue for further crawling enQueue(this.sbQueue.newEntry( - entry.url, + entry.url(), indexURL.urlHash(entry.referrerURL()), - entry.requestHeader.ifModifiedSince(), - entry.requestHeader.containsKey(httpHeader.COOKIE), + entry.requestHeader().ifModifiedSince(), + entry.requestHeader().containsKey(httpHeader.COOKIE), entry.initiator(), - entry.depth, - entry.profile.handle(), + entry.depth(), + entry.profile().handle(), entry.name() )); } else { - if (!entry.profile.storeHTCache() && entry.cacheFile.exists()) { - this.cacheManager.deleteFile(entry.url); + if (!entry.profile().storeHTCache() && entry.cacheFile().exists()) { + this.cacheManager.deleteFile(entry.url()); } }