From 7989335ed653ca6befeabf50d4a990078d952f1c Mon Sep 17 00:00:00 2001 From: orbiter Date: Tue, 19 Aug 2008 14:10:40 +0000 Subject: [PATCH] Preparations to replace the HTCache with a new storage data structure: - refactoring of the HTCache (separation of cache entry) - added new storage class for BLOBs. (not used yet, this is half-way to a new structure) git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5062 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- build.properties | 2 +- htroot/ViewFile.java | 3 +- source/de/anomic/crawler/CrawlQueues.java | 4 +- source/de/anomic/crawler/FTPLoader.java | 13 +- source/de/anomic/crawler/HTTPLoader.java | 10 +- source/de/anomic/crawler/LoaderMessage.java | 10 +- source/de/anomic/crawler/ProtocolLoader.java | 6 +- .../de/anomic/http/httpdProxyCacheEntry.java | 335 +++++++++++++++++ source/de/anomic/http/httpdProxyHandler.java | 7 +- source/de/anomic/icap/icapd.java | 4 +- .../de/anomic/kelondro/kelondroBLOBArray.java | 242 +++++++++++++ .../kelondro/kelondroMergeIterator.java | 4 +- .../anomic/kelondro/kelondroSplitTable.java | 13 +- source/de/anomic/plasma/plasmaHTCache.java | 340 +----------------- .../de/anomic/plasma/plasmaSnippetCache.java | 7 +- .../de/anomic/plasma/plasmaSwitchboard.java | 3 +- source/de/anomic/plasma/plasmaWordIndex.java | 5 +- source/de/anomic/ymage/ymageOSM.java | 3 +- 18 files changed, 644 insertions(+), 367 deletions(-) create mode 100755 source/de/anomic/http/httpdProxyCacheEntry.java create mode 100755 source/de/anomic/kelondro/kelondroBLOBArray.java diff --git a/build.properties b/build.properties index 135ba9bdf..19c3c4fbc 100644 --- a/build.properties +++ b/build.properties @@ -3,7 +3,7 @@ javacSource=1.5 javacTarget=1.5 # Release Configuration -releaseVersion=0.593 +releaseVersion=0.594 stdReleaseFile=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz embReleaseFile=yacy_emb_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz 
proReleaseFile=yacy_pro_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java index 90f9305b4..bd3d19f3b 100644 --- a/htroot/ViewFile.java +++ b/htroot/ViewFile.java @@ -37,6 +37,7 @@ import de.anomic.data.htmlTools; import de.anomic.htmlFilter.htmlFilterImageEntry; import de.anomic.http.HttpClient; import de.anomic.http.httpHeader; +import de.anomic.http.httpdProxyCacheEntry; import de.anomic.index.indexURLReference; import de.anomic.plasma.plasmaCondenser; import de.anomic.plasma.plasmaHTCache; @@ -145,7 +146,7 @@ public class ViewFile { // if the resource body was not cached we try to load it from web if (resource == null) { - plasmaHTCache.Entry entry = null; + httpdProxyCacheEntry entry = null; try { entry = sb.crawlQueues.loadResourceFromWeb(url, 5000, false, true, false); } catch (final Exception e) { diff --git a/source/de/anomic/crawler/CrawlQueues.java b/source/de/anomic/crawler/CrawlQueues.java index 0bdae76cc..e574851bb 100644 --- a/source/de/anomic/crawler/CrawlQueues.java +++ b/source/de/anomic/crawler/CrawlQueues.java @@ -36,8 +36,8 @@ import java.util.Iterator; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; +import de.anomic.http.httpdProxyCacheEntry; import de.anomic.kelondro.kelondroFlexWidthArray; -import de.anomic.plasma.plasmaHTCache; import de.anomic.plasma.plasmaParser; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaSwitchboardConstants; @@ -465,7 +465,7 @@ public class CrawlQueues { return; } - public plasmaHTCache.Entry loadResourceFromWeb( + public httpdProxyCacheEntry loadResourceFromWeb( final yacyURL url, final int socketTimeout, final boolean keepInMemory, diff --git a/source/de/anomic/crawler/FTPLoader.java b/source/de/anomic/crawler/FTPLoader.java index 0953d6cee..dab1224e1 100644 --- a/source/de/anomic/crawler/FTPLoader.java +++ b/source/de/anomic/crawler/FTPLoader.java @@ -34,6 +34,7 @@ import java.io.PrintStream; import 
java.io.PrintWriter; import java.util.Date; +import de.anomic.http.httpdProxyCacheEntry; import de.anomic.net.ftpc; import de.anomic.plasma.plasmaHTCache; import de.anomic.plasma.plasmaParser; @@ -54,9 +55,9 @@ public class FTPLoader { maxFileSize = (int) sb.getConfigLong("crawler.ftp.maxFileSize", -1l); } - protected plasmaHTCache.Entry createCacheEntry(final CrawlEntry entry, final String mimeType, + protected httpdProxyCacheEntry createCacheEntry(final CrawlEntry entry, final String mimeType, final Date fileDate) { - return plasmaHTCache.newEntry(new Date(), entry.depth(), entry.url(), entry.name(), "OK", new ResourceInfo( + return plasmaHTCache.newEntry(entry.depth(), entry.url(), entry.name(), "OK", new ResourceInfo( entry.url(), sb.getURL(entry.referrerhash()), mimeType, fileDate), entry.initiator(), sb.webIndex.profilesActiveCrawls.getEntry(entry.profileHandle())); } @@ -67,13 +68,13 @@ public class FTPLoader { * @param entry * @return */ - public plasmaHTCache.Entry load(final CrawlEntry entry) { + public httpdProxyCacheEntry load(final CrawlEntry entry) { final yacyURL entryUrl = entry.url(); final String fullPath = getPath(entryUrl); final File cacheFile = createCachefile(entryUrl); // the return value - plasmaHTCache.Entry htCache = null; + httpdProxyCacheEntry htCache = null; // determine filename and path String file, path; @@ -232,7 +233,7 @@ public class FTPLoader { * @return * @throws Exception */ - private plasmaHTCache.Entry getFile(final ftpc ftpClient, final CrawlEntry entry, final File cacheFile) + private httpdProxyCacheEntry getFile(final ftpc ftpClient, final CrawlEntry entry, final File cacheFile) throws Exception { // determine the mimetype of the resource final yacyURL entryUrl = entry.url(); @@ -242,7 +243,7 @@ public class FTPLoader { // if the mimetype and file extension is supported we start to download // the file - plasmaHTCache.Entry htCache = null; + httpdProxyCacheEntry htCache = null; if 
(plasmaParser.supportedContent(plasmaParser.PARSER_MODE_CRAWLER, entryUrl, mimeType)) { // aborting download if content is too long final int size = ftpClient.fileSize(path); diff --git a/source/de/anomic/crawler/HTTPLoader.java b/source/de/anomic/crawler/HTTPLoader.java index c7dd43592..0e9e054bd 100644 --- a/source/de/anomic/crawler/HTTPLoader.java +++ b/source/de/anomic/crawler/HTTPLoader.java @@ -42,6 +42,7 @@ import de.anomic.http.httpHeader; import de.anomic.http.httpdBoundedSizeOutputStream; import de.anomic.http.httpdByteCountOutputStream; import de.anomic.http.httpdLimitExceededException; +import de.anomic.http.httpdProxyCacheEntry; import de.anomic.index.indexReferenceBlacklist; import de.anomic.plasma.plasmaHTCache; import de.anomic.plasma.plasmaParser; @@ -95,10 +96,9 @@ public final class HTTPLoader { * @param responseStatus Status-Code SPACE Reason-Phrase * @return */ - protected plasmaHTCache.Entry createCacheEntry(final CrawlEntry entry, final Date requestDate, final httpHeader requestHeader, final httpHeader responseHeader, final String responseStatus) { + protected httpdProxyCacheEntry createCacheEntry(final CrawlEntry entry, final Date requestDate, final httpHeader requestHeader, final httpHeader responseHeader, final String responseStatus) { final IResourceInfo resourceInfo = new ResourceInfo(entry.url(), requestHeader, responseHeader); return plasmaHTCache.newEntry( - requestDate, entry.depth(), entry.url(), entry.name(), @@ -109,11 +109,11 @@ public final class HTTPLoader { ); } - public plasmaHTCache.Entry load(final CrawlEntry entry, final String parserMode) { + public httpdProxyCacheEntry load(final CrawlEntry entry, final String parserMode) { return load(entry, parserMode, DEFAULT_CRAWLING_RETRY_COUNT); } - private plasmaHTCache.Entry load(final CrawlEntry entry, final String parserMode, final int retryCount) { + private httpdProxyCacheEntry load(final CrawlEntry entry, final String parserMode, final int retryCount) { if (retryCount < 0) { 
this.log.logInfo("Redirection counter exceeded for URL " + entry.url().toString() + ". Processing aborted."); @@ -137,7 +137,7 @@ public final class HTTPLoader { } // take a file from the net - plasmaHTCache.Entry htCache = null; + httpdProxyCacheEntry htCache = null; final long maxFileSize = sb.getConfigLong("crawler.http.maxFileSize", DEFAULT_MAXFILESIZE); try { // create a request header diff --git a/source/de/anomic/crawler/LoaderMessage.java b/source/de/anomic/crawler/LoaderMessage.java index 54ed0a4b2..c9c5e287a 100644 --- a/source/de/anomic/crawler/LoaderMessage.java +++ b/source/de/anomic/crawler/LoaderMessage.java @@ -23,7 +23,7 @@ package de.anomic.crawler; -import de.anomic.plasma.plasmaHTCache; +import de.anomic.http.httpdProxyCacheEntry; import de.anomic.server.serverSemaphore; import de.anomic.yacy.yacyURL; @@ -41,7 +41,7 @@ public final class LoaderMessage { public final boolean keepInMemory; private serverSemaphore resultSync = null; - private plasmaHTCache.Entry result; + private httpdProxyCacheEntry result; private String errorMessage; // loadParallel(URL url, String referer, String initiator, int depth, plasmaCrawlProfile.entry profile) { @@ -80,7 +80,7 @@ public final class LoaderMessage { return this.errorMessage; } - public void setResult(final plasmaHTCache.Entry theResult) { + public void setResult(final httpdProxyCacheEntry theResult) { // store the result this.result = theResult; @@ -88,8 +88,8 @@ public final class LoaderMessage { this.resultSync.V(); } - public plasmaHTCache.Entry waitForResult() throws InterruptedException { - plasmaHTCache.Entry theResult = null; + public httpdProxyCacheEntry waitForResult() throws InterruptedException { + httpdProxyCacheEntry theResult = null; this.resultSync.P(); /* =====> CRITICAL SECTION <======== */ diff --git a/source/de/anomic/crawler/ProtocolLoader.java b/source/de/anomic/crawler/ProtocolLoader.java index e6653c03f..0044a12ad 100644 --- a/source/de/anomic/crawler/ProtocolLoader.java +++ 
b/source/de/anomic/crawler/ProtocolLoader.java @@ -32,7 +32,7 @@ import java.util.Iterator; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; -import de.anomic.plasma.plasmaHTCache; +import de.anomic.http.httpdProxyCacheEntry; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.server.serverCore; import de.anomic.server.logging.serverLog; @@ -68,7 +68,7 @@ public final class ProtocolLoader { return (HashSet) this.supportedProtocols.clone(); } - public plasmaHTCache.Entry load(final CrawlEntry entry, final String parserMode) { + public httpdProxyCacheEntry load(final CrawlEntry entry, final String parserMode) { // getting the protocol of the next URL final String protocol = entry.url().getProtocol(); final String host = entry.url().getHost(); @@ -109,7 +109,7 @@ public final class ProtocolLoader { public String process(final CrawlEntry entry, final String parserMode) { // load a resource, store it to htcache and push queue entry to switchboard queue // returns null if everything went fine, a fail reason string if a problem occurred - plasmaHTCache.Entry h; + httpdProxyCacheEntry h; try { h = load(entry, parserMode); entry.setStatus("loaded"); diff --git a/source/de/anomic/http/httpdProxyCacheEntry.java b/source/de/anomic/http/httpdProxyCacheEntry.java new file mode 100755 index 000000000..ddbcd1dea --- /dev/null +++ b/source/de/anomic/http/httpdProxyCacheEntry.java @@ -0,0 +1,335 @@ +// httpdProxyCacheEntry.java +// (C) 2008 by Michael Peter Christen; mc@yacy.net, Frankfurt a. 
M., Germany +// first published 19.08.2008 on http://yacy.net +// +// This is a part of YaCy, a peer-to-peer based web search engine +// +// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ +// $LastChangedRevision: 1986 $ +// $LastChangedBy: orbiter $ +// +// LICENSE +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +package de.anomic.http; + +import java.io.File; +import java.util.Date; + +import de.anomic.crawler.CrawlProfile; +import de.anomic.plasma.plasmaHTCache; +import de.anomic.plasma.cache.IResourceInfo; +import de.anomic.server.serverSystem; +import de.anomic.yacy.yacyURL; + +public class httpdProxyCacheEntry { + + // doctypes: + public static final char DT_PDFPS = 'p'; + public static final char DT_TEXT = 't'; + public static final char DT_HTML = 'h'; + public static final char DT_DOC = 'd'; + public static final char DT_IMAGE = 'i'; + public static final char DT_MOVIE = 'm'; + public static final char DT_FLASH = 'f'; + public static final char DT_SHARE = 's'; + public static final char DT_AUDIO = 'a'; + public static final char DT_BINARY = 'b'; + public static final char DT_UNKNOWN = 'u'; + + // the class objects + private final int depth; // the depth of pre-fetching + private final String responseStatus; + private final File cacheFile; // the cache file + private byte[] 
cacheArray; // or the cache as byte-array + private final yacyURL url; + private final String name; // the name of the link, read as anchor from an -tag + private final Date lastModified; + private char doctype; + private final String language; + private final CrawlProfile.entry profile; + private final String initiator; + + /** + * protocol specific information about the resource + */ + private final IResourceInfo resInfo; + + // doctype calculation + public static char docType(final yacyURL url) { + final String path = url.getPath().toLowerCase(); + // serverLog.logFinest("PLASMA", "docType URL=" + path); + char doctype = DT_UNKNOWN; + if (path.endsWith(".gif")) { doctype = DT_IMAGE; } + else if (path.endsWith(".ico")) { doctype = DT_IMAGE; } + else if (path.endsWith(".bmp")) { doctype = DT_IMAGE; } + else if (path.endsWith(".jpg")) { doctype = DT_IMAGE; } + else if (path.endsWith(".jpeg")) { doctype = DT_IMAGE; } + else if (path.endsWith(".png")) { doctype = DT_IMAGE; } + else if (path.endsWith(".html")) { doctype = DT_HTML; } + else if (path.endsWith(".txt")) { doctype = DT_TEXT; } + else if (path.endsWith(".doc")) { doctype = DT_DOC; } + else if (path.endsWith(".rtf")) { doctype = DT_DOC; } + else if (path.endsWith(".pdf")) { doctype = DT_PDFPS; } + else if (path.endsWith(".ps")) { doctype = DT_PDFPS; } + else if (path.endsWith(".avi")) { doctype = DT_MOVIE; } + else if (path.endsWith(".mov")) { doctype = DT_MOVIE; } + else if (path.endsWith(".qt")) { doctype = DT_MOVIE; } + else if (path.endsWith(".mpg")) { doctype = DT_MOVIE; } + else if (path.endsWith(".md5")) { doctype = DT_SHARE; } + else if (path.endsWith(".mpeg")) { doctype = DT_MOVIE; } + else if (path.endsWith(".asf")) { doctype = DT_FLASH; } + return doctype; + } + + public static char docType(final String mime) { + // serverLog.logFinest("PLASMA", "docType mime=" + mime); + char doctype = DT_UNKNOWN; + if (mime == null) doctype = DT_UNKNOWN; + else if (mime.startsWith("image/")) doctype = DT_IMAGE; 
+ else if (mime.endsWith("/gif")) doctype = DT_IMAGE; + else if (mime.endsWith("/jpeg")) doctype = DT_IMAGE; + else if (mime.endsWith("/png")) doctype = DT_IMAGE; + else if (mime.endsWith("/html")) doctype = DT_HTML; + else if (mime.endsWith("/rtf")) doctype = DT_DOC; + else if (mime.endsWith("/pdf")) doctype = DT_PDFPS; + else if (mime.endsWith("/octet-stream")) doctype = DT_BINARY; + else if (mime.endsWith("/x-shockwave-flash")) doctype = DT_FLASH; + else if (mime.endsWith("/msword")) doctype = DT_DOC; + else if (mime.endsWith("/mspowerpoint")) doctype = DT_DOC; + else if (mime.endsWith("/postscript")) doctype = DT_PDFPS; + else if (mime.startsWith("text/")) doctype = DT_TEXT; + else if (mime.startsWith("image/")) doctype = DT_IMAGE; + else if (mime.startsWith("audio/")) doctype = DT_AUDIO; + else if (mime.startsWith("video/")) doctype = DT_MOVIE; + //bz2 = application/x-bzip2 + //dvi = application/x-dvi + //gz = application/gzip + //hqx = application/mac-binhex40 + //lha = application/x-lzh + //lzh = application/x-lzh + //pac = application/x-ns-proxy-autoconfig + //php = application/x-httpd-php + //phtml = application/x-httpd-php + //rss = application/xml + //tar = application/tar + //tex = application/x-tex + //tgz = application/tar + //torrent = application/x-bittorrent + //xhtml = application/xhtml+xml + //xla = application/msexcel + //xls = application/msexcel + //xsl = application/xml + //xml = application/xml + //Z = application/x-compress + //zip = application/zip + return doctype; + } + + public httpdProxyCacheEntry(final int depth, + final yacyURL url, final String name, final String responseStatus, + final IResourceInfo resourceInfo, final String initiator, + final CrawlProfile.entry profile) { + if (resourceInfo == null) { + System.out.println("Content information object is null. 
" + url); + System.exit(0); + } + this.resInfo = resourceInfo; + this.url = url; + this.name = name; + this.cacheFile = plasmaHTCache.getCachePath(this.url); + + // assigned: + this.depth = depth; + this.responseStatus = responseStatus; + this.profile = profile; + this.initiator = (initiator == null) ? null : ((initiator.length() == 0) ? null : initiator); + + // getting the last modified date + this.lastModified = resourceInfo.getModificationDate(); + + // getting the doctype + this.doctype = docType(resourceInfo.getMimeType()); + if (this.doctype == DT_UNKNOWN) + this.doctype = docType(url); + this.language = yacyURL.language(url); + + // to be defined later: + this.cacheArray = null; + } + + public String name() { + // the anchor name; can be either the text inside the anchor tag or the + // page description after loading of the page + return this.name; + } + + public yacyURL url() { + return this.url; + } + + public String urlHash() { + return this.url.hash(); + } + + public Date lastModified() { + return this.lastModified; + } + + public String language() { + return this.language; + } + + public CrawlProfile.entry profile() { + return this.profile; + } + + public String initiator() { + return this.initiator; + } + + public boolean proxy() { + return initiator() == null; + } + + public long size() { + if (this.cacheArray == null) + return 0; + return this.cacheArray.length; + } + + public int depth() { + return this.depth; + } + + public yacyURL referrerURL() { + return (this.resInfo == null) ? null : this.resInfo.getRefererUrl(); + } + + public File cacheFile() { + return this.cacheFile; + } + + public void setCacheArray(final byte[] data) { + this.cacheArray = data; + } + + public byte[] cacheArray() { + return this.cacheArray; + } + + public IResourceInfo getDocumentInfo() { + return this.resInfo; + } + + public String getMimeType() { + return (this.resInfo == null) ? 
null : this.resInfo.getMimeType(); + } + + public Date ifModifiedSince() { + return (this.resInfo == null) ? null : this.resInfo.ifModifiedSince(); + } + + public boolean requestWithCookie() { + return (this.resInfo == null) ? false : this.resInfo.requestWithCookie(); + } + + public boolean requestProhibitsIndexing() { + return (this.resInfo == null) ? false : this.resInfo.requestProhibitsIndexing(); + } + + + // the following three methods for cache read/write granting shall be as loose + // as possible but also as strict as necessary to enable caching of most items + + /** + * @return NULL if the answer is TRUE, in case of FALSE, the reason as + * String is returned + */ + public String shallStoreCacheForProxy() { + + // check profile (disabled: we will check this in the plasmaSwitchboard) + // if (!this.profile.storeHTCache()) { return "storage_not_wanted"; } + + // decide upon header information if a specific file should be stored to + // the cache or not + // if the storage was requested by prefetching, the request map is null + + // check status code + if ((this.resInfo != null) + && (!this.resInfo.validResponseStatus(this.responseStatus))) { + return "bad_status_" + this.responseStatus.substring(0, 3); + } + + // check storage location + // sometimes a file name is equal to a path name in the same directory; + // or sometimes a file name is equal a directory name created earlier; + // we cannot match that here in the cache file path and therefore omit + // writing into the cache + if (this.cacheFile.getParentFile().isFile() + || this.cacheFile.isDirectory()) { + return "path_ambiguous"; + } + if (this.cacheFile.toString().indexOf("..") >= 0) { + return "path_dangerous"; + } + if (this.cacheFile.getAbsolutePath().length() > serverSystem.maxPathLength) { + return "path too long"; + } + + // -CGI access in request + // CGI access makes the page very individual, and therefore not usable + // in caches + if (this.url.isPOST() && !this.profile.crawlingQ()) { + 
return "dynamic_post"; + } + if (this.url.isCGI()) { + return "dynamic_cgi"; + } + + if (this.resInfo != null) { + return this.resInfo.shallStoreCacheForProxy(); + } + + return null; + } + + /** + * decide upon header information if a specific file should be taken from + * the cache or not + * + * @return whether the file should be taken from the cache + */ + public boolean shallUseCacheForProxy() { + + // -CGI access in request + // CGI access makes the page very individual, and therefore not usable + // in caches + if (this.url.isPOST()) { + return false; + } + if (this.url.isCGI()) { + return false; + } + + if (this.resInfo != null) { + return this.resInfo.shallUseCacheForProxy(); + } + + return true; + } + +} diff --git a/source/de/anomic/http/httpdProxyHandler.java b/source/de/anomic/http/httpdProxyHandler.java index b0f7893b2..3a1323a30 100644 --- a/source/de/anomic/http/httpdProxyHandler.java +++ b/source/de/anomic/http/httpdProxyHandler.java @@ -400,9 +400,8 @@ public final class httpdProxyHandler { // 4. cache stale - refill - superfluous // in two of these cases we trigger a scheduler to handle newly arrived files: // case 1 and case 3 - final plasmaHTCache.Entry cacheEntry = (cachedResponseHeader == null) ? null : + final httpdProxyCacheEntry cacheEntry = (cachedResponseHeader == null) ? 
null : plasmaHTCache.newEntry( - requestDate, // init date 0, // crawling depth url, // url "", // name of the url is unknown @@ -526,10 +525,8 @@ public final class httpdProxyHandler { } // reserver cache entry - final Date requestDate = new Date(((Long)conProp.get(httpHeader.CONNECTION_PROP_REQUEST_START)).longValue()); final IResourceInfo resInfo = new ResourceInfo(url,requestHeader,responseHeader); - final plasmaHTCache.Entry cacheEntry = plasmaHTCache.newEntry( - requestDate, + final httpdProxyCacheEntry cacheEntry = plasmaHTCache.newEntry( 0, url, "", diff --git a/source/de/anomic/icap/icapd.java b/source/de/anomic/icap/icapd.java index a632042ec..10b393667 100644 --- a/source/de/anomic/icap/icapd.java +++ b/source/de/anomic/icap/icapd.java @@ -40,6 +40,7 @@ import java.util.Properties; import de.anomic.http.HttpClient; import de.anomic.http.httpChunkedInputStream; import de.anomic.http.httpHeader; +import de.anomic.http.httpdProxyCacheEntry; import de.anomic.plasma.plasmaHTCache; import de.anomic.plasma.plasmaParser; import de.anomic.plasma.plasmaSwitchboard; @@ -376,8 +377,7 @@ public class icapd implements serverHandler, Cloneable { // generating a htcache entry object final IResourceInfo resInfo = new ResourceInfo(httpRequestURL,httpReqHeader,httpResHeader); - final plasmaHTCache.Entry cacheEntry = plasmaHTCache.newEntry( - new Date(), + final httpdProxyCacheEntry cacheEntry = plasmaHTCache.newEntry( 0, httpRequestURL, "", diff --git a/source/de/anomic/kelondro/kelondroBLOBArray.java b/source/de/anomic/kelondro/kelondroBLOBArray.java new file mode 100755 index 000000000..0bd2bb7d7 --- /dev/null +++ b/source/de/anomic/kelondro/kelondroBLOBArray.java @@ -0,0 +1,242 @@ +// kelondroBLOBArray.java +// (C) 2008 by Michael Peter Christen; mc@yacy.net, Frankfurt a. 
M., Germany +// first published 19.08.2008 on http://yacy.net +// +// This is a part of YaCy, a peer-to-peer based web search engine +// +// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ +// $LastChangedRevision: 1986 $ +// $LastChangedBy: orbiter $ +// +// LICENSE +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +package de.anomic.kelondro; + +import java.io.File; +import java.io.IOException; +import java.text.ParseException; +import java.util.ArrayList; +import java.util.Date; +import java.util.Iterator; +import java.util.List; +import java.util.TreeMap; +import java.util.concurrent.CopyOnWriteArrayList; + +import de.anomic.server.serverDate; + +public class kelondroBLOBArray implements kelondroBLOB { + + /* + * This class implements a BLOB using a set of kelondroBLOBHeap objects + * In addition to a kelondroBLOBHeap this BLOB can delete large amounts of data using a given time limit. + * This is realized by creating separate BLOB files. New Files are created when either + * - a given time limit is reached + * - a given space limit is reached + * To organize such an array of BLOB files, the following file name structure is used: + * /.blob + * That means all BLOB files are inside a directory that has the name of the BLOBArray. 
+ * To delete content that is out-dated, one special method is implemented that deletes content by a given + * time-out. Deletions are not made automatically, they must be triggered using this method. + */ + + private int keylength; + private kelondroByteOrder ordering; + private File heapLocation; + private long maxage; + private long maxsize; + private List blobs; + + public kelondroBLOBArray( + final File heapLocation, + final int keylength, final kelondroByteOrder ordering, + long maxage, long maxsize + ) throws IOException { + this.keylength = keylength; + this.ordering = ordering; + this.heapLocation = heapLocation; + this.maxage = maxage; + this.maxsize = maxsize; + + // check existence of the heap directory + if (heapLocation.exists()) { + if (!heapLocation.isDirectory()) throw new IOException("the BLOBArray directory " + heapLocation.toString() + " does not exist (is blocked by a file with same name)"); + } else { + heapLocation.mkdirs(); + } + + // register all blob files inside this directory + String[] files = heapLocation.list(); + Date d; + TreeMap sortedItems = new TreeMap(); + kelondroBLOB oneBlob; + File f; + for (int i = 0; i < files.length; i++) { + if (files[i].length() == 17 && files[i].endsWith("blob")) { + try { + d = serverDate.parseShortSecond(files[i].substring(0, 12)); + } catch (ParseException e) {continue;} + f = new File(heapLocation, files[i]); + oneBlob = new kelondroBLOBHeap(f, keylength, ordering); + sortedItems.put(new Long(d.getTime()), new blobItem(d, f, oneBlob)); + } + } + + // read the blob tree in a sorted way and write them into an array + blobs = new CopyOnWriteArrayList(); + for (blobItem bi : sortedItems.values()) { + blobs.add(bi); + } + } + + private class blobItem { + Date creation; + File location; + kelondroBLOB blob; + public blobItem(Date creation, File location, kelondroBLOB blob) { + this.creation = creation; + this.location = location; + this.blob = blob; + } + public blobItem() throws IOException { + // make a
new blob file and assign it in this item + this.creation = new Date(); + this.location = new File(heapLocation, serverDate.formatShortSecond(creation) + ".blob"); + this.blob = new kelondroBLOBHeap(location, keylength, ordering); + } + } + + /** + * ask for the length of the primary key + * @return the length of the key + */ + public int keylength() { + return this.keylength; + } + + /** + * clears the content of the database + * @throws IOException + */ + public void clear() throws IOException { + for (blobItem bi: blobs) bi.blob.clear(); + blobs.clear(); + } + + /** + * ask for the number of entries + * @return the number of entries in the table + */ + public int size() { + int s = 0; + for (blobItem bi: blobs) s += bi.blob.size(); + return s; + } + + /** + * iterator over all keys + * @param up + * @param rotating must be false (checked by an assertion) + * @return + * @throws IOException + */ + public kelondroCloneableIterator keys(boolean up, boolean rotating) throws IOException { + assert !rotating; + final List> c = new ArrayList>(blobs.size()); + final Iterator i = blobs.iterator(); + while (i.hasNext()) { + c.add(i.next().blob.keys(up, rotating)); + } + return kelondroMergeIterator.cascade(c, this.ordering, kelondroMergeIterator.simpleMerge, up); + } + + /** + * iterate over all keys + * @param up + * @param firstKey + * @return + * @throws IOException + */ + public kelondroCloneableIterator keys(boolean up, byte[] firstKey) throws IOException { + final List> c = new ArrayList>(blobs.size()); + final Iterator i = blobs.iterator(); + while (i.hasNext()) { + c.add(i.next().blob.keys(up, firstKey)); + } + return kelondroMergeIterator.cascade(c, this.ordering, kelondroMergeIterator.simpleMerge, up); + } + + /** + * check if a specific key is in the database + * @param key the primary key + * @return + * @throws IOException + */ + public boolean has(byte[] key) throws IOException { + for (blobItem bi: blobs) if (bi.blob.has(key)) return true; + return false; + } + + /** + * retrieve the whole
BLOB from the table + * @param key the primary key + * @return + * @throws IOException + */ + public byte[] get(byte[] key) throws IOException { + byte[] b; + for (blobItem bi: blobs) { + b = bi.blob.get(key); + if (b != null) return b; + } + return null; + } + + /** + * write a whole byte array as BLOB to the table + * @param key the primary key + * @param b + * @throws IOException + */ + public void put(byte[] key, byte[] b) throws IOException { + blobItem bi = (blobs.size() == 0) ? null : blobs.get(blobs.size() - 1); + if ((bi == null) || (System.currentTimeMillis() - bi.creation.getTime() > this.maxage) || (bi.location.length() > this.maxsize)) { + // no blob yet, or the newest blob is older than maxage (milliseconds) or larger than maxsize: add a new blob to the array + bi = new blobItem(); + blobs.add(bi); + } + bi.blob.put(key, b); + } + + /** + * remove a BLOB + * @param key the primary key + * @throws IOException + */ + public void remove(byte[] key) throws IOException { + for (blobItem bi: blobs) bi.blob.remove(key); + } + + /** + * close the BLOB + */ + public void close() { + for (blobItem bi: blobs) bi.blob.close(); + blobs.clear(); + blobs = null; + } + +} diff --git a/source/de/anomic/kelondro/kelondroMergeIterator.java b/source/de/anomic/kelondro/kelondroMergeIterator.java index a09bd86ae..e8e83254a 100644 --- a/source/de/anomic/kelondro/kelondroMergeIterator.java +++ b/source/de/anomic/kelondro/kelondroMergeIterator.java @@ -24,10 +24,10 @@ package de.anomic.kelondro; import java.lang.reflect.InvocationTargetException; import java.lang.reflect.Method; +import java.util.Collection; import java.util.Comparator; import java.util.ConcurrentModificationException; import java.util.Iterator; -import java.util.Set; public class kelondroMergeIterator implements kelondroCloneableIterator { @@ -119,7 +119,7 @@ public class kelondroMergeIterator implements kelondroCloneableIterator { throw new java.lang.UnsupportedOperationException("merge does not support remove"); } - public static kelondroCloneableIterator cascade(final Set> iterators, final
kelondroOrder c, final Method merger, final boolean up) { + public static kelondroCloneableIterator cascade(final Collection> iterators, final kelondroOrder c, final Method merger, final boolean up) { // this extends the ability to combine two iterators // to the ability of combining a set of iterators if (iterators == null) return null; diff --git a/source/de/anomic/kelondro/kelondroSplitTable.java b/source/de/anomic/kelondro/kelondroSplitTable.java index 682f79cd4..97d6d5ede 100644 --- a/source/de/anomic/kelondro/kelondroSplitTable.java +++ b/source/de/anomic/kelondro/kelondroSplitTable.java @@ -32,7 +32,6 @@ import java.util.ArrayList; import java.util.Calendar; import java.util.Date; import java.util.HashMap; -import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; @@ -380,21 +379,21 @@ public class kelondroSplitTable implements kelondroIndex { } public synchronized kelondroCloneableIterator keys(final boolean up, final byte[] firstKey) throws IOException { - final HashSet> set = new HashSet>(); + final List> c = new ArrayList>(tables.size()); final Iterator i = tables.values().iterator(); while (i.hasNext()) { - set.add(i.next().keys(up, firstKey)); + c.add(i.next().keys(up, firstKey)); } - return kelondroMergeIterator.cascade(set, rowdef.objectOrder, kelondroMergeIterator.simpleMerge, up); + return kelondroMergeIterator.cascade(c, rowdef.objectOrder, kelondroMergeIterator.simpleMerge, up); } public synchronized kelondroCloneableIterator rows(final boolean up, final byte[] firstKey) throws IOException { - final HashSet> set = new HashSet>(); + final List> c = new ArrayList>(tables.size()); final Iterator i = tables.values().iterator(); while (i.hasNext()) { - set.add(i.next().rows(up, firstKey)); + c.add(i.next().rows(up, firstKey)); } - return kelondroMergeIterator.cascade(set, entryOrder, kelondroMergeIterator.simpleMerge, up); + return kelondroMergeIterator.cascade(c, entryOrder, kelondroMergeIterator.simpleMerge, 
up); } public final int cacheObjectChunkSize() { diff --git a/source/de/anomic/plasma/plasmaHTCache.java b/source/de/anomic/plasma/plasmaHTCache.java index 0fa85d47f..c364c6ba8 100644 --- a/source/de/anomic/plasma/plasmaHTCache.java +++ b/source/de/anomic/plasma/plasmaHTCache.java @@ -42,7 +42,6 @@ import java.io.IOException; import java.io.InputStream; import java.net.InetAddress; import java.util.Collections; -import java.util.Date; import java.util.HashMap; import java.util.Map; import java.util.SortedMap; @@ -53,6 +52,7 @@ import java.util.regex.Pattern; import de.anomic.crawler.CrawlProfile; import de.anomic.http.httpHeader; +import de.anomic.http.httpdProxyCacheEntry; import de.anomic.kelondro.kelondroBLOB; import de.anomic.kelondro.kelondroBLOBHeap; import de.anomic.kelondro.kelondroBLOBTree; @@ -66,7 +66,6 @@ import de.anomic.server.serverCodings; import de.anomic.server.serverDomains; import de.anomic.server.serverFileUtils; import de.anomic.server.serverInstantBusyThread; -import de.anomic.server.serverSystem; import de.anomic.server.serverThread; import de.anomic.server.logging.serverLog; import de.anomic.tools.enumerateFiles; @@ -82,7 +81,7 @@ public final class plasmaHTCache { public static final long oneday = 1000 * 60 * 60 * 24; // milliseconds of a day private static kelondroMap responseHeaderDB = null; - private static final ConcurrentLinkedQueue cacheStack = new ConcurrentLinkedQueue(); + private static final ConcurrentLinkedQueue cacheStack = new ConcurrentLinkedQueue(); private static final SortedMap cacheAge = Collections.synchronizedSortedMap(new TreeMap()); // a - relation public static long curCacheSize = 0; public static long maxCacheSize = 0l; @@ -93,19 +92,6 @@ public final class plasmaHTCache { private static ResourceInfoFactory objFactory = new ResourceInfoFactory(); private static serverThread cacheScanThread = null; - // doctypes: - public static final char DT_PDFPS = 'p'; - public static final char DT_TEXT = 't'; - public static 
final char DT_HTML = 'h'; - public static final char DT_DOC = 'd'; - public static final char DT_IMAGE = 'i'; - public static final char DT_MOVIE = 'm'; - public static final char DT_FLASH = 'f'; - public static final char DT_SHARE = 's'; - public static final char DT_AUDIO = 'a'; - public static final char DT_BINARY = 'b'; - public static final char DT_UNKNOWN = 'u'; - // URL attributes public static final int UA_LOCAL = 0; // URL was crawled locally public static final int UA_TILDE = 1; // tilde appears in URL @@ -115,76 +101,6 @@ public final class plasmaHTCache { public static final char LT_LOCAL = 'L'; public static final char LT_GLOBAL = 'G'; - // doctype calculation - public static char docType(final yacyURL url) { - final String path = url.getPath().toLowerCase(); - // serverLog.logFinest("PLASMA", "docType URL=" + path); - char doctype = DT_UNKNOWN; - if (path.endsWith(".gif")) { doctype = DT_IMAGE; } - else if (path.endsWith(".ico")) { doctype = DT_IMAGE; } - else if (path.endsWith(".bmp")) { doctype = DT_IMAGE; } - else if (path.endsWith(".jpg")) { doctype = DT_IMAGE; } - else if (path.endsWith(".jpeg")) { doctype = DT_IMAGE; } - else if (path.endsWith(".png")) { doctype = DT_IMAGE; } - else if (path.endsWith(".html")) { doctype = DT_HTML; } - else if (path.endsWith(".txt")) { doctype = DT_TEXT; } - else if (path.endsWith(".doc")) { doctype = DT_DOC; } - else if (path.endsWith(".rtf")) { doctype = DT_DOC; } - else if (path.endsWith(".pdf")) { doctype = DT_PDFPS; } - else if (path.endsWith(".ps")) { doctype = DT_PDFPS; } - else if (path.endsWith(".avi")) { doctype = DT_MOVIE; } - else if (path.endsWith(".mov")) { doctype = DT_MOVIE; } - else if (path.endsWith(".qt")) { doctype = DT_MOVIE; } - else if (path.endsWith(".mpg")) { doctype = DT_MOVIE; } - else if (path.endsWith(".md5")) { doctype = DT_SHARE; } - else if (path.endsWith(".mpeg")) { doctype = DT_MOVIE; } - else if (path.endsWith(".asf")) { doctype = DT_FLASH; } - return doctype; - } - - public 
static char docType(final String mime) { - // serverLog.logFinest("PLASMA", "docType mime=" + mime); - char doctype = DT_UNKNOWN; - if (mime == null) doctype = DT_UNKNOWN; - else if (mime.startsWith("image/")) doctype = DT_IMAGE; - else if (mime.endsWith("/gif")) doctype = DT_IMAGE; - else if (mime.endsWith("/jpeg")) doctype = DT_IMAGE; - else if (mime.endsWith("/png")) doctype = DT_IMAGE; - else if (mime.endsWith("/html")) doctype = DT_HTML; - else if (mime.endsWith("/rtf")) doctype = DT_DOC; - else if (mime.endsWith("/pdf")) doctype = DT_PDFPS; - else if (mime.endsWith("/octet-stream")) doctype = DT_BINARY; - else if (mime.endsWith("/x-shockwave-flash")) doctype = DT_FLASH; - else if (mime.endsWith("/msword")) doctype = DT_DOC; - else if (mime.endsWith("/mspowerpoint")) doctype = DT_DOC; - else if (mime.endsWith("/postscript")) doctype = DT_PDFPS; - else if (mime.startsWith("text/")) doctype = DT_TEXT; - else if (mime.startsWith("image/")) doctype = DT_IMAGE; - else if (mime.startsWith("audio/")) doctype = DT_AUDIO; - else if (mime.startsWith("video/")) doctype = DT_MOVIE; - //bz2 = application/x-bzip2 - //dvi = application/x-dvi - //gz = application/gzip - //hqx = application/mac-binhex40 - //lha = application/x-lzh - //lzh = application/x-lzh - //pac = application/x-ns-proxy-autoconfig - //php = application/x-httpd-php - //phtml = application/x-httpd-php - //rss = application/xml - //tar = application/tar - //tex = application/x-tex - //tgz = application/tar - //torrent = application/x-bittorrent - //xhtml = application/xhtml+xml - //xla = application/msexcel - //xls = application/msexcel - //xsl = application/xml - //xml = application/xml - //Z = application/x-compress - //zip = application/zip - return doctype; - } public static void init(final File htCachePath, final long CacheSizeMax) { @@ -301,11 +217,11 @@ public final class plasmaHTCache { return responseHeaderDB.size(); } - public static void push(final Entry entry) { + public static void push(final 
httpdProxyCacheEntry entry) { cacheStack.add(entry); } - public static Entry pop() { + public static httpdProxyCacheEntry pop() { return cacheStack.poll(); } @@ -889,8 +805,7 @@ public final class plasmaHTCache { return 0; } - public static Entry newEntry( - final Date initDate, + public static httpdProxyCacheEntry newEntry( final int depth, final yacyURL url, final String name, @@ -899,8 +814,7 @@ public final class plasmaHTCache { final String initiator, final CrawlProfile.entry profile ) { - final Entry entry = new Entry( - initDate, + final httpdProxyCacheEntry entry = new httpdProxyCacheEntry( depth, url, name, @@ -909,244 +823,28 @@ public final class plasmaHTCache { initiator, profile ); - return entry; - } - - /** - * @return the responseHeaderDB - */ - static kelondroMap getResponseHeaderDB() { - return responseHeaderDB; - } - - public final static class Entry { - - // the class objects - private final Date initDate; // the date when the request happened; will be used as a key - private final int depth; // the depth of prefetching - private final String responseStatus; - private final File cacheFile; // the cache file - private byte[] cacheArray; // or the cache as byte-array - private final yacyURL url; - private final String name; // the name of the link, read as anchor from an -tag - private final Date lastModified; - private char doctype; - private final String language; - private final CrawlProfile.entry profile; - private final String initiator; - - /** - * protocolspecific information about the resource - */ - private final IResourceInfo resInfo; - - protected Entry clone() { - return new Entry( - this.initDate, - this.depth, - this.url, - this.name, - this.responseStatus, - this.resInfo, - this.initiator, - this.profile - ); - } - - public Entry(final Date initDate, - final int depth, - final yacyURL url, - final String name, - final String responseStatus, - final IResourceInfo resourceInfo, - final String initiator, - final CrawlProfile.entry 
profile - ) { - if (resourceInfo == null){ - System.out.println("Content information object is null. " + url); - System.exit(0); - } - this.resInfo = resourceInfo; - this.url = url; - this.name = name; - this.cacheFile = getCachePath(this.url); - - // assigned: - this.initDate = initDate; - this.depth = depth; - this.responseStatus = responseStatus; - this.profile = profile; - this.initiator = (initiator == null) ? null : ((initiator.length() == 0) ? null : initiator); - - // getting the last modified date - this.lastModified = resourceInfo.getModificationDate(); - - // getting the doctype - this.doctype = docType(resourceInfo.getMimeType()); - if (this.doctype == DT_UNKNOWN) this.doctype = docType(url); - this.language = yacyURL.language(url); - - // to be defined later: - this.cacheArray = null; - - writeResourceInfo(); - } - - public String name() { - // the anchor name; can be either the text inside the anchor tag or the page description after loading of the page - return this.name; - } - - public yacyURL url() { - return this.url; - } - - public String urlHash() { - return this.url.hash(); - } - - public Date lastModified() { - return this.lastModified; - } - - public String language() { - return this.language; - } - - public CrawlProfile.entry profile() { - return this.profile; - } - - public String initiator() { - return this.initiator; - } - public boolean proxy() { - return initiator() == null; - } - public long size() { - if (this.cacheArray == null) return 0; - return this.cacheArray.length; - } - - public int depth() { - return this.depth; - } - - public yacyURL referrerURL() { - return (this.resInfo == null) ? 
null : this.resInfo.getRefererUrl(); - } - - public File cacheFile() { - return this.cacheFile; - } - - public void setCacheArray(final byte[] data) { - this.cacheArray = data; - } - - public byte[] cacheArray() { - return this.cacheArray; - } - - public IResourceInfo getDocumentInfo() { - return this.resInfo; - } - - private boolean writeResourceInfo() { - if (this.resInfo == null) return false; - try { + if (docInfo != null) try { final HashMap hm = new HashMap(); - hm.putAll(this.resInfo.getMap()); - hm.put("@@URL", this.url.toNormalform(false, false)); - hm.put("@@DEPTH", Integer.toString(this.depth)); - if (this.initiator != null) hm.put("@@INITIATOR", this.initiator); - plasmaHTCache.getResponseHeaderDB().put(this.url.hash(), hm); + hm.putAll(docInfo.getMap()); + hm.put("@@URL", url.toNormalform(false, false)); + hm.put("@@DEPTH", Integer.toString(depth)); + if (initiator != null) + hm.put("@@INITIATOR", initiator); + plasmaHTCache.getResponseHeaderDB().put(url.hash(), hm); } catch (final Exception e) { - log.logWarning("could not write ResourceInfo: "+ e.getClass() +": "+ e.getMessage()); + plasmaHTCache.log.logWarning("could not write ResourceInfo: " + + e.getClass() + ": " + e.getMessage()); plasmaHTCache.resetResponseHeaderDB(); - return false; - } - return true; - } - - public String getMimeType() { - return (this.resInfo == null) ? null : this.resInfo.getMimeType(); - } - - public Date ifModifiedSince() { - return (this.resInfo == null) ? null : this.resInfo.ifModifiedSince(); - } - - public boolean requestWithCookie() { - return (this.resInfo == null) ? false : this.resInfo.requestWithCookie(); - } - - public boolean requestProhibitsIndexing() { - return (this.resInfo == null) ? 
false : this.resInfo.requestProhibitsIndexing(); - } - - /* - public boolean update() { - return ((status == CACHE_FILL) || (status == CACHE_STALE_RELOAD_GOOD)); - } - */ - - // the following three methods for cache read/write granting shall be as loose as possible - // but also as strict as necessary to enable caching of most items - - /** - * @return NULL if the answer is TRUE, in case of FALSE, the reason as String is returned - */ - public String shallStoreCacheForProxy() { - - // check profile (disabled: we will check this in the plasmaSwitchboard) - //if (!this.profile.storeHTCache()) { return "storage_not_wanted"; } - - // decide upon header information if a specific file should be stored to the cache or not - // if the storage was requested by prefetching, the request map is null - - // check status code - if ((this.resInfo != null) && (!this.resInfo.validResponseStatus(this.responseStatus))) { - return "bad_status_" + this.responseStatus.substring(0,3); } - - // check storage location - // sometimes a file name is equal to a path name in the same directory; - // or sometimes a file name is equal a directory name created earlier; - // we cannot match that here in the cache file path and therefore omit writing into the cache - if (this.cacheFile.getParentFile().isFile() || this.cacheFile.isDirectory()) { return "path_ambiguous"; } - if (this.cacheFile.toString().indexOf("..") >= 0) { return "path_dangerous"; } - if (this.cacheFile.getAbsolutePath().length() > serverSystem.maxPathLength) { return "path too long"; } - - // -CGI access in request - // CGI access makes the page very individual, and therefore not usable in caches - if (this.url.isPOST() && !this.profile.crawlingQ()) { return "dynamic_post"; } - if (this.url.isCGI()) { return "dynamic_cgi"; } - if (this.resInfo != null) { - return this.resInfo.shallStoreCacheForProxy(); - } - - return null; + return entry; } /** - * decide upon header information if a specific file should be taken from the cache 
or not - * @return whether the file should be taken from the cache + * @return the responseHeaderDB */ - public boolean shallUseCacheForProxy() { - - // -CGI access in request - // CGI access makes the page very individual, and therefore not usable in caches - if (this.url.isPOST()) { return false; } - if (this.url.isCGI()) { return false; } - - if (this.resInfo != null) { - return this.resInfo.shallUseCacheForProxy(); - } - - return true; + static kelondroMap getResponseHeaderDB() { + return responseHeaderDB; } - } // class Entry } diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java index 662d9fd1a..fb42c03a3 100644 --- a/source/de/anomic/plasma/plasmaSnippetCache.java +++ b/source/de/anomic/plasma/plasmaSnippetCache.java @@ -39,6 +39,7 @@ import java.util.TreeSet; import de.anomic.htmlFilter.htmlFilterImageEntry; import de.anomic.http.HttpClient; import de.anomic.http.httpHeader; +import de.anomic.http.httpdProxyCacheEntry; import de.anomic.index.indexURLReference; import de.anomic.index.indexWord; import de.anomic.kelondro.kelondroMScoreCluster; @@ -284,7 +285,7 @@ public class plasmaSnippetCache { // if not found try to download it // download resource using the crawler and keep resource in memory if possible - final plasmaHTCache.Entry entry = plasmaSwitchboard.getSwitchboard().crawlQueues.loadResourceFromWeb(url, timeout, true, true, reindexing); + final httpdProxyCacheEntry entry = plasmaSwitchboard.getSwitchboard().crawlQueues.loadResourceFromWeb(url, timeout, true, true, reindexing); // getting resource metadata (e.g. 
the http headers for http resources) if (entry != null) { @@ -395,7 +396,7 @@ public class plasmaSnippetCache { // if not found try to download it // download resource using the crawler and keep resource in memory if possible - final plasmaHTCache.Entry entry = plasmaSwitchboard.getSwitchboard().crawlQueues.loadResourceFromWeb(url, timeout, true, forText, global); + final httpdProxyCacheEntry entry = plasmaSwitchboard.getSwitchboard().crawlQueues.loadResourceFromWeb(url, timeout, true, forText, global); // getting resource metadata (e.g. the http headers for http resources) if (entry != null) { @@ -853,7 +854,7 @@ public class plasmaSnippetCache { // if the content is not available in cache try to download it from web // try to download the resource using a crawler - final plasmaHTCache.Entry entry = plasmaSwitchboard.getSwitchboard().crawlQueues.loadResourceFromWeb(url, (socketTimeout < 0) ? -1 : socketTimeout, true, forText, reindexing); + final httpdProxyCacheEntry entry = plasmaSwitchboard.getSwitchboard().crawlQueues.loadResourceFromWeb(url, (socketTimeout < 0) ? -1 : socketTimeout, true, forText, reindexing); if (entry == null) return null; // not found in web // read resource body (if it is there) diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 5e1768d6a..6a2f8716d 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -136,6 +136,7 @@ import de.anomic.http.JakartaCommonsHttpClient; import de.anomic.http.httpHeader; import de.anomic.http.httpRemoteProxyConfig; import de.anomic.http.httpd; +import de.anomic.http.httpdProxyCacheEntry; import de.anomic.http.httpdRobotsTxtConfig; import de.anomic.index.indexReferenceBlacklist; import de.anomic.index.indexURLReference; @@ -965,7 +966,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch