From dae763d8e3df0e5281bbc7018577db8da06f6ea1 Mon Sep 17 00:00:00 2001 From: theli Date: Wed, 6 Sep 2006 14:31:17 +0000 Subject: [PATCH] git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2495 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/CacheAdmin_p.java | 7 +- htroot/ViewFile.html | 2 + htroot/ViewFile.java | 309 ++++++------ source/de/anomic/http/httpdProxyHandler.java | 19 +- source/de/anomic/icap/icapd.java | 8 +- .../de/anomic/plasma/cache/IResourceInfo.java | 136 +++++ .../plasma/cache/ResourceInfoFactory.java | 86 ++++ .../plasma/cache/http/ResourceInfo.java | 467 ++++++++++++++++++ .../plasma/crawler/http/CrawlWorker.java | 14 +- .../de/anomic/plasma/plasmaCrawlStacker.java | 2 +- source/de/anomic/plasma/plasmaHTCache.java | 380 +++++--------- .../de/anomic/plasma/plasmaSnippetCache.java | 29 +- .../de/anomic/plasma/plasmaSwitchboard.java | 9 +- .../anomic/plasma/plasmaSwitchboardQueue.java | 152 ++---- 14 files changed, 1057 insertions(+), 563 deletions(-) create mode 100644 source/de/anomic/plasma/cache/IResourceInfo.java create mode 100644 source/de/anomic/plasma/cache/ResourceInfoFactory.java create mode 100644 source/de/anomic/plasma/cache/http/ResourceInfo.java diff --git a/htroot/CacheAdmin_p.java b/htroot/CacheAdmin_p.java index 77bed6ae1..84d7f69d7 100644 --- a/htroot/CacheAdmin_p.java +++ b/htroot/CacheAdmin_p.java @@ -59,6 +59,7 @@ import de.anomic.index.indexURL; import de.anomic.plasma.plasmaHTCache; import de.anomic.plasma.plasmaParserDocument; import de.anomic.plasma.plasmaSwitchboard; +import de.anomic.plasma.cache.IResourceInfo; import de.anomic.server.serverCore; import de.anomic.server.serverFileUtils; import de.anomic.server.serverObjects; @@ -102,8 +103,8 @@ public class CacheAdmin_p { info.ensureCapacity(40000); try { - final httpHeader fileheader = switchboard.cacheManager.getCachedResponse(indexURL.urlHash(url)); - info.append("HTTP Header:
").append(formatHeader(fileheader)).append("
"); + final IResourceInfo resInfo = switchboard.cacheManager.loadResourceInfo(url); + info.append("HTTP Header:
").append(formatHeader(resInfo.getMap())).append("
"); final String ff = file.toString(); final int dotpos = ff.lastIndexOf('.'); final String ext = (dotpos >= 0) ? ff.substring(dotpos + 1).toLowerCase() : ""; @@ -198,7 +199,7 @@ public class CacheAdmin_p { return new String(s); } - private static String formatHeader(httpHeader header) { + private static String formatHeader(Map header) { final StringBuffer result = new StringBuffer(2048); if (header == null) { result.append("- no header in header cache -
"); diff --git a/htroot/ViewFile.html b/htroot/ViewFile.html index 648192f1b..87830d891 100644 --- a/htroot/ViewFile.html +++ b/htroot/ViewFile.html @@ -56,6 +56,8 @@ Invalid URL Unable to download resource content. :: Unable to parse resource content. +:: +Unsupported protocol. #(/error)#

diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java index 3ae7bff55..f1b7b06a5 100644 --- a/htroot/ViewFile.java +++ b/htroot/ViewFile.java @@ -4,23 +4,23 @@ //(C) by Michael Peter Christen; mc@anomic.de //first published on http://www.anomic.de //Frankfurt, Germany, 2004 -// + //last major change: 12.07.2004 -// + //This program is free software; you can redistribute it and/or modify //it under the terms of the GNU General Public License as published by //the Free Software Foundation; either version 2 of the License, or //(at your option) any later version. -// + //This program is distributed in the hope that it will be useful, //but WITHOUT ANY WARRANTY; without even the implied warranty of //MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //GNU General Public License for more details. -// + //You should have received a copy of the GNU General Public License //along with this program; if not, write to the Free Software //Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -// + //Using this software in any meaning (reading, learning, copying, compiling, //running) means that you agree that the Author(s) is (are) not responsible //for cost, loss of data or any harm that may be caused directly or indirectly @@ -32,7 +32,7 @@ //(are) also not responsible for proper configuration and usage of the //software, even if provoked by documentation provided together with //the software. 
-// + //Any changes to this file according to the GPL as documented in the file //gpl.txt aside this file in the shipment you received can be done to the //lines that follows this copyright notice here, but changes must not be @@ -56,18 +56,19 @@ import de.anomic.http.httpc; import de.anomic.plasma.plasmaHTCache; import de.anomic.plasma.plasmaParserDocument; import de.anomic.plasma.plasmaSwitchboard; +import de.anomic.plasma.cache.IResourceInfo; import de.anomic.plasma.plasmaCrawlLURL.Entry; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; public class ViewFile { - + public static final int VIEW_MODE_NO_TEXT = 0; public static final int VIEW_MODE_AS_PLAIN_TEXT = 1; public static final int VIEW_MODE_AS_PARSED_TEXT = 2; public static final int VIEW_MODE_AS_PARSED_SENTENCES = 3; public static final int VIEW_MODE_AS_IFRAME = 4; - + public static final String[] highlightingColors = new String[] { "255,255,100", "255,155,155", @@ -78,12 +79,12 @@ public class ViewFile { }; public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) { - + serverObjects prop = new serverObjects(); plasmaSwitchboard sb = (plasmaSwitchboard)env; - - + + if (post.containsKey("words")) try { prop.put("error_words",URLEncoder.encode((String) post.get("words"), "UTF-8")); @@ -91,148 +92,168 @@ public class ViewFile { // TODO Auto-generated catch block e1.printStackTrace(); } - - if (post != null) { - // getting the url hash from which the content should be loaded - String urlHash = post.get("urlHash",""); - if (urlHash.equals("")) { - prop.put("error",1); - prop.put("viewMode",VIEW_MODE_NO_TEXT); - return prop; - } - - String viewMode = post.get("viewMode","sentences"); - - // getting the urlEntry that belongs to the url hash - Entry urlEntry = null; - try { - urlEntry = sb.urlPool.loadedURL.getEntry(urlHash, null); - } catch (IOException e) { - prop.put("error",2); - prop.put("viewMode",VIEW_MODE_NO_TEXT); - return prop; - } - - // 
gettin the url that belongs to the entry - URL url = urlEntry.url(); - if (url == null) { - prop.put("error",3); - prop.put("viewMode",VIEW_MODE_NO_TEXT); - return prop; - } - - // loading the resource content as byte array - byte[] resource = null; - httpHeader resHeader = null; - String resMime = null; - try { - resource = sb.cacheManager.loadResource(url); - if (resource == null) { - plasmaHTCache.Entry entry = sb.snippetCache.loadResourceFromWeb(url, 5000); - - if (entry != null) { - resHeader = entry.responseHeader(); - } - - resource = sb.cacheManager.loadResource(url); - if (resource == null) { - prop.put("error",4); - prop.put("viewMode",VIEW_MODE_NO_TEXT); - return prop; - } + + if (post != null) { + // getting the url hash from which the content should be loaded + String urlHash = post.get("urlHash",""); + if (urlHash.equals("")) { + prop.put("error",1); + prop.put("viewMode",VIEW_MODE_NO_TEXT); + return prop; } - if (resHeader == null) { - resHeader = sb.cacheManager.getCachedResponse(urlEntry.hash()); - if (resHeader == null) { - resHeader = httpc.whead(url,url.getHost(),5000,null,null,sb.remoteProxyConfig); + + String viewMode = post.get("viewMode","sentences"); + + // getting the urlEntry that belongs to the url hash + Entry urlEntry = null; + try { + urlEntry = sb.urlPool.loadedURL.getEntry(urlHash, null); + } catch (IOException e) { + prop.put("error",2); + prop.put("viewMode",VIEW_MODE_NO_TEXT); + return prop; + } + + // gettin the url that belongs to the entry + URL url = urlEntry.url(); + if (url == null) { + prop.put("error",3); + prop.put("viewMode",VIEW_MODE_NO_TEXT); + return prop; + } + + // loading the resource content as byte array + byte[] resource = null; + IResourceInfo resInfo = null; + String resMime = null; + try { + // trying to load the resource body + resource = sb.cacheManager.loadResourceContent(url); + + // if the resource body was not cached we try to load it from web + if (resource == null) { + plasmaHTCache.Entry entry = 
sb.snippetCache.loadResourceFromWeb(url, 5000); + + if (entry != null) { + resInfo = entry.getDocumentInfo(); + resource = sb.cacheManager.loadResourceContent(url); + } + if (resource == null) { prop.put("error",4); prop.put("viewMode",VIEW_MODE_NO_TEXT); return prop; } - resMime = resHeader.mime(); } - } - } catch (IOException e) { - if (url == null) { - prop.put("error",4); - prop.put("viewMode",VIEW_MODE_NO_TEXT); - return prop; - } - } - if (viewMode.equals("plain")) { - String content = new String(resource); - content = content.replaceAll("<","<") - .replaceAll(">",">") - .replaceAll("\"",""") - .replaceAll("\n","
") - .replaceAll("\t","    "); - - prop.put("error",0); - prop.put("viewMode",VIEW_MODE_AS_PLAIN_TEXT); - prop.put("viewMode_plainText",content); - } else if (viewMode.equals("parsed") || viewMode.equals("sentences") || viewMode.equals("iframe")) { - // parsing the resource content - plasmaParserDocument document = sb.snippetCache.parseDocument(url, resource,resHeader); - if (document == null) { - prop.put("error",5); - prop.put("viewMode",VIEW_MODE_NO_TEXT); - return prop; - } - resMime = document.getMimeType(); - - if (viewMode.equals("parsed")) { - String content = new String(document.getText()); - content = wikiCode.replaceHTML(content); //added by Marc Nause - content = content.replaceAll("\n","
") - .replaceAll("\t","    "); - - prop.put("viewMode",VIEW_MODE_AS_PARSED_TEXT); - prop.put("viewMode_parsedText",content); - } else if (viewMode.equals("iframe")) { - prop.put("viewMode",VIEW_MODE_AS_IFRAME); - prop.put("viewMode_url",url.toString()); - } else { - prop.put("viewMode",VIEW_MODE_AS_PARSED_SENTENCES); - String[] sentences = document.getSentences(); - - boolean dark = true; - for (int i=0; i < sentences.length; i++) { - String currentSentence = wikiCode.replaceHTML(sentences[i]); - - // Search word highlighting - String words = post.get("words",null); - if (words != null) { - try { - words = URLDecoder.decode(words,"UTF-8"); - } catch (UnsupportedEncodingException e) {} - - String[] wordArray = words.substring(1,words.length()-1).split(","); - for (int j=0; j < wordArray.length; j++) { - String currentWord = wordArray[j].trim(); - currentSentence = currentSentence.replaceAll(currentWord, - "" + currentWord + ""); + + // try to load resource metadata + if (resInfo == null) { + + // try to load the metadata from cache + try { + resInfo = sb.cacheManager.loadResourceInfo(urlEntry.url()); + } catch (Exception e) { /* ignore this */} + + // if the metadata where not cached try to load it from web + if (resInfo == null) { + String protocol = url.getProtocol(); + if (!((protocol.equals("http") || protocol.equals("https")))) { + prop.put("error",6); + prop.put("viewMode",VIEW_MODE_NO_TEXT); + return prop; } + + httpHeader responseHeader = httpc.whead(url,url.getHost(),5000,null,null,sb.remoteProxyConfig); + if (responseHeader == null) { + prop.put("error",4); + prop.put("viewMode",VIEW_MODE_NO_TEXT); + return prop; + } + resMime = responseHeader.mime(); } - - prop.put("viewMode_sentences_" + i + "_nr",Integer.toString(i+1)); - prop.put("viewMode_sentences_" + i + "_text",currentSentence); - prop.put("viewMode_sentences_" + i + "_dark",((dark) ? 
1 : 0) ); dark=!dark; + } else { + resMime = resInfo.getMimeType(); } - prop.put("viewMode_sentences",sentences.length); - - } - } - prop.put("error",0); - prop.put("error_url",url.toString()); - prop.put("error_hash",urlHash); - prop.put("error_wordCount",Integer.toString(urlEntry.wordCount())); - prop.put("error_desc",urlEntry.descr()); - prop.put("error_size",urlEntry.size()); - prop.put("error_mimeType",resMime); - } - - return prop; + } catch (IOException e) { + if (url == null) { + prop.put("error",4); + prop.put("viewMode",VIEW_MODE_NO_TEXT); + return prop; + } + } + if (viewMode.equals("plain")) { + String content = new String(resource); + content = content.replaceAll("<","<") + .replaceAll(">",">") + .replaceAll("\"",""") + .replaceAll("\n","
") + .replaceAll("\t","    "); + + prop.put("error",0); + prop.put("viewMode",VIEW_MODE_AS_PLAIN_TEXT); + prop.put("viewMode_plainText",content); + } else if (viewMode.equals("parsed") || viewMode.equals("sentences") || viewMode.equals("iframe")) { + // parsing the resource content + plasmaParserDocument document = sb.snippetCache.parseDocument(url, resource,resInfo); + if (document == null) { + prop.put("error",5); + prop.put("viewMode",VIEW_MODE_NO_TEXT); + return prop; + } + resMime = document.getMimeType(); + + if (viewMode.equals("parsed")) { + String content = new String(document.getText()); + content = wikiCode.replaceHTML(content); //added by Marc Nause + content = content.replaceAll("\n","
") + .replaceAll("\t","    "); + + prop.put("viewMode",VIEW_MODE_AS_PARSED_TEXT); + prop.put("viewMode_parsedText",content); + } else if (viewMode.equals("iframe")) { + prop.put("viewMode",VIEW_MODE_AS_IFRAME); + prop.put("viewMode_url",url.toString()); + } else { + prop.put("viewMode",VIEW_MODE_AS_PARSED_SENTENCES); + String[] sentences = document.getSentences(); + + boolean dark = true; + for (int i=0; i < sentences.length; i++) { + String currentSentence = wikiCode.replaceHTML(sentences[i]); + + // Search word highlighting + String words = post.get("words",null); + if (words != null) { + try { + words = URLDecoder.decode(words,"UTF-8"); + } catch (UnsupportedEncodingException e) {} + + String[] wordArray = words.substring(1,words.length()-1).split(","); + for (int j=0; j < wordArray.length; j++) { + String currentWord = wordArray[j].trim(); + currentSentence = currentSentence.replaceAll(currentWord, + "" + currentWord + ""); + } + } + + prop.put("viewMode_sentences_" + i + "_nr",Integer.toString(i+1)); + prop.put("viewMode_sentences_" + i + "_text",currentSentence); + prop.put("viewMode_sentences_" + i + "_dark",((dark) ? 
1 : 0) ); dark=!dark; + } + prop.put("viewMode_sentences",sentences.length); + + } + } + prop.put("error",0); + prop.put("error_url",url.toString()); + prop.put("error_hash",urlHash); + prop.put("error_wordCount",Integer.toString(urlEntry.wordCount())); + prop.put("error_desc",urlEntry.descr()); + prop.put("error_size",urlEntry.size()); + prop.put("error_mimeType",resMime); + } + + return prop; } - + } diff --git a/source/de/anomic/http/httpdProxyHandler.java b/source/de/anomic/http/httpdProxyHandler.java index d6315ed1a..bf703b1f6 100644 --- a/source/de/anomic/http/httpdProxyHandler.java +++ b/source/de/anomic/http/httpdProxyHandler.java @@ -96,6 +96,8 @@ import de.anomic.index.indexURL; import de.anomic.plasma.plasmaHTCache; import de.anomic.plasma.plasmaParser; import de.anomic.plasma.plasmaSwitchboard; +import de.anomic.plasma.cache.IResourceInfo; +import de.anomic.plasma.cache.http.ResourceInfo; import de.anomic.plasma.urlPattern.plasmaURLPattern; import de.anomic.server.serverCore; import de.anomic.server.serverFileUtils; @@ -413,8 +415,8 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt // decide wether to use a cache entry or connect to the network File cacheFile = cacheManager.getCachePath(url); - String urlHash = indexURL.urlHash(url); - httpHeader cachedResponseHeader = cacheManager.getCachedResponse(urlHash); + ResourceInfo cachedResInfo = (ResourceInfo) cacheManager.loadResourceInfo(url); + httpHeader cachedResponseHeader = (cachedResInfo == null)?null:cachedResInfo.getResponseHeader(); boolean cacheExists = ((cacheFile.isFile()) && (cachedResponseHeader != null)); // why are files unzipped upon arrival? why not zip all files in cache? 
@@ -445,9 +447,10 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt 0, // crawling depth url, // url "", // name of the url is unknown - requestHeader, // request headers + //requestHeader, // request headers "200 OK", // request status - cachedResponseHeader, // response headers + //cachedResponseHeader, // response headers + cachedResInfo, null, // initiator switchboard.defaultProxyProfile // profile ); @@ -579,15 +582,17 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt } // reserver cache entry - Date requestDate = new Date(((Long)conProp.get(httpHeader.CONNECTION_PROP_REQUEST_START)).longValue()); + Date requestDate = new Date(((Long)conProp.get(httpHeader.CONNECTION_PROP_REQUEST_START)).longValue()); + IResourceInfo resInfo = new ResourceInfo(url,requestHeader,res.responseHeader); plasmaHTCache.Entry cacheEntry = cacheManager.newEntry( requestDate, 0, url, "", - requestHeader, + //requestHeader, res.status, - res.responseHeader, + //res.responseHeader, + resInfo, null, switchboard.defaultProxyProfile ); diff --git a/source/de/anomic/icap/icapd.java b/source/de/anomic/icap/icapd.java index de4ea546f..648d0fd2b 100644 --- a/source/de/anomic/icap/icapd.java +++ b/source/de/anomic/icap/icapd.java @@ -64,6 +64,8 @@ import de.anomic.http.httpc; import de.anomic.plasma.plasmaHTCache; import de.anomic.plasma.plasmaParser; import de.anomic.plasma.plasmaSwitchboard; +import de.anomic.plasma.cache.IResourceInfo; +import de.anomic.plasma.cache.http.ResourceInfo; import de.anomic.server.serverCore; import de.anomic.server.serverFileUtils; import de.anomic.server.serverHandler; @@ -385,14 +387,14 @@ public class icapd implements serverHandler { * ========================================================================= */ // generating a htcache entry object + IResourceInfo resInfo = new ResourceInfo(httpRequestURL,httpReqHeader,httpResHeader); plasmaHTCache.Entry cacheEntry = cacheManager.newEntry( new 
Date(), 0, httpRequestURL, "", - httpReqHeader, - httpRespStatusLine, - httpResHeader, + httpRespStatusLine, + resInfo, null, switchboard.defaultProxyProfile ); diff --git a/source/de/anomic/plasma/cache/IResourceInfo.java b/source/de/anomic/plasma/cache/IResourceInfo.java new file mode 100644 index 000000000..72c344933 --- /dev/null +++ b/source/de/anomic/plasma/cache/IResourceInfo.java @@ -0,0 +1,136 @@ +// IResourceInfo.java +// ------------------------------------- +// part of YACY +// (C) by Michael Peter Christen; mc@anomic.de +// first published on http://www.anomic.de +// Frankfurt, Germany, 2006 +// +// This file ist contributed by Martin Thelian +// +// $LastChangedDate: 2006-02-20 23:57:42 +0100 (Mo, 20 Feb 2006) $ +// $LastChangedRevision: 1715 $ +// $LastChangedBy: theli $ +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// +// Using this software in any meaning (reading, learning, copying, compiling, +// running) means that you agree that the Author(s) is (are) not responsible +// for cost, loss of data or any harm that may be caused directly or indirectly +// by usage of this softare or this documentation. The usage of this software +// is on your own risk. 
The installation and usage (starting/running) of this +// software may allow other people or application to access your computer and +// any attached devices and is highly dependent on the configuration of the +// software which must be done by the user of the software; the author(s) is +// (are) also not responsible for proper configuration and usage of the +// software, even if provoked by documentation provided together with +// the software. +// +// Any changes to this file according to the GPL as documented in the file +// gpl.txt aside this file in the shipment you received can be done to the +// lines that follows this copyright notice here, but changes must not be +// done inside the copyright notive above. A re-distribution must contain +// the intact and unchanged copyright notice. +// Contributions and changes to the program code must be marked as such. + + + +package de.anomic.plasma.cache; + +import java.util.Date; +import java.util.Map; + +import de.anomic.net.URL; + +public interface IResourceInfo { + + /** + * Return the resource information as map + * @return + */ + public Map getMap(); + + /** + * Returns the URL of this content + * @return + */ + public URL getUrl(); + + /** + * Returns the referer URL of this URL + * @return referer URL + */ + public URL getRefererUrl(); + + /** + * Returns the mimetype of the cached object + * @return mimetype + */ + public String getMimeType(); + + /** + * Returns the modification date of the cached object + * @return the modifiaction date + */ + public Date getModificationDate(); + + /** + * Returns the url hash of the content URL + * @return + */ + public String getUrlHash(); + + /** + * Specifies if the resource was requested with a + * if modified since date + * @return + */ + public Date ifModifiedSince(); + + /** + * Specifies if the resource was requested with + * client specific information (e.g. 
cookies for http) + * @return + */ + public boolean requestWithCookie(); + + /** + * Specifies if the request prohibits indexing + * @return + */ + public boolean requestProhibitsIndexing(); + + /** + * Determines if a resource that was downloaded by the crawler + * is allowed to be indexed. + * + * @return an error string describing the reason why the + * resourse should not be indexed or null if indexing is allowed + */ + public String shallIndexCacheForCrawler(); + + /** + * Determines if a resource that was downloaded by the proxy + * is allowed to be indexed. + * + * @return an error string describing the reason why the + * resourse should not be indexed or null if indexing is allowed + */ + public String shallIndexCacheForProxy(); + + public String shallStoreCacheForProxy(); + public boolean shallUseCacheForProxy(); + + public boolean validResponseStatus(String responseStatus); +} diff --git a/source/de/anomic/plasma/cache/ResourceInfoFactory.java b/source/de/anomic/plasma/cache/ResourceInfoFactory.java new file mode 100644 index 000000000..75c0a2d07 --- /dev/null +++ b/source/de/anomic/plasma/cache/ResourceInfoFactory.java @@ -0,0 +1,86 @@ +// RespourceInfoFactory.java +// ------------------------------------- +// part of YACY +// (C) by Michael Peter Christen; mc@anomic.de +// first published on http://www.anomic.de +// Frankfurt, Germany, 2006 +// +// This file ist contributed by Martin Thelian +// +// $LastChangedDate: 2006-02-20 23:57:42 +0100 (Mo, 20 Feb 2006) $ +// $LastChangedRevision: 1715 $ +// $LastChangedBy: theli $ +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. 
+// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// +// Using this software in any meaning (reading, learning, copying, compiling, +// running) means that you agree that the Author(s) is (are) not responsible +// for cost, loss of data or any harm that may be caused directly or indirectly +// by usage of this softare or this documentation. The usage of this software +// is on your own risk. The installation and usage (starting/running) of this +// software may allow other people or application to access your computer and +// any attached devices and is highly dependent on the configuration of the +// software which must be done by the user of the software; the author(s) is +// (are) also not responsible for proper configuration and usage of the +// software, even if provoked by documentation provided together with +// the software. +// +// Any changes to this file according to the GPL as documented in the file +// gpl.txt aside this file in the shipment you received can be done to the +// lines that follows this copyright notice here, but changes must not be +// done inside the copyright notive above. A re-distribution must contain +// the intact and unchanged copyright notice. +// Contributions and changes to the program code must be marked as such. 
+ + + +package de.anomic.plasma.cache; + +import java.lang.reflect.Constructor; +import java.util.Map; + +import de.anomic.net.URL; + +public class ResourceInfoFactory { + public IResourceInfo buildResourceInfoObj( + URL resourceURL, + Map resourceMetadata + ) throws Exception { + + String protocString = resourceURL.getProtocol(); + + // the full qualified class name + String className = this.getClass().getPackage().getName() + "." + protocString + ".ResourceInfo"; + + // loading class by name + Class moduleClass = Class.forName(className); + + // getting the constructor + Constructor classConstructor = moduleClass.getConstructor( new Class[] { + URL.class, + Map.class + } ); + + // instantiating class + IResourceInfo infoObject = (IResourceInfo) classConstructor.newInstance(new Object[] { + resourceURL, + resourceMetadata + }); + + // return the newly created object + return infoObject; + + } +} diff --git a/source/de/anomic/plasma/cache/http/ResourceInfo.java b/source/de/anomic/plasma/cache/http/ResourceInfo.java new file mode 100644 index 000000000..3b1c2d4b3 --- /dev/null +++ b/source/de/anomic/plasma/cache/http/ResourceInfo.java @@ -0,0 +1,467 @@ +// ResourceInfo.java +// ------------------------------------- +// part of YACY +// (C) by Michael Peter Christen; mc@anomic.de +// first published on http://www.anomic.de +// Frankfurt, Germany, 2006 +// +// This file ist contributed by Martin Thelian +// +// $LastChangedDate: 2006-02-20 23:57:42 +0100 (Mo, 20 Feb 2006) $ +// $LastChangedRevision: 1715 $ +// $LastChangedBy: theli $ +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. 
+// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// +// Using this software in any meaning (reading, learning, copying, compiling, +// running) means that you agree that the Author(s) is (are) not responsible +// for cost, loss of data or any harm that may be caused directly or indirectly +// by usage of this softare or this documentation. The usage of this software +// is on your own risk. The installation and usage (starting/running) of this +// software may allow other people or application to access your computer and +// any attached devices and is highly dependent on the configuration of the +// software which must be done by the user of the software; the author(s) is +// (are) also not responsible for proper configuration and usage of the +// software, even if provoked by documentation provided together with +// the software. +// +// Any changes to this file according to the GPL as documented in the file +// gpl.txt aside this file in the shipment you received can be done to the +// lines that follows this copyright notice here, but changes must not be +// done inside the copyright notive above. A re-distribution must contain +// the intact and unchanged copyright notice. +// Contributions and changes to the program code must be marked as such. 
+
+
+
+package de.anomic.plasma.cache.http;
+
+import java.util.Date;
+import java.util.Map;
+
+import de.anomic.http.httpHeader;
+import de.anomic.index.indexURL;
+import de.anomic.net.URL;
+import de.anomic.plasma.plasmaHTCache;
+import de.anomic.plasma.cache.ResourceInfoFactory;
+import de.anomic.plasma.cache.IResourceInfo;
+import de.anomic.server.serverDate;
+
+/**
+ * HTTP specific implementation of {@link IResourceInfo}.
+ * <p>
+ * An instance wraps the response header of a resource and, if available, the
+ * header of the request that fetched it. All caching and indexing decisions of
+ * this class are derived from these two headers.
+ * <p>
+ * The response header is never <code>null</code> (both constructors guarantee
+ * this). The request header is <code>null</code> whenever the instance was
+ * built from a stored header map, so every access to it has to be null-checked.
+ */
+public class ResourceInfo implements IResourceInfo {
+    private URL url;
+    private String urlHash;
+    private httpHeader responseHeader;
+    private httpHeader requestHeader;
+
+    /**
+     * Constructor used by the {@link ResourceInfoFactory}.
+     * Builds the response header from the given map; no request header is
+     * available for instances created this way.
+     * @param objectURL the URL of the resource, must not be <code>null</code>
+     * @param objectInfo the header fields of the response, must not be <code>null</code>
+     * @throws NullPointerException if one of the arguments is <code>null</code>
+     */
+    public ResourceInfo(URL objectURL, Map objectInfo) {
+        if (objectURL == null) throw new NullPointerException();
+        if (objectInfo == null) throw new NullPointerException();
+
+        // generating the url hash
+        this.url = objectURL;
+        this.urlHash = indexURL.urlHash(this.url.toNormalform());
+
+        // create the http header object
+        this.responseHeader = new httpHeader(null, objectInfo);
+    }
+
+    /**
+     * Creates a resource info object from the headers of a request/response pair.
+     * @param objectURL the URL of the resource, must not be <code>null</code>
+     * @param requestHeaders the header of the request, may be <code>null</code>
+     * @param responseHeaders the header of the response, must not be <code>null</code>
+     * @throws NullPointerException if the URL or the response header is <code>null</code>
+     */
+    public ResourceInfo(URL objectURL, httpHeader requestHeaders, httpHeader responseHeaders) {
+        if (objectURL == null) throw new NullPointerException();
+        if (responseHeaders == null) throw new NullPointerException();
+
+        // generating the url hash
+        this.url = objectURL;
+        this.urlHash = indexURL.urlHash(this.url.toNormalform());
+
+        this.requestHeader = requestHeaders;
+        this.responseHeader = responseHeaders;
+    }
+
+    /**
+     * @return the response header (the object itself, not a copy)
+     */
+    public Map getMap() {
+        return this.responseHeader;
+    }
+
+    /**
+     * Returns the mime type of the response in lower case, without any
+     * parameters that follow a <code>';'</code> (e.g. the charset).
+     * @see de.anomic.plasma.cache.IResourceInfo#getMimeType()
+     */
+    public String getMimeType() {
+        if (this.responseHeader == null) return null;
+
+        String mimeType = this.responseHeader.mime();
+        mimeType = mimeType.trim().toLowerCase();
+
+        // cut off parameters like "; charset=..."
+        int pos = mimeType.indexOf(';');
+        return ((pos < 0) ? mimeType : mimeType.substring(0, pos));
+    }
+
+    /**
+     * Returns the Last-Modified date of the response, falling back to the
+     * Date header and finally to the current (corrected) UTC time.
+     * This method never returns <code>null</code>.
+     * @see de.anomic.plasma.cache.IResourceInfo#getModificationDate()
+     */
+    public Date getModificationDate() {
+        Date docDate = null;
+
+        if (this.responseHeader != null) {
+            docDate = this.responseHeader.lastModified();
+            if (docDate == null) docDate = this.responseHeader.date();
+        }
+        if (docDate == null) docDate = new Date(serverDate.correctedUTCTime());
+
+        return docDate;
+    }
+
+    /**
+     * @return the URL given in the Referer field of the request, or
+     * <code>null</code> if there is no request header or the field value
+     * cannot be converted into an URL
+     */
+    public URL getRefererUrl() {
+        if (this.requestHeader == null) return null;
+        try {
+            return new URL((String) this.requestHeader.get(httpHeader.REFERER, ""));
+        } catch (Exception e) {
+            // best effort: a missing or malformed referer is reported as "no referer"
+            return null;
+        }
+    }
+
+    /**
+     * @see de.anomic.plasma.cache.IResourceInfo#getUrl()
+     */
+    public URL getUrl() {
+        return this.url;
+    }
+
+    /**
+     * @see de.anomic.plasma.cache.IResourceInfo#getUrlHash()
+     */
+    public String getUrlHash() {
+        return this.urlHash;
+    }
+
+    /**
+     * @return <code>null</code> if the resource may be indexed, otherwise the
+     * reason (as String) why it must not be indexed
+     * @see de.anomic.plasma.cache.IResourceInfo#shallIndexCacheForCrawler()
+     */
+    public String shallIndexCacheForCrawler() {
+        String mimeType = this.getMimeType();
+        if (plasmaHTCache.isPicture(mimeType)) { return "Media_Content_(Picture)"; }
+        if (!plasmaHTCache.isText(mimeType)) { return "Media_Content_(not_text)"; }
+        return null;
+    }
+
+    /**
+     * @return <code>null</code> if the resource may be indexed, otherwise the
+     * reason (as String) why it must not be indexed
+     * @see de.anomic.plasma.cache.IResourceInfo#shallIndexCacheForProxy()
+     */
+    public String shallIndexCacheForProxy() {
+        // -set-cookie in response
+        // the set-cookie from the server does not indicate that the content is special
+        // thus we do not care about it here for indexing
+
+        // a picture cannot be indexed
+        String mimeType = this.getMimeType();
+        if (plasmaHTCache.isPicture(mimeType)) {
+            return "Media_Content_(Picture)";
+        }
+        if (!plasmaHTCache.isText(mimeType)) {
+            return "Media_Content_(not_text)";
+        }
+
+        // -if-modified-since in request
+        // if the page is fresh at the very moment we can index it
+        // the date has to come from the request: taking the modification date of the
+        // response here would compare Last-Modified with itself and never detect staleness
+        Date ifModifiedSince = ifModifiedSince();
+        if ((ifModifiedSince != null) && (this.responseHeader.containsKey(httpHeader.LAST_MODIFIED))) {
+            // parse date
+            Date d = this.responseHeader.lastModified();
+            if (d == null) {
+                d = new Date(serverDate.correctedUTCTime());
+            }
+            // finally, we shall treat the cache as stale if the modification time is after the if-.. time
+            if (d.after(ifModifiedSince)) {
+                //System.out.println("***not indexed because if-modified-since");
+                return "Stale_(Last-Modified>Modified-Since)";
+            }
+        }
+
+        // -pragma in cached response
+        if (this.responseHeader.containsKey(httpHeader.PRAGMA) &&
+            ((String) this.responseHeader.get(httpHeader.PRAGMA)).toUpperCase().equals("NO-CACHE")) {
+            return "Denied_(pragma_no_cache)";
+        }
+
+        // see for documentation also:
+        // http://www.web-caching.com/cacheability.html
+
+        // look for freshness information
+
+        // -expires in cached response
+        // the expires value gives us a very easy hint when the cache is stale
+        // sometimes, the expires date is set to the past to prevent that a page is cached
+        // we use that information to see if we should index it
+        final Date expires = this.responseHeader.expires();
+        if (expires != null && expires.before(new Date(serverDate.correctedUTCTime()))) {
+            return "Stale_(Expired)";
+        }
+
+        // -lastModified in cached response
+        // this information is too weak to use it to prevent indexing
+        // even if we can apply a TTL heuristic for cache usage
+
+        // -cache-control in cached response
+        // the cache-control has many value options.
+        String cacheControl = (String) this.responseHeader.get(httpHeader.CACHE_CONTROL);
+        if (cacheControl != null) {
+            cacheControl = cacheControl.trim().toUpperCase();
+            /* we have the following cases for cache-control:
+               "public" -- can be indexed
+               "private", "no-cache", "no-store" -- cannot be indexed
+               "max-age=" -- stale/fresh dependent on date
+             */
+            if (cacheControl.startsWith("PRIVATE") ||
+                cacheControl.startsWith("NO-CACHE") ||
+                cacheControl.startsWith("NO-STORE")) {
+                // easy case
+                return "Stale_(denied_by_cache-control=" + cacheControl + ")";
+//          } else if (cacheControl.startsWith("PUBLIC")) {
+//              // ok, do nothing
+            } else if (cacheControl.startsWith("MAX-AGE=")) {
+                // we need also the load date
+                final Date date = this.responseHeader.date();
+                if (date == null) {
+                    return "Stale_(no_date_given_in_response)";
+                }
+                try {
+                    final long ttl = 1000 * Long.parseLong(cacheControl.substring(8)); // milliseconds to live
+                    if (serverDate.correctedUTCTime() - date.getTime() > ttl) {
+                        //System.out.println("***not indexed because cache-control");
+                        return "Stale_(expired_by_cache-control)";
+                    }
+                } catch (Exception e) {
+                    return "Error_(" + e.getMessage() + ")";
+                }
+            }
+        }
+        return null;
+    }
+
+    /**
+     * Decides whether the resource may be written to the proxy cache.
+     * @return <code>null</code> if the resource may be stored, otherwise the
+     * reason (as String) why it must not be stored
+     */
+    public String shallStoreCacheForProxy() {
+        if (this.requestHeader != null) {
+            // -authorization cases in request
+            // authorization makes pages very individual, and therefore we cannot use the
+            // content in the cache
+            if (this.requestHeader.containsKey(httpHeader.AUTHORIZATION)) { return "personalized"; }
+            // -ranges in request and response
+            // we do not cache partial content
+            if (this.requestHeader.containsKey(httpHeader.RANGE)) { return "partial"; }
+        }
+
+        if (this.responseHeader != null) {
+            // -ranges in request and response
+            // we do not cache partial content
+            if (this.responseHeader.containsKey(httpHeader.CONTENT_RANGE)) { return "partial"; }
+
+            // -if-modified-since in request
+            // we do not care about if-modified-since, because this case only occurs if the
+            // cache file does not exist, and we need as much info as possible for the indexing
+
+            // -cookies in request
+            // we do not care about cookies, because that would prevent loading more pages
+            // from one domain once a request resulted in a client-side stored cookie
+
+            // -set-cookie in response
+            // we do not care about cookies in responses, because that info comes along
+            // any/many pages from a server and does not express the validity of the page
+            // in modes of life-time/expiration or individuality
+
+            // -pragma in response
+            // if we have a pragma no-cache, we don't cache. usually if this is wanted from
+            // the server, it makes sense
+            String cacheControl = (String) this.responseHeader.get(httpHeader.PRAGMA);
+            if (cacheControl != null && cacheControl.trim().toUpperCase().equals("NO-CACHE")) { return "controlled_no_cache"; }
+
+            // -expires in response
+            // we do not care about expires, because at the time this is called the data is
+            // obviously valid and that header info is used in the indexing later on
+
+            // -cache-control in response
+            // the cache-control has many value options.
+            cacheControl = (String) this.responseHeader.get(httpHeader.CACHE_CONTROL);
+            if (cacheControl != null) {
+                cacheControl = cacheControl.trim().toUpperCase();
+                if (cacheControl.startsWith("MAX-AGE=")) {
+                    // we need also the load date
+                    Date date = this.responseHeader.date();
+                    if (date == null) return "stale_no_date_given_in_response";
+                    try {
+                        long ttl = 1000 * Long.parseLong(cacheControl.substring(8)); // milliseconds to live
+                        if (serverDate.correctedUTCTime() - date.getTime() > ttl) {
+                            //System.out.println("***not indexed because cache-control");
+                            return "stale_expired";
+                        }
+                    } catch (Exception e) {
+                        return "stale_error_" + e.getMessage() + ")";
+                    }
+                }
+            }
+        }
+        return null;
+    }
+
+    /**
+     * Decides whether a cached copy of the resource may be delivered by the proxy.
+     * <p>
+     * All checks that read the request header are skipped if no request header
+     * is available (e.g. if this object was created from a stored header map).
+     * @return <code>true</code> if the cached copy is considered fresh and may
+     * be used, <code>false</code> otherwise
+     */
+    public boolean shallUseCacheForProxy() {
+
+        String cacheControl;
+        if (this.requestHeader != null) {
+            // -authorization cases in request
+            if (this.requestHeader.containsKey(httpHeader.AUTHORIZATION)) { return false; }
+
+            // -ranges in request
+            // we do not cache partial content
+            if (this.requestHeader.containsKey(httpHeader.RANGE)) { return false; }
+
+            // if the client requests a un-cached copy of the resource ...
+            cacheControl = (String) this.requestHeader.get(httpHeader.PRAGMA);
+            if (cacheControl != null && cacheControl.trim().toUpperCase().equals("NO-CACHE")) { return false; }
+
+            cacheControl = (String) this.requestHeader.get(httpHeader.CACHE_CONTROL);
+            if (cacheControl != null) {
+                cacheControl = cacheControl.trim().toUpperCase();
+                if (cacheControl.startsWith("NO-CACHE") || cacheControl.startsWith("MAX-AGE=0")) { return false; }
+            }
+
+            // -if-modified-since in request
+            // The entity has to be transferred only if it has
+            // been modified since the date given by the If-Modified-Since header.
+            // (this check lives inside the null-guard: without a request header there is
+            // no If-Modified-Since condition to evaluate)
+            if (this.requestHeader.containsKey(httpHeader.IF_MODIFIED_SINCE)) {
+                // checking this makes only sense if the cached response contains
+                // a Last-Modified field. If the field does not exist, we go the safe way
+                if (!this.responseHeader.containsKey(httpHeader.LAST_MODIFIED)) { return false; }
+                // parse date
+                Date d1, d2;
+                d2 = this.responseHeader.lastModified(); if (d2 == null) { d2 = new Date(serverDate.correctedUTCTime()); }
+                d1 = this.requestHeader.ifModifiedSince(); if (d1 == null) { d1 = new Date(serverDate.correctedUTCTime()); }
+                // finally, we shall treat the cache as stale if the modification time is after the if-.. time
+                if (d2.after(d1)) { return false; }
+            }
+        }
+
+        String mimeType = this.getMimeType();
+        if (!plasmaHTCache.isPicture(mimeType)) {
+            // -cookies in request
+            // unfortunately, we should reload in case of a cookie
+            // but we think that pictures can still be considered as fresh
+            // -set-cookie in cached response
+            // this is a similar case as for COOKIE.
+            final boolean cookieInRequest =
+                (this.requestHeader != null) && this.requestHeader.containsKey(httpHeader.COOKIE);
+            if (cookieInRequest ||
+                this.responseHeader.containsKey(httpHeader.SET_COOKIE) ||
+                this.responseHeader.containsKey(httpHeader.SET_COOKIE2)) {
+                return false; // too strong
+            }
+        }
+
+        // -pragma in cached response
+        // logically, we would not need to care about no-cache pragmas in cached response headers,
+        // because they cannot exist since they are not written to the cache.
+        // So this IF should always fail..
+        cacheControl = (String) this.responseHeader.get(httpHeader.PRAGMA);
+        if (cacheControl != null && cacheControl.trim().toUpperCase().equals("NO-CACHE")) { return false; }
+
+        // see for documentation also:
+        // http://www.web-caching.com/cacheability.html
+        // http://vancouver-webpages.com/CacheNow/
+
+        // look for freshness information
+        // if we don't have any freshness indication, we treat the file as stale.
+        // no handle for freshness control:
+
+        // -expires in cached response
+        // the expires value gives us a very easy hint when the cache is stale
+        Date expires = this.responseHeader.expires();
+        if (expires != null) {
+//          System.out.println("EXPIRES-TEST: expires=" + expires + ", NOW=" + serverDate.correctedGMTDate() + ", url=" + url);
+            if (expires.before(new Date(serverDate.correctedUTCTime()))) { return false; }
+        }
+        Date lastModified = this.responseHeader.lastModified();
+        cacheControl = (String) this.responseHeader.get(httpHeader.CACHE_CONTROL);
+        if (cacheControl == null && lastModified == null && expires == null) { return false; }
+
+        // -lastModified in cached response
+        // we can apply a TTL (Time To Live) heuristic here. We call the time delta between the last read
+        // of the file and the last modified date as the age of the file. If we consider the file as
+        // middle-aged then, the maximum TTL would be cache-creation plus age.
+        // This would be a TTL factor of 100% we want no more than 10% TTL, so that a 10 month old cache
+        // file may only be treated as fresh for one more month, not more.
+        Date date = this.responseHeader.date();
+        if (lastModified != null) {
+            if (date == null) { date = new Date(serverDate.correctedUTCTime()); }
+            long age = date.getTime() - lastModified.getTime();
+            if (age < 0) { return false; }
+            // TTL (Time-To-Live) is age/10 = (d2.getTime() - d1.getTime()) / 10
+            // the actual living-time is serverDate.correctedGMTDate().getTime() - d2.getTime()
+            // therefore the cache is stale, if serverDate.correctedGMTDate().getTime() - d2.getTime() > age/10
+            if (serverDate.correctedUTCTime() - date.getTime() > age / 10) { return false; }
+        }
+
+        // -cache-control in cached response
+        // the cache-control has many value options.
+        if (cacheControl != null) {
+            cacheControl = cacheControl.trim().toUpperCase();
+            if (cacheControl.startsWith("PRIVATE") ||
+                cacheControl.startsWith("NO-CACHE") ||
+                cacheControl.startsWith("NO-STORE")) {
+                // easy case
+                return false;
+//          } else if (cacheControl.startsWith("PUBLIC")) {
+//              // ok, do nothing
+            } else if (cacheControl.startsWith("MAX-AGE=")) {
+                // we need also the load date
+                if (date == null) { return false; }
+                try {
+                    final long ttl = 1000 * Long.parseLong(cacheControl.substring(8)); // milliseconds to live
+                    if (serverDate.correctedUTCTime() - date.getTime() > ttl) {
+                        return false;
+                    }
+                } catch (Exception e) {
+                    // an unparsable max-age value is treated as "not fresh"
+                    return false;
+                }
+            }
+        }
+        return true;
+    }
+
+    /**
+     * @param responseStatus the status line/code of the response, must not be <code>null</code>
+     * @return <code>true</code> if the status starts with "200" or "203"
+     */
+    public boolean validResponseStatus(String responseStatus) {
+        return responseStatus.startsWith("200") ||
+               responseStatus.startsWith("203");
+    }
+
+    /**
+     * @return the If-Modified-Since date of the request as delivered by the
+     * request header, or <code>null</code> if there is no request header
+     */
+    public Date ifModifiedSince() {
+        return (this.requestHeader == null) ? null : this.requestHeader.ifModifiedSince();
+    }
+
+    /**
+     * @return <code>true</code> if a request header is available and it
+     * contains a Cookie field
+     */
+    public boolean requestWithCookie() {
+        return (this.requestHeader == null) ? false : this.requestHeader.containsKey(httpHeader.COOKIE);
+    }
+
+    /**
+     * @return <code>true</code> if a request header is available and its
+     * X-YaCy-Index-Control field has the value "NO-INDEX" (case-insensitive)
+     */
+    public boolean requestProhibitsIndexing() {
+        return (this.requestHeader == null)
+        ? false
+        : this.requestHeader.containsKey(httpHeader.X_YACY_INDEX_CONTROL) &&
+          ((String)this.requestHeader.get(httpHeader.X_YACY_INDEX_CONTROL)).toUpperCase().equals("NO-INDEX");
+    }
+
+    /**
+     * @return the request header, may be <code>null</code>
+     */
+    public httpHeader getRequestHeader() {
+        return this.requestHeader;
+    }
+
+    /**
+     * @return the response header
+     */
+    public httpHeader getResponseHeader() {
+        return this.responseHeader;
+    }
+}
diff --git a/source/de/anomic/plasma/crawler/http/CrawlWorker.java b/source/de/anomic/plasma/crawler/http/CrawlWorker.java index 5eaafd77b..dcddcdc1f 100644 --- a/source/de/anomic/plasma/crawler/http/CrawlWorker.java +++ b/source/de/anomic/plasma/crawler/http/CrawlWorker.java @@ -64,6 +64,8 @@ import de.anomic.plasma.plasmaCrawlLoader; import de.anomic.plasma.plasmaHTCache; import de.anomic.plasma.plasmaParser; import de.anomic.plasma.plasmaSwitchboard; +import de.anomic.plasma.cache.IResourceInfo; +import de.anomic.plasma.cache.http.ResourceInfo; import de.anomic.plasma.crawler.AbstractCrawlWorker; import de.anomic.plasma.crawler.plasmaCrawlerPool; import de.anomic.plasma.urlPattern.plasmaURLPattern; @@ -129,15 +131,15 @@ public final class CrawlWorker extends AbstractCrawlWorker { return load(DEFAULT_CRAWLING_RETRY_COUNT); } - protected plasmaHTCache.Entry createCacheEntry(Date requestDate, httpHeader requestHeader, httpc.response response) { + protected plasmaHTCache.Entry createCacheEntry(URL requestUrl, Date requestDate, httpHeader requestHeader, httpc.response response) { + IResourceInfo resourceInfo = new ResourceInfo(requestUrl,requestHeader,response.responseHeader); return this.cacheManager.newEntry( requestDate, this.depth, this.url, - this.name, - requestHeader, - response.status, - response.responseHeader, + this.name, + response.status, + resourceInfo, this.initiator, this.profile ); @@ -197,7 +199,7 @@ public final class CrawlWorker extends AbstractCrawlWorker { // the transfer is ok // create a new cache entry - htCache = createCacheEntry(requestDate, requestHeader, res); + htCache = 
createCacheEntry(this.url,requestDate, requestHeader, res); // aborting download if content is to long ... if (htCache.cacheFile().getAbsolutePath().length() > serverSystem.maxPathLength) { diff --git a/source/de/anomic/plasma/plasmaCrawlStacker.java b/source/de/anomic/plasma/plasmaCrawlStacker.java index 0b444dc50..103962769 100644 --- a/source/de/anomic/plasma/plasmaCrawlStacker.java +++ b/source/de/anomic/plasma/plasmaCrawlStacker.java @@ -295,7 +295,7 @@ public final class plasmaCrawlStacker { } // check if ip is local ip address - checkInterruption(); + checkInterruption(); // TODO: this is protocol specific InetAddress hostAddress = httpc.dnsResolve(nexturl.getHost()); if (hostAddress == null) { // if a http proxy is configured name resolution may not work diff --git a/source/de/anomic/plasma/plasmaHTCache.java b/source/de/anomic/plasma/plasmaHTCache.java index c0b40bca4..99b3bf474 100644 --- a/source/de/anomic/plasma/plasmaHTCache.java +++ b/source/de/anomic/plasma/plasmaHTCache.java @@ -54,14 +54,12 @@ package de.anomic.plasma; import de.anomic.http.httpc; -import de.anomic.http.httpHeader; import de.anomic.index.indexEntryAttribute; import de.anomic.index.indexURL; import de.anomic.kelondro.kelondroDyn; import de.anomic.kelondro.kelondroMap; import de.anomic.kelondro.kelondroMScoreCluster; import de.anomic.server.logging.serverLog; -import de.anomic.server.serverDate; import de.anomic.server.serverFileUtils; import de.anomic.server.serverInstantThread; import de.anomic.server.serverSystem; @@ -73,6 +71,9 @@ import java.io.IOException; import java.net.InetAddress; import java.net.MalformedURLException; import de.anomic.net.URL; +import de.anomic.plasma.cache.IResourceInfo; +import de.anomic.plasma.cache.ResourceInfoFactory; + import java.util.Date; import java.util.HashSet; import java.util.Iterator; @@ -87,7 +88,7 @@ public final class plasmaHTCache { private static final int stackLimit = 150; // if we exceed that limit, we do not check idle public static 
final long oneday = 1000 * 60 * 60 * 24; // milliseconds of a day - private kelondroMap responseHeaderDB = null; + kelondroMap responseHeaderDB = null; private final LinkedList cacheStack; private final TreeMap cacheAge; // a - relation public long curCacheSize; @@ -96,11 +97,16 @@ public final class plasmaHTCache { public final serverLog log; public static final HashSet filesInUse = new HashSet(); // can we delete this file + private ResourceInfoFactory objFactory; + public plasmaHTCache(File htCachePath, long maxCacheSize, int bufferkb, long preloadTime) { // this.switchboard = switchboard; this.log = new serverLog("HTCACHE"); this.cachePath = htCachePath; + + // create the object factory + this.objFactory = new ResourceInfoFactory(); // reset old HTCache ? String[] list = this.cachePath.list(); @@ -229,10 +235,6 @@ public final class plasmaHTCache { } } - public void storeHeader(String urlHash, httpHeader responseHeader) throws IOException { - this.responseHeaderDB.set(urlHash, responseHeader); - } - /** * This method changes the HTCache size.
* @param new cache size in bytes @@ -249,7 +251,7 @@ public final class plasmaHTCache { return (this.curCacheSize >= this.maxCacheSize) ? 0 : this.maxCacheSize - this.curCacheSize; } - public boolean writeFile(URL url, byte[] array) { + public boolean writeResourceContent(URL url, byte[] array) { if (array == null) return false; File file = getCachePath(url); try { @@ -445,10 +447,24 @@ public final class plasmaHTCache { return prefix + s.substring(0, p); } - public httpHeader getCachedResponse(String urlHash) throws IOException { + /** + * Returns an object containing metadata about a cached resource + * @param url the url of the resource + * @return an {@link IResourceInfo info object} + * @throws Exception of the info object could not be created, e.g. if the protocol is not supported + */ + public IResourceInfo loadResourceInfo(URL url) throws Exception { + + // getting the URL hash + String urlHash = indexURL.urlHash(url.toNormalform()); + + // loading data from database Map hdb = this.responseHeaderDB.get(urlHash); if (hdb == null) return null; - return new httpHeader(null, hdb); + + // generate the cached object + IResourceInfo cachedObj = this.objFactory.buildResourceInfoObj(url, hdb); + return cachedObj; } public boolean full() { @@ -459,18 +475,17 @@ public final class plasmaHTCache { return (this.cacheStack.size() == 0); } - public static boolean isPicture(httpHeader response) { - Object ct = response.get(httpHeader.CONTENT_TYPE); - if (ct == null) return false; - return ((String)ct).toUpperCase().startsWith("IMAGE"); + public static boolean isPicture(String mimeType) { + if (mimeType == null) return false; + return mimeType.toUpperCase().startsWith("IMAGE"); } - public static boolean isText(httpHeader response) { + public static boolean isText(String mimeType) { // Object ct = response.get(httpHeader.CONTENT_TYPE); // if (ct == null) return false; // String t = ((String)ct).toLowerCase(); // return ((t.startsWith("text")) || 
(t.equals("application/xhtml+xml"))); - return plasmaParser.supportedMimeTypesContains(response.mime()); + return plasmaParser.supportedMimeTypesContains(mimeType); } public static boolean noIndexingURL(String urlString) { @@ -568,9 +583,8 @@ public final class plasmaHTCache { } if (port < 0) { return new File(this.cachePath, protocol + "/" + host + path); - } else { - return new File(this.cachePath, protocol + "/" + host + "!" + port + path); } + return new File(this.cachePath, protocol + "/" + host + "!" + port + path); } /** @@ -663,7 +677,7 @@ public final class plasmaHTCache { return null; } - public byte[] loadResource(URL url) { + public byte[] loadResourceContent(URL url) { // load the url as resource from the cache File f = getCachePath(url); if (f.exists()) try { @@ -690,12 +704,30 @@ public final class plasmaHTCache { (ls.indexOf("memberlist.php?sid=") >= 0)); } - public Entry newEntry(Date initDate, int depth, URL url, String name, - httpHeader requestHeader, - String responseStatus, httpHeader responseHeader, - String initiator, - plasmaCrawlProfile.entry profile) { - return new Entry(initDate, depth, url, name, requestHeader, responseStatus, responseHeader, initiator, profile); + public Entry newEntry( + Date initDate, + int depth, + URL url, + String name, + //httpHeader requestHeader, + String responseStatus, + //httpHeader responseHeader, + IResourceInfo docInfo, + String initiator, + plasmaCrawlProfile.entry profile + ) { + return new Entry( + initDate, + depth, + url, + name, + //requestHeader, + responseStatus, + //responseHeader, + docInfo, + initiator, + profile + ); } public final class Entry { @@ -703,9 +735,9 @@ public final class plasmaHTCache { // the class objects private Date initDate; // the date when the request happened; will be used as a key private int depth; // the depth of prefetching - private httpHeader requestHeader; // we carry also the header to prevent too many file system access - private String responseStatus; - private 
httpHeader responseHeader; // we carry also the header to prevent too many file system access +// private httpHeader requestHeader; // we carry also the header to prevent too many file system access +// private httpHeader responseHeader; // we carry also the header to prevent too many file system access + private String responseStatus; private File cacheFile; // the cache file private byte[] cacheArray; // or the cache as byte-array private URL url; @@ -718,6 +750,11 @@ public final class plasmaHTCache { private String language; private plasmaCrawlProfile.entry profile; private String initiator; + + /** + * protocolspecific information about the resource + */ + private IResourceInfo resInfo; protected Object clone() throws CloneNotSupportedException { return new Entry( @@ -725,9 +762,10 @@ public final class plasmaHTCache { this.depth, this.url, this.name, - this.requestHeader, + //this.requestHeader, this.responseStatus, - this.responseHeader, + //this.responseHeader, + this.resInfo, this.initiator, this.profile ); @@ -737,15 +775,21 @@ public final class plasmaHTCache { int depth, URL url, String name, - httpHeader requestHeader, - String responseStatus, - httpHeader responseHeader, + //httpHeader requestHeader, + String responseStatus, + //httpHeader responseHeader, + IResourceInfo resourceInfo, String initiator, plasmaCrawlProfile.entry profile ) { - + if (resourceInfo == null){ + System.out.println("Content information object is null. 
" + url); + System.exit(0); + } + this.resInfo = resourceInfo; + + // normalize url -// serverLog.logFine("PLASMA", "Entry: URL=" + url.toString()); this.nomalizedURLString = url.toNormalform(); try { @@ -761,28 +805,17 @@ public final class plasmaHTCache { // assigned: this.initDate = initDate; this.depth = depth; - this.requestHeader = requestHeader; + //this.requestHeader = requestHeader; this.responseStatus = responseStatus; - this.responseHeader = responseHeader; + //this.responseHeader = responseHeader; this.profile = profile; this.initiator = (initiator == null) ? null : ((initiator.length() == 0) ? null : initiator); - // calculated: - if (responseHeader == null) { - try { - throw new RuntimeException("RESPONSE HEADER = NULL"); - } catch (Exception e) { - System.out.println("RESPONSE HEADER = NULL in " + url); - e.printStackTrace(); - System.exit(0); - } - - this.lastModified = new Date(serverDate.correctedUTCTime()); - } else { - this.lastModified = responseHeader.lastModified(); - if (this.lastModified == null) this.lastModified = new Date(serverDate.correctedUTCTime()); // does not exist in header - } - this.doctype = indexEntryAttribute.docType(responseHeader.mime()); + // getting the last modified date + this.lastModified = resourceInfo.getModificationDate(); + + // getting the doctype + this.doctype = indexEntryAttribute.docType(resourceInfo.getMimeType()); if (this.doctype == indexEntryAttribute.DT_UNKNOWN) this.doctype = indexEntryAttribute.docType(url); this.language = indexEntryAttribute.language(url); @@ -822,12 +855,7 @@ public final class plasmaHTCache { } public URL referrerURL() { - if (this.requestHeader == null) return null; - try { - return new URL((String) this.requestHeader.get(httpHeader.REFERER, "")); - } catch (Exception e) { - return null; - } + return (this.resInfo==null)?null:this.resInfo.getRefererUrl(); } public File cacheFile() { @@ -846,27 +874,36 @@ public final class plasmaHTCache { // return this.requestHeader; // } - public 
httpHeader responseHeader() { - return this.responseHeader; +// public httpHeader responseHeader() { +// return this.responseHeader; +// } + + public IResourceInfo getDocumentInfo() { + return this.resInfo; } + public boolean writeResourceInfo() throws IOException { + assert(this.nomalizedURLHash != null) : "URL Hash is null"; + if (this.resInfo == null) return false; + + plasmaHTCache.this.responseHeaderDB.set(this.nomalizedURLHash, this.resInfo.getMap()); + return true; + } + public String getMimeType() { - return (this.responseHeader == null) ? null : this.responseHeader.mime(); + return (this.resInfo == null) ? null : this.resInfo.getMimeType(); } public Date ifModifiedSince() { - return (this.requestHeader == null) ? null : this.requestHeader.ifModifiedSince(); + return (this.resInfo == null) ? null : this.resInfo.ifModifiedSince(); } public boolean requestWithCookie() { - return (this.requestHeader == null) ? false : this.requestHeader.containsKey(httpHeader.COOKIE); + return (this.resInfo == null) ? false : this.resInfo.requestWithCookie(); } public boolean requestProhibitsIndexing() { - return (this.requestHeader == null) - ? false - : this.requestHeader.containsKey(httpHeader.X_YACY_INDEX_CONTROL) && - ((String)this.requestHeader.get(httpHeader.X_YACY_INDEX_CONTROL)).toUpperCase().equals("NO-INDEX"); + return (this.resInfo == null) ? 
false : this.resInfo.requestProhibitsIndexing(); } /* @@ -878,9 +915,10 @@ public final class plasmaHTCache { // the following three methods for cache read/write granting shall be as loose as possible // but also as strict as necessary to enable caching of most items + /** + * @return NULL if the answer is TRUE, in case of FALSE, the reason as String is returned + */ public String shallStoreCacheForProxy() { - // returns NULL if the answer is TRUE - // in case of FALSE, the reason as String is returned // check profile (disabled: we will check this in the plasmaSwitchboard) //if (!this.profile.storeHTCache()) { return "storage_not_wanted"; } @@ -889,8 +927,11 @@ public final class plasmaHTCache { // if the storage was requested by prefetching, the request map is null // check status code - if (!(this.responseStatus.startsWith("200") || - this.responseStatus.startsWith("203"))) { return "bad_status_" + this.responseStatus.substring(0,3); } + if ((this.resInfo != null) && (!this.resInfo.validResponseStatus(this.responseStatus))) { + return "bad_status_" + this.responseStatus.substring(0,3); + } +// if (!(this.responseStatus.startsWith("200") || +// this.responseStatus.startsWith("203"))) { return "bad_status_" + this.responseStatus.substring(0,3); } // check storage location // sometimes a file name is equal to a path name in the same directory; @@ -905,62 +946,10 @@ public final class plasmaHTCache { if (isPOST(this.nomalizedURLString) && !this.profile.crawlingQ()) { return "dynamic_post"; } if (isCGI(this.nomalizedURLString)) { return "dynamic_cgi"; } - if (this.requestHeader != null) { - // -authorization cases in request - // authorization makes pages very individual, and therefore we cannot use the - // content in the cache - if (this.requestHeader.containsKey(httpHeader.AUTHORIZATION)) { return "personalized"; } - // -ranges in request and response - // we do not cache partial content - if (this.requestHeader.containsKey(httpHeader.RANGE)) { return "partial"; } 
- } - // -ranges in request and response - // we do not cache partial content - if (this.responseHeader != null && this.responseHeader.containsKey(httpHeader.CONTENT_RANGE)) { return "partial"; } - - // -if-modified-since in request - // we do not care about if-modified-since, because this case only occurres if the - // cache file does not exist, and we need as much info as possible for the indexing - - // -cookies in request - // we do not care about cookies, because that would prevent loading more pages - // from one domain once a request resulted in a client-side stored cookie - - // -set-cookie in response - // we do not care about cookies in responses, because that info comes along - // any/many pages from a server and does not express the validity of the page - // in modes of life-time/expiration or individuality - - // -pragma in response - // if we have a pragma non-cache, we don't cache. usually if this is wanted from - // the server, it makes sense - String cacheControl = (String) this.responseHeader.get(httpHeader.PRAGMA); - if (cacheControl != null && cacheControl.trim().toUpperCase().equals("NO-CACHE")) { return "controlled_no_cache"; } - - // -expires in response - // we do not care about expires, because at the time this is called the data is - // obvious valid and that header info is used in the indexing later on - - // -cache-control in response - // the cache-control has many value options. 
- cacheControl = (String) this.responseHeader.get(httpHeader.CACHE_CONTROL); - if (cacheControl != null) { - cacheControl = cacheControl.trim().toUpperCase(); - if (cacheControl.startsWith("MAX-AGE=")) { - // we need also the load date - Date date = this.responseHeader.date(); - if (date == null) return "stale_no_date_given_in_response"; - try { - long ttl = 1000 * Long.parseLong(cacheControl.substring(8)); // milliseconds to live - if (serverDate.correctedUTCTime() - date.getTime() > ttl) { - //System.out.println("***not indexed because cache-control"); - return "stale_expired"; - } - } catch (Exception e) { - return "stale_error_" + e.getMessage() + ")"; - } - } + if (this.resInfo != null) { + return this.resInfo.shallStoreCacheForProxy(); } + return null; } @@ -971,146 +960,17 @@ public final class plasmaHTCache { public boolean shallUseCacheForProxy() { // System.out.println("SHALL READ CACHE: requestHeader = " + requestHeader.toString() + ", responseHeader = " + responseHeader.toString()); - String cacheControl; - if (this.requestHeader != null) { - // -authorization cases in request - if (this.requestHeader.containsKey(httpHeader.AUTHORIZATION)) { return false; } - - // -ranges in request - // we do not cache partial content - if (this.requestHeader.containsKey(httpHeader.RANGE)) { return false; } - - // if the client requests a un-cached copy of the resource ... 
- cacheControl = (String) this.requestHeader.get(httpHeader.PRAGMA); - if (cacheControl != null && cacheControl.trim().toUpperCase().equals("NO-CACHE")) { return false; } - - cacheControl = (String) this.requestHeader.get(httpHeader.CACHE_CONTROL); - if (cacheControl != null) { - cacheControl = cacheControl.trim().toUpperCase(); - if (cacheControl.startsWith("NO-CACHE") || cacheControl.startsWith("MAX-AGE=0")) { return false; } - } - } - // -CGI access in request // CGI access makes the page very individual, and therefore not usable in caches if (isPOST(this.nomalizedURLString)) { return false; } if (isCGI(this.nomalizedURLString)) { return false; } - - // -if-modified-since in request - // The entity has to be transferred only if it has - // been modified since the date given by the If-Modified-Since header. - if (this.requestHeader.containsKey(httpHeader.IF_MODIFIED_SINCE)) { - // checking this makes only sense if the cached response contains - // a Last-Modified field. If the field does not exist, we go the safe way - if (!this.responseHeader.containsKey(httpHeader.LAST_MODIFIED)) { return false; } - // parse date - Date d1, d2; - d2 = this.responseHeader.lastModified(); if (d2 == null) { d2 = new Date(serverDate.correctedUTCTime()); } - d1 = this.requestHeader.ifModifiedSince(); if (d1 == null) { d1 = new Date(serverDate.correctedUTCTime()); } - // finally, we shall treat the cache as stale if the modification time is after the if-.. time - if (d2.after(d1)) { return false; } - } - - if (!isPicture(this.responseHeader)) { - // -cookies in request - // unfortunately, we should reload in case of a cookie - // but we think that pictures can still be considered as fresh - // -set-cookie in cached response - // this is a similar case as for COOKIE. 
- if (this.requestHeader.containsKey(httpHeader.COOKIE) || - this.responseHeader.containsKey(httpHeader.SET_COOKIE) || - this.responseHeader.containsKey(httpHeader.SET_COOKIE2)) { - return false; // too strong - } - } - - // -pragma in cached response - // logically, we would not need to care about no-cache pragmas in cached response headers, - // because they cannot exist since they are not written to the cache. - // So this IF should always fail.. - cacheControl = (String) this.responseHeader.get(httpHeader.PRAGMA); - if (cacheControl != null && cacheControl.trim().toUpperCase().equals("NO-CACHE")) { return false; } - - // see for documentation also: - // http://www.web-caching.com/cacheability.html - // http://vancouver-webpages.com/CacheNow/ - - // look for freshnes information - // if we don't have any freshnes indication, we treat the file as stale. - // no handle for freshness control: - - // -expires in cached response - // the expires value gives us a very easy hint when the cache is stale - Date expires = this.responseHeader.expires(); - if (expires != null) { -// System.out.println("EXPIRES-TEST: expires=" + expires + ", NOW=" + serverDate.correctedGMTDate() + ", url=" + url); - if (expires.before(new Date(serverDate.correctedUTCTime()))) { return false; } - } - Date lastModified = this.responseHeader.lastModified(); - cacheControl = (String) this.responseHeader.get(httpHeader.CACHE_CONTROL); - if (cacheControl == null && lastModified == null && expires == null) { return false; } - - // -lastModified in cached response - // we can apply a TTL (Time To Live) heuristic here. We call the time delta between the last read - // of the file and the last modified date as the age of the file. If we consider the file as - // middel-aged then, the maximum TTL would be cache-creation plus age. - // This would be a TTL factor of 100% we want no more than 10% TTL, so that a 10 month old cache - // file may only be treated as fresh for one more month, not more. 
- Date date = this.responseHeader.date(); - if (lastModified != null) { - if (date == null) { date = new Date(serverDate.correctedUTCTime()); } - long age = date.getTime() - lastModified.getTime(); - if (age < 0) { return false; } - // TTL (Time-To-Live) is age/10 = (d2.getTime() - d1.getTime()) / 10 - // the actual living-time is serverDate.correctedGMTDate().getTime() - d2.getTime() - // therefore the cache is stale, if serverDate.correctedGMTDate().getTime() - d2.getTime() > age/10 - if (serverDate.correctedUTCTime() - date.getTime() > age / 10) { return false; } - } - - // -cache-control in cached response - // the cache-control has many value options. - if (cacheControl != null) { - cacheControl = cacheControl.trim().toUpperCase(); - if (cacheControl.startsWith("PRIVATE") || - cacheControl.startsWith("NO-CACHE") || - cacheControl.startsWith("NO-STORE")) { - // easy case - return false; -// } else if (cacheControl.startsWith("PUBLIC")) { -// // ok, do nothing - } else if (cacheControl.startsWith("MAX-AGE=")) { - // we need also the load date - if (date == null) { return false; } - try { - final long ttl = 1000 * Long.parseLong(cacheControl.substring(8)); // milliseconds to live - if (serverDate.correctedUTCTime() - date.getTime() > ttl) { - return false; - } - } catch (Exception e) { - return false; - } - } + + if (this.resInfo != null) { + return this.resInfo.shallUseCacheForProxy(); } + return true; } } // class Entry - - /* - public static void main(String[] args) { - //String[] s = TimeZone.getAvailableIDs(); - //for (int i = 0; i < s.length; i++) System.out.println("ZONE=" + s[i]); - Calendar c = GregorianCalendar.getInstance(); - int zoneOffset = c.get(Calendar.ZONE_OFFSET)/(60*60*1000); - int DSTOffset = c.get(Calendar.DST_OFFSET)/(60*60*1000); - System.out.println("This Offset = " + (zoneOffset + DSTOffset)); - for (int i = 0; i < 12; i++) { - c = new GregorianCalendar(TimeZone.getTimeZone("Etc/GMT-" + i)); - 
//c.setTimeZone(TimeZone.getTimeZone("Etc/GMT+0")); - System.out.println("Zone offset: "+ - c.get(Calendar.ZONE_OFFSET)/(60*60*1000)); - System.out.println(c.get(GregorianCalendar.HOUR) + ", " + c.getTime() + ", " + c.getTimeInMillis()); - } - } - **/ } diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java index 676f3e047..f6c0fe2b3 100644 --- a/source/de/anomic/plasma/plasmaSnippetCache.java +++ b/source/de/anomic/plasma/plasmaSnippetCache.java @@ -44,6 +44,7 @@ package de.anomic.plasma; import java.io.IOException; import de.anomic.net.URL; +import de.anomic.plasma.cache.IResourceInfo; import de.anomic.plasma.crawler.http.CrawlWorker; import java.util.Enumeration; @@ -167,15 +168,15 @@ public class plasmaSnippetCache { // if the snippet is not in the cache, we can try to get it from the htcache byte[] resource = null; - httpHeader header = null; + IResourceInfo docInfo = null; try { - resource = cacheManager.loadResource(url); + resource = this.cacheManager.loadResourceContent(url); if ((fetchOnline) && (resource == null)) { plasmaHTCache.Entry entry = loadResourceFromWeb(url, 5000); if (entry != null) { - header = entry.responseHeader(); + docInfo = entry.getDocumentInfo(); } - resource = cacheManager.loadResource(url); + resource = this.cacheManager.loadResourceContent(url); source = SOURCE_WEB; } } catch (IOException e) { @@ -185,7 +186,7 @@ public class plasmaSnippetCache { //System.out.println("cannot load document for URL " + url); return new result(null, ERROR_RESOURCE_LOADING, "error loading resource from web, cacheManager returned NULL"); } - plasmaParserDocument document = parseDocument(url, resource, header); + plasmaParserDocument document = parseDocument(url, resource, docInfo); if (document == null) return new result(null, ERROR_PARSER_FAILED, "parser error/failed"); // cannot be parsed //System.out.println("loaded document for URL " + url); @@ -360,18 +361,18 @@ public class plasmaSnippetCache 
{ return parseDocument(url, resource, null); } - public plasmaParserDocument parseDocument(URL url, byte[] resource, httpHeader header) { + public plasmaParserDocument parseDocument(URL url, byte[] resource, IResourceInfo docInfo) { try { if (resource == null) return null; // try to get the header from the htcache directory - if (header == null) { + if (docInfo == null) { try { - header = this.cacheManager.getCachedResponse(indexURL.urlHash(url)); - } catch (IOException e) {} + docInfo = this.cacheManager.loadResourceInfo(url); + } catch (Exception e) {} } - if (header == null) { + if (docInfo == null) { String filename = this.cacheManager.getCachePath(url).getName(); int p = filename.lastIndexOf('.'); if ( // if no extension is available @@ -394,8 +395,8 @@ public class plasmaSnippetCache { } return null; } - if (plasmaParser.supportedMimeTypesContains(header.mime())) { - return this.parser.parseSource(url, header.mime(), resource); + if (plasmaParser.supportedMimeTypesContains(docInfo.getMimeType())) { + return this.parser.parseSource(url, docInfo.getMimeType(), resource); } return null; } catch (InterruptedException e) { @@ -407,10 +408,10 @@ public class plasmaSnippetCache { public byte[] getResource(URL url, boolean fetchOnline, int socketTimeout) { // load the url as resource from the web try { - byte[] resource = cacheManager.loadResource(url); + byte[] resource = cacheManager.loadResourceContent(url); if ((fetchOnline) && (resource == null)) { loadResourceFromWeb(url, (socketTimeout < 0) ? 
-1 : socketTimeout); - resource = cacheManager.loadResource(url); + resource = cacheManager.loadResourceContent(url); } return resource; } catch (IOException e) { diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index fea42ed2d..49f3e15f9 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -829,7 +829,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser /* ========================================================================= * LOCAL IP ADDRESS CHECK * - * check if ip is local ip address + * check if ip is local ip address // TODO: remove this protocol specific code here * ========================================================================= */ InetAddress hostAddress = httpc.dnsResolve(entry.url().getHost()); if (hostAddress == null) { @@ -856,9 +856,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser (entry.profile().storeHTCache()) || (doIndexing && isSupportedContent) ) { - // store response header - if (entry.responseHeader() != null) { - this.cacheManager.storeHeader(entry.urlHash(), entry.responseHeader()); + // store response header + if (entry.writeResourceInfo()) { this.log.logInfo("WROTE HEADER for " + entry.cacheFile()); } @@ -868,7 +867,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } else { String error = entry.shallStoreCacheForProxy(); if (error == null) { - this.cacheManager.writeFile(entry.url(), entry.cacheArray()); + this.cacheManager.writeResourceContent(entry.url(), entry.cacheArray()); this.log.logFine("WROTE FILE (" + entry.cacheArray().length + " bytes) for " + entry.cacheFile()); } else { this.log.logFine("WRITE OF FILE " + entry.cacheFile() + " FORBIDDEN: " + error); diff --git a/source/de/anomic/plasma/plasmaSwitchboardQueue.java b/source/de/anomic/plasma/plasmaSwitchboardQueue.java index 
eaa0e5c9c..eda6f0e90 100644 --- a/source/de/anomic/plasma/plasmaSwitchboardQueue.java +++ b/source/de/anomic/plasma/plasmaSwitchboardQueue.java @@ -44,28 +44,27 @@ package de.anomic.plasma; -import de.anomic.http.httpHeader; +import java.io.File; +import java.io.IOException; +import java.net.MalformedURLException; +import java.util.ArrayList; +import java.util.Date; + import de.anomic.index.indexURL; import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroException; -import de.anomic.kelondro.kelondroStack; import de.anomic.kelondro.kelondroRow; +import de.anomic.kelondro.kelondroStack; +import de.anomic.net.URL; +import de.anomic.plasma.cache.IResourceInfo; import de.anomic.server.logging.serverLog; -import de.anomic.server.serverDate; import de.anomic.yacy.yacySeedDB; -import java.io.File; -import java.io.IOException; -import java.net.MalformedURLException; -import de.anomic.net.URL; -import java.util.ArrayList; -import java.util.Date; - public class plasmaSwitchboardQueue { private kelondroStack sbQueueStack; private plasmaCrawlProfile profiles; - private plasmaHTCache htCache; + plasmaHTCache htCache; private plasmaCrawlLURL lurls; private File sbQueueStackPath; @@ -191,7 +190,7 @@ public class plasmaSwitchboardQueue { // computed values private plasmaCrawlProfile.entry profileEntry; - private httpHeader responseHeader; + private IResourceInfo contentInfo; private URL referrerURL; public Entry(URL url, String referrer, Date ifModifiedSince, boolean requestWithCookie, @@ -206,7 +205,7 @@ public class plasmaSwitchboardQueue { this.anchorName = (anchorName==null)?"":anchorName.trim(); this.profileEntry = null; - this.responseHeader = null; + this.contentInfo = null; this.referrerURL = null; } @@ -227,7 +226,7 @@ public class plasmaSwitchboardQueue { this.anchorName = row.getColString(7, "UTF-8"); this.profileEntry = null; - this.responseHeader = null; + this.contentInfo = null; this.referrerURL = null; } @@ -248,7 +247,7 @@ public class 
plasmaSwitchboardQueue { this.anchorName = (row[7] == null) ? null : (new String(row[7], "UTF-8")).trim(); this.profileEntry = null; - this.responseHeader = null; + this.contentInfo = null; this.referrerURL = null; } @@ -306,32 +305,24 @@ public class plasmaSwitchboardQueue { return profileEntry; } - private httpHeader responseHeader() { - if (responseHeader == null) try { - responseHeader = htCache.getCachedResponse(indexURL.urlHash(url)); - } catch (IOException e) { + private IResourceInfo getCachedObjectInfo() { + if (this.contentInfo == null) try { + this.contentInfo = plasmaSwitchboardQueue.this.htCache.loadResourceInfo(this.url); + } catch (Exception e) { serverLog.logSevere("PLASMA", "responseHeader: failed to get header", e); return null; } - return responseHeader; + return this.contentInfo; } public String getMimeType() { - httpHeader headers = this.responseHeader(); - return (headers == null) ? null : headers.mime(); + IResourceInfo info = this.getCachedObjectInfo(); + return (info == null) ? null : info.getMimeType(); } public Date getModificationDate() { - Date docDate = null; - - httpHeader headers = this.responseHeader(); - if (headers != null) { - docDate = headers.lastModified(); - if (docDate == null) docDate = headers.date(); - } - if (docDate == null) docDate = new Date(); - - return docDate; + IResourceInfo info = this.getCachedObjectInfo(); + return (info == null) ? new Date() : info.getModificationDate(); } public URL referrerURL() { @@ -360,6 +351,8 @@ public class plasmaSwitchboardQueue { * this method returns null if the answer is 'YES'! 
* if the answer is 'NO' (do not index), it returns a string with the reason * to reject the crawling demand in clear text + * + * This function is used by plasmaSwitchboard#processResourceStack */ public final String shallIndexCacheForProxy() { if (profile() == null) { @@ -402,91 +395,8 @@ public class plasmaSwitchboardQueue { return "Dynamic_(Requested_With_Cookie)"; } - // -set-cookie in response - // the set-cookie from the server does not indicate that the content is special - // thus we do not care about it here for indexing - if (responseHeader() != null) { - // a picture cannot be indexed - if (plasmaHTCache.isPicture(responseHeader())) { - return "Media_Content_(Picture)"; - } - if (!plasmaHTCache.isText(responseHeader())) { - return "Media_Content_(not_text)"; - } - - // -if-modified-since in request - // if the page is fresh at the very moment we can index it - if ((ifModifiedSince != null) && (responseHeader().containsKey(httpHeader.LAST_MODIFIED))) { - // parse date - Date d = responseHeader().lastModified(); - if (d == null) { - d = new Date(serverDate.correctedUTCTime()); - } - // finally, we shall treat the cache as stale if the modification time is after the if-.. 
time - if (d.after(ifModifiedSince)) { - //System.out.println("***not indexed because if-modified-since"); - return "Stale_(Last-Modified>Modified-Since)"; - } - } - - // -pragma in cached response - if (responseHeader().containsKey(httpHeader.PRAGMA) && - ((String) responseHeader().get(httpHeader.PRAGMA)).toUpperCase().equals("NO-CACHE")) { - return "Denied_(pragma_no_cache)"; - } - - // see for documentation also: - // http://www.web-caching.com/cacheability.html - - // look for freshnes information - - // -expires in cached response - // the expires value gives us a very easy hint when the cache is stale - // sometimes, the expires date is set to the past to prevent that a page is cached - // we use that information to see if we should index it - final Date expires = responseHeader().expires(); - if (expires != null && expires.before(new Date(serverDate.correctedUTCTime()))) { - return "Stale_(Expired)"; - } - - // -lastModified in cached response - // this information is too weak to use it to prevent indexing - // even if we can apply a TTL heuristic for cache usage - - // -cache-control in cached response - // the cache-control has many value options. 
- String cacheControl = (String) responseHeader.get(httpHeader.CACHE_CONTROL); - if (cacheControl != null) { - cacheControl = cacheControl.trim().toUpperCase(); - /* we have the following cases for cache-control: - "public" -- can be indexed - "private", "no-cache", "no-store" -- cannot be indexed - "max-age=" -- stale/fresh dependent on date - */ - if (cacheControl.startsWith("PRIVATE") || - cacheControl.startsWith("NO-CACHE") || - cacheControl.startsWith("NO-STORE")) { - // easy case - return "Stale_(denied_by_cache-control=" + cacheControl + ")"; -// } else if (cacheControl.startsWith("PUBLIC")) { -// // ok, do nothing - } else if (cacheControl.startsWith("MAX-AGE=")) { - // we need also the load date - final Date date = responseHeader().date(); - if (date == null) { - return "Stale_(no_date_given_in_response)"; - } - try { - final long ttl = 1000 * Long.parseLong(cacheControl.substring(8)); // milliseconds to live - if (serverDate.correctedUTCTime() - date.getTime() > ttl) { - //System.out.println("***not indexed because cache-control"); - return "Stale_(expired_by_cache-control)"; - } - } catch (Exception e) { - return "Error_(" + e.getMessage() + ")"; - } - } - } + if (getCachedObjectInfo() != null) { + return this.getCachedObjectInfo().shallIndexCacheForProxy(); } return null; } @@ -496,6 +406,8 @@ public class plasmaSwitchboardQueue { * this method returns null if the answer is 'YES'! 
* if the answer is 'NO' (do not index), it returns a string with the reason * to reject the crawling demand in clear text + * + * This function is used by plasmaSwitchboard#processResourceStack */ public final String shallIndexCacheForCrawler() { if (profile() == null) { @@ -520,9 +432,9 @@ public class plasmaSwitchboardQueue { // we checked that in shallStoreCache // a picture cannot be indexed - if (responseHeader() != null) { - if (plasmaHTCache.isPicture(responseHeader())) { return "Media_Content_(Picture)"; } - if (!plasmaHTCache.isText(responseHeader())) { return "Media_Content_(not_text)"; } + if (getCachedObjectInfo() != null) { + String status = this.getCachedObjectInfo().shallIndexCacheForProxy(); + if (status != null) return status; } if (plasmaHTCache.noIndexingURL(nURL)) { return "Media_Content_(forbidden)"; }