From dae763d8e3df0e5281bbc7018577db8da06f6ea1 Mon Sep 17 00:00:00 2001
From: theli
Date: Wed, 6 Sep 2006 14:31:17 +0000
Subject: [PATCH] git-svn-id:
https://svn.berlios.de/svnroot/repos/yacy/trunk@2495
6c8d7289-2bf4-0310-a012-ef5d649a1542
---
htroot/CacheAdmin_p.java | 7 +-
htroot/ViewFile.html | 2 +
htroot/ViewFile.java | 309 ++++++------
source/de/anomic/http/httpdProxyHandler.java | 19 +-
source/de/anomic/icap/icapd.java | 8 +-
.../de/anomic/plasma/cache/IResourceInfo.java | 136 +++++
.../plasma/cache/ResourceInfoFactory.java | 86 ++++
.../plasma/cache/http/ResourceInfo.java | 467 ++++++++++++++++++
.../plasma/crawler/http/CrawlWorker.java | 14 +-
.../de/anomic/plasma/plasmaCrawlStacker.java | 2 +-
source/de/anomic/plasma/plasmaHTCache.java | 380 +++++---------
.../de/anomic/plasma/plasmaSnippetCache.java | 29 +-
.../de/anomic/plasma/plasmaSwitchboard.java | 9 +-
.../anomic/plasma/plasmaSwitchboardQueue.java | 152 ++----
14 files changed, 1057 insertions(+), 563 deletions(-)
create mode 100644 source/de/anomic/plasma/cache/IResourceInfo.java
create mode 100644 source/de/anomic/plasma/cache/ResourceInfoFactory.java
create mode 100644 source/de/anomic/plasma/cache/http/ResourceInfo.java
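Aside: the core of this commit is a protocol-neutral metadata abstraction; the new IResourceInfo interface plus a per-protocol factory replace direct httpHeader handling in the cache. A minimal stand-alone sketch of that shape, using simplified, invented stand-in types rather than the YaCy classes touched below:

// Editor's sketch only; ResourceMetadata/HttpResourceMetadata are invented stand-ins
// for IResourceInfo and de.anomic.plasma.cache.http.ResourceInfo.
import java.util.Date;
import java.util.Map;

interface ResourceMetadata {
    String getMimeType();              // protocol-neutral accessors ...
    Date getModificationDate();
    Map getMap();                      // ... plus the raw, protocol-specific header map
}

class HttpResourceMetadata implements ResourceMetadata {
    private final Map responseHeader;  // e.g. the map persisted in responseHeaderDB
    HttpResourceMetadata(Map responseHeader) { this.responseHeader = responseHeader; }
    public String getMimeType() {
        String ct = (String) this.responseHeader.get("Content-Type"); // assumed header key
        return (ct == null) ? null : ct.split(";")[0].trim().toLowerCase();
    }
    public Date getModificationDate() {
        return new Date();             // the real class parses Last-Modified/Date instead
    }
    public Map getMap() { return this.responseHeader; }
}

The actual IResourceInfo introduced below additionally carries the caching and indexing policy decisions (shallIndexCacheForProxy, shallUseCacheForProxy, and so on), not just accessors.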
diff --git a/htroot/CacheAdmin_p.java b/htroot/CacheAdmin_p.java
index 77bed6ae1..84d7f69d7 100644
--- a/htroot/CacheAdmin_p.java
+++ b/htroot/CacheAdmin_p.java
@@ -59,6 +59,7 @@ import de.anomic.index.indexURL;
import de.anomic.plasma.plasmaHTCache;
import de.anomic.plasma.plasmaParserDocument;
import de.anomic.plasma.plasmaSwitchboard;
+import de.anomic.plasma.cache.IResourceInfo;
import de.anomic.server.serverCore;
import de.anomic.server.serverFileUtils;
import de.anomic.server.serverObjects;
@@ -102,8 +103,8 @@ public class CacheAdmin_p {
info.ensureCapacity(40000);
try {
- final httpHeader fileheader = switchboard.cacheManager.getCachedResponse(indexURL.urlHash(url));
- info.append("HTTP Header:<br>").append(formatHeader(fileheader)).append("<br>");
+ final IResourceInfo resInfo = switchboard.cacheManager.loadResourceInfo(url);
+ info.append("HTTP Header:<br>").append(formatHeader(resInfo.getMap())).append("<br>");
final String ff = file.toString();
final int dotpos = ff.lastIndexOf('.');
final String ext = (dotpos >= 0) ? ff.substring(dotpos + 1).toLowerCase() : "";
@@ -198,7 +199,7 @@ public class CacheAdmin_p {
return new String(s);
}
- private static String formatHeader(httpHeader header) {
+ private static String formatHeader(Map header) {
final StringBuffer result = new StringBuffer(2048);
if (header == null) {
result.append("- no header in header cache -<br>");
diff --git a/htroot/ViewFile.html b/htroot/ViewFile.html
index 648192f1b..87830d891 100644
--- a/htroot/ViewFile.html
+++ b/htroot/ViewFile.html
@@ -56,6 +56,8 @@ Invalid URL
Unable to download resource content.
::
Unable to parse resource content.
+::
+Unsupported protocol.
#(/error)#
diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java
index 3ae7bff55..f1b7b06a5 100644
--- a/htroot/ViewFile.java
+++ b/htroot/ViewFile.java
@@ -4,23 +4,23 @@
//(C) by Michael Peter Christen; mc@anomic.de
//first published on http://www.anomic.de
//Frankfurt, Germany, 2004
-//
+
//last major change: 12.07.2004
-//
+
//This program is free software; you can redistribute it and/or modify
//it under the terms of the GNU General Public License as published by
//the Free Software Foundation; either version 2 of the License, or
//(at your option) any later version.
-//
+
//This program is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
//GNU General Public License for more details.
-//
+
//You should have received a copy of the GNU General Public License
//along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-//
+
//Using this software in any meaning (reading, learning, copying, compiling,
//running) means that you agree that the Author(s) is (are) not responsible
//for cost, loss of data or any harm that may be caused directly or indirectly
@@ -32,7 +32,7 @@
//(are) also not responsible for proper configuration and usage of the
//software, even if provoked by documentation provided together with
//the software.
-//
+
//Any changes to this file according to the GPL as documented in the file
//gpl.txt aside this file in the shipment you received can be done to the
//lines that follows this copyright notice here, but changes must not be
@@ -56,18 +56,19 @@ import de.anomic.http.httpc;
import de.anomic.plasma.plasmaHTCache;
import de.anomic.plasma.plasmaParserDocument;
import de.anomic.plasma.plasmaSwitchboard;
+import de.anomic.plasma.cache.IResourceInfo;
import de.anomic.plasma.plasmaCrawlLURL.Entry;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
public class ViewFile {
-
+
public static final int VIEW_MODE_NO_TEXT = 0;
public static final int VIEW_MODE_AS_PLAIN_TEXT = 1;
public static final int VIEW_MODE_AS_PARSED_TEXT = 2;
public static final int VIEW_MODE_AS_PARSED_SENTENCES = 3;
public static final int VIEW_MODE_AS_IFRAME = 4;
-
+
public static final String[] highlightingColors = new String[] {
"255,255,100",
"255,155,155",
@@ -78,12 +79,12 @@ public class ViewFile {
};
public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) {
-
+
serverObjects prop = new serverObjects();
plasmaSwitchboard sb = (plasmaSwitchboard)env;
-
-
+
+
if (post.containsKey("words"))
try {
prop.put("error_words",URLEncoder.encode((String) post.get("words"), "UTF-8"));
@@ -91,148 +92,168 @@ public class ViewFile {
// TODO Auto-generated catch block
e1.printStackTrace();
}
-
- if (post != null) {
- // getting the url hash from which the content should be loaded
- String urlHash = post.get("urlHash","");
- if (urlHash.equals("")) {
- prop.put("error",1);
- prop.put("viewMode",VIEW_MODE_NO_TEXT);
- return prop;
- }
-
- String viewMode = post.get("viewMode","sentences");
-
- // getting the urlEntry that belongs to the url hash
- Entry urlEntry = null;
- try {
- urlEntry = sb.urlPool.loadedURL.getEntry(urlHash, null);
- } catch (IOException e) {
- prop.put("error",2);
- prop.put("viewMode",VIEW_MODE_NO_TEXT);
- return prop;
- }
-
- // gettin the url that belongs to the entry
- URL url = urlEntry.url();
- if (url == null) {
- prop.put("error",3);
- prop.put("viewMode",VIEW_MODE_NO_TEXT);
- return prop;
- }
-
- // loading the resource content as byte array
- byte[] resource = null;
- httpHeader resHeader = null;
- String resMime = null;
- try {
- resource = sb.cacheManager.loadResource(url);
- if (resource == null) {
- plasmaHTCache.Entry entry = sb.snippetCache.loadResourceFromWeb(url, 5000);
-
- if (entry != null) {
- resHeader = entry.responseHeader();
- }
-
- resource = sb.cacheManager.loadResource(url);
- if (resource == null) {
- prop.put("error",4);
- prop.put("viewMode",VIEW_MODE_NO_TEXT);
- return prop;
- }
+
+ if (post != null) {
+ // getting the url hash from which the content should be loaded
+ String urlHash = post.get("urlHash","");
+ if (urlHash.equals("")) {
+ prop.put("error",1);
+ prop.put("viewMode",VIEW_MODE_NO_TEXT);
+ return prop;
}
- if (resHeader == null) {
- resHeader = sb.cacheManager.getCachedResponse(urlEntry.hash());
- if (resHeader == null) {
- resHeader = httpc.whead(url,url.getHost(),5000,null,null,sb.remoteProxyConfig);
+
+ String viewMode = post.get("viewMode","sentences");
+
+ // getting the urlEntry that belongs to the url hash
+ Entry urlEntry = null;
+ try {
+ urlEntry = sb.urlPool.loadedURL.getEntry(urlHash, null);
+ } catch (IOException e) {
+ prop.put("error",2);
+ prop.put("viewMode",VIEW_MODE_NO_TEXT);
+ return prop;
+ }
+
+ // getting the url that belongs to the entry
+ URL url = urlEntry.url();
+ if (url == null) {
+ prop.put("error",3);
+ prop.put("viewMode",VIEW_MODE_NO_TEXT);
+ return prop;
+ }
+
+ // loading the resource content as byte array
+ byte[] resource = null;
+ IResourceInfo resInfo = null;
+ String resMime = null;
+ try {
+ // trying to load the resource body
+ resource = sb.cacheManager.loadResourceContent(url);
+
+ // if the resource body was not cached we try to load it from web
+ if (resource == null) {
+ plasmaHTCache.Entry entry = sb.snippetCache.loadResourceFromWeb(url, 5000);
+
+ if (entry != null) {
+ resInfo = entry.getDocumentInfo();
+ resource = sb.cacheManager.loadResourceContent(url);
+ }
+
if (resource == null) {
prop.put("error",4);
prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop;
}
- resMime = resHeader.mime();
}
- }
- } catch (IOException e) {
- if (url == null) {
- prop.put("error",4);
- prop.put("viewMode",VIEW_MODE_NO_TEXT);
- return prop;
- }
- }
- if (viewMode.equals("plain")) {
- String content = new String(resource);
- content = content.replaceAll("<","&lt;")
- .replaceAll(">","&gt;")
- .replaceAll("\"","&quot;")
- .replaceAll("\n","<br>")
- .replaceAll("\t","&nbsp;&nbsp;&nbsp;&nbsp;");
-
- prop.put("error",0);
- prop.put("viewMode",VIEW_MODE_AS_PLAIN_TEXT);
- prop.put("viewMode_plainText",content);
- } else if (viewMode.equals("parsed") || viewMode.equals("sentences") || viewMode.equals("iframe")) {
- // parsing the resource content
- plasmaParserDocument document = sb.snippetCache.parseDocument(url, resource,resHeader);
- if (document == null) {
- prop.put("error",5);
- prop.put("viewMode",VIEW_MODE_NO_TEXT);
- return prop;
- }
- resMime = document.getMimeType();
-
- if (viewMode.equals("parsed")) {
- String content = new String(document.getText());
- content = wikiCode.replaceHTML(content); //added by Marc Nause
- content = content.replaceAll("\n","<br>")
- .replaceAll("\t","&nbsp;&nbsp;&nbsp;&nbsp;");
-
- prop.put("viewMode",VIEW_MODE_AS_PARSED_TEXT);
- prop.put("viewMode_parsedText",content);
- } else if (viewMode.equals("iframe")) {
- prop.put("viewMode",VIEW_MODE_AS_IFRAME);
- prop.put("viewMode_url",url.toString());
- } else {
- prop.put("viewMode",VIEW_MODE_AS_PARSED_SENTENCES);
- String[] sentences = document.getSentences();
-
- boolean dark = true;
- for (int i=0; i < sentences.length; i++) {
- String currentSentence = wikiCode.replaceHTML(sentences[i]);
-
- // Search word highlighting
- String words = post.get("words",null);
- if (words != null) {
- try {
- words = URLDecoder.decode(words,"UTF-8");
- } catch (UnsupportedEncodingException e) {}
-
- String[] wordArray = words.substring(1,words.length()-1).split(",");
- for (int j=0; j < wordArray.length; j++) {
- String currentWord = wordArray[j].trim();
- currentSentence = currentSentence.replaceAll(currentWord,
- "<b style=\"color: black; background-color: rgb(" + highlightingColors[j % highlightingColors.length] + ");\">" + currentWord + "</b>");
+
+ // try to load resource metadata
+ if (resInfo == null) {
+
+ // try to load the metadata from cache
+ try {
+ resInfo = sb.cacheManager.loadResourceInfo(urlEntry.url());
+ } catch (Exception e) { /* ignore this */}
+
+ // if the metadata was not cached try to load it from the web
+ if (resInfo == null) {
+ String protocol = url.getProtocol();
+ if (!((protocol.equals("http") || protocol.equals("https")))) {
+ prop.put("error",6);
+ prop.put("viewMode",VIEW_MODE_NO_TEXT);
+ return prop;
}
+
+ httpHeader responseHeader = httpc.whead(url,url.getHost(),5000,null,null,sb.remoteProxyConfig);
+ if (responseHeader == null) {
+ prop.put("error",4);
+ prop.put("viewMode",VIEW_MODE_NO_TEXT);
+ return prop;
+ }
+ resMime = responseHeader.mime();
}
-
- prop.put("viewMode_sentences_" + i + "_nr",Integer.toString(i+1));
- prop.put("viewMode_sentences_" + i + "_text",currentSentence);
- prop.put("viewMode_sentences_" + i + "_dark",((dark) ? 1 : 0) ); dark=!dark;
+ } else {
+ resMime = resInfo.getMimeType();
}
- prop.put("viewMode_sentences",sentences.length);
-
- }
- }
- prop.put("error",0);
- prop.put("error_url",url.toString());
- prop.put("error_hash",urlHash);
- prop.put("error_wordCount",Integer.toString(urlEntry.wordCount()));
- prop.put("error_desc",urlEntry.descr());
- prop.put("error_size",urlEntry.size());
- prop.put("error_mimeType",resMime);
- }
-
- return prop;
+ } catch (IOException e) {
+ if (url == null) {
+ prop.put("error",4);
+ prop.put("viewMode",VIEW_MODE_NO_TEXT);
+ return prop;
+ }
+ }
+ if (viewMode.equals("plain")) {
+ String content = new String(resource);
+ content = content.replaceAll("<","&lt;")
+ .replaceAll(">","&gt;")
+ .replaceAll("\"","&quot;")
+ .replaceAll("\n","<br>")
+ .replaceAll("\t","&nbsp;&nbsp;&nbsp;&nbsp;");
+
+ prop.put("error",0);
+ prop.put("viewMode",VIEW_MODE_AS_PLAIN_TEXT);
+ prop.put("viewMode_plainText",content);
+ } else if (viewMode.equals("parsed") || viewMode.equals("sentences") || viewMode.equals("iframe")) {
+ // parsing the resource content
+ plasmaParserDocument document = sb.snippetCache.parseDocument(url, resource,resInfo);
+ if (document == null) {
+ prop.put("error",5);
+ prop.put("viewMode",VIEW_MODE_NO_TEXT);
+ return prop;
+ }
+ resMime = document.getMimeType();
+
+ if (viewMode.equals("parsed")) {
+ String content = new String(document.getText());
+ content = wikiCode.replaceHTML(content); //added by Marc Nause
+ content = content.replaceAll("\n","<br>")
+ .replaceAll("\t","&nbsp;&nbsp;&nbsp;&nbsp;");
+
+ prop.put("viewMode",VIEW_MODE_AS_PARSED_TEXT);
+ prop.put("viewMode_parsedText",content);
+ } else if (viewMode.equals("iframe")) {
+ prop.put("viewMode",VIEW_MODE_AS_IFRAME);
+ prop.put("viewMode_url",url.toString());
+ } else {
+ prop.put("viewMode",VIEW_MODE_AS_PARSED_SENTENCES);
+ String[] sentences = document.getSentences();
+
+ boolean dark = true;
+ for (int i=0; i < sentences.length; i++) {
+ String currentSentence = wikiCode.replaceHTML(sentences[i]);
+
+ // Search word highlighting
+ String words = post.get("words",null);
+ if (words != null) {
+ try {
+ words = URLDecoder.decode(words,"UTF-8");
+ } catch (UnsupportedEncodingException e) {}
+
+ String[] wordArray = words.substring(1,words.length()-1).split(",");
+ for (int j=0; j < wordArray.length; j++) {
+ String currentWord = wordArray[j].trim();
+ currentSentence = currentSentence.replaceAll(currentWord,
+ "<b style=\"color: black; background-color: rgb(" + highlightingColors[j % highlightingColors.length] + ");\">" + currentWord + "</b>");
+ }
+ }
+
+ prop.put("viewMode_sentences_" + i + "_nr",Integer.toString(i+1));
+ prop.put("viewMode_sentences_" + i + "_text",currentSentence);
+ prop.put("viewMode_sentences_" + i + "_dark",((dark) ? 1 : 0) ); dark=!dark;
+ }
+ prop.put("viewMode_sentences",sentences.length);
+
+ }
+ }
+ prop.put("error",0);
+ prop.put("error_url",url.toString());
+ prop.put("error_hash",urlHash);
+ prop.put("error_wordCount",Integer.toString(urlEntry.wordCount()));
+ prop.put("error_desc",urlEntry.descr());
+ prop.put("error_size",urlEntry.size());
+ prop.put("error_mimeType",resMime);
+ }
+
+ return prop;
}
-
+
}
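Aside: the rewritten ViewFile.java resolves a resource in four steps (cached body, web fetch, cached metadata, HTTP HEAD as a last resort), and its new error value 6 matches the "Unsupported protocol." case added to ViewFile.html above. A compilable sketch of that control flow; every type and helper below is a hypothetical stand-in, not a YaCy class:

import java.net.URL;

class ViewFileFlowSketch {
    interface Cache {
        byte[] loadResourceContent(URL url) throws Exception;              // cached body, or null
        Object loadResourceInfo(URL url) throws Exception;                 // cached metadata, or null
        Object loadResourceFromWeb(URL url, int timeout) throws Exception; // fetch, fills the cache
        Object headRequest(URL url, int timeout) throws Exception;         // HEAD fallback
    }

    /** Mirrors the #(error)# indexes of ViewFile.html: 0 = ok, 4 = not loadable, 6 = unsupported protocol. */
    static int resolve(Cache cache, URL url) throws Exception {
        byte[] body = cache.loadResourceContent(url);            // 1. try the cached body
        Object meta = null;
        if (body == null) {
            meta = cache.loadResourceFromWeb(url, 5000);         // 2. fetch from the web
            body = cache.loadResourceContent(url);
            if (body == null) return 4;
        }
        if (meta == null) meta = cache.loadResourceInfo(url);    // 3. cached metadata
        if (meta == null) {                                      // 4. HEAD request, http(s) only
            String protocol = url.getProtocol();
            if (!protocol.equals("http") && !protocol.equals("https")) return 6;
            meta = cache.headRequest(url, 5000);
            if (meta == null) return 4;
        }
        return 0;
    }
}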
diff --git a/source/de/anomic/http/httpdProxyHandler.java b/source/de/anomic/http/httpdProxyHandler.java
index d6315ed1a..bf703b1f6 100644
--- a/source/de/anomic/http/httpdProxyHandler.java
+++ b/source/de/anomic/http/httpdProxyHandler.java
@@ -96,6 +96,8 @@ import de.anomic.index.indexURL;
import de.anomic.plasma.plasmaHTCache;
import de.anomic.plasma.plasmaParser;
import de.anomic.plasma.plasmaSwitchboard;
+import de.anomic.plasma.cache.IResourceInfo;
+import de.anomic.plasma.cache.http.ResourceInfo;
import de.anomic.plasma.urlPattern.plasmaURLPattern;
import de.anomic.server.serverCore;
import de.anomic.server.serverFileUtils;
@@ -413,8 +415,8 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt
// decide wether to use a cache entry or connect to the network
File cacheFile = cacheManager.getCachePath(url);
- String urlHash = indexURL.urlHash(url);
- httpHeader cachedResponseHeader = cacheManager.getCachedResponse(urlHash);
+ ResourceInfo cachedResInfo = (ResourceInfo) cacheManager.loadResourceInfo(url);
+ httpHeader cachedResponseHeader = (cachedResInfo == null)?null:cachedResInfo.getResponseHeader();
boolean cacheExists = ((cacheFile.isFile()) && (cachedResponseHeader != null));
// why are files unzipped upon arrival? why not zip all files in cache?
@@ -445,9 +447,10 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt
0, // crawling depth
url, // url
"", // name of the url is unknown
- requestHeader, // request headers
+ //requestHeader, // request headers
"200 OK", // request status
- cachedResponseHeader, // response headers
+ //cachedResponseHeader, // response headers
+ cachedResInfo,
null, // initiator
switchboard.defaultProxyProfile // profile
);
@@ -579,15 +582,17 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt
}
// reserver cache entry
- Date requestDate = new Date(((Long)conProp.get(httpHeader.CONNECTION_PROP_REQUEST_START)).longValue());
+ Date requestDate = new Date(((Long)conProp.get(httpHeader.CONNECTION_PROP_REQUEST_START)).longValue());
+ IResourceInfo resInfo = new ResourceInfo(url,requestHeader,res.responseHeader);
plasmaHTCache.Entry cacheEntry = cacheManager.newEntry(
requestDate,
0,
url,
"",
- requestHeader,
+ //requestHeader,
res.status,
- res.responseHeader,
+ //res.responseHeader,
+ resInfo,
null,
switchboard.defaultProxyProfile
);
diff --git a/source/de/anomic/icap/icapd.java b/source/de/anomic/icap/icapd.java
index de4ea546f..648d0fd2b 100644
--- a/source/de/anomic/icap/icapd.java
+++ b/source/de/anomic/icap/icapd.java
@@ -64,6 +64,8 @@ import de.anomic.http.httpc;
import de.anomic.plasma.plasmaHTCache;
import de.anomic.plasma.plasmaParser;
import de.anomic.plasma.plasmaSwitchboard;
+import de.anomic.plasma.cache.IResourceInfo;
+import de.anomic.plasma.cache.http.ResourceInfo;
import de.anomic.server.serverCore;
import de.anomic.server.serverFileUtils;
import de.anomic.server.serverHandler;
@@ -385,14 +387,14 @@ public class icapd implements serverHandler {
* ========================================================================= */
// generating a htcache entry object
+ IResourceInfo resInfo = new ResourceInfo(httpRequestURL,httpReqHeader,httpResHeader);
plasmaHTCache.Entry cacheEntry = cacheManager.newEntry(
new Date(),
0,
httpRequestURL,
"",
- httpReqHeader,
- httpRespStatusLine,
- httpResHeader,
+ httpRespStatusLine,
+ resInfo,
null,
switchboard.defaultProxyProfile
);
diff --git a/source/de/anomic/plasma/cache/IResourceInfo.java b/source/de/anomic/plasma/cache/IResourceInfo.java
new file mode 100644
index 000000000..72c344933
--- /dev/null
+++ b/source/de/anomic/plasma/cache/IResourceInfo.java
@@ -0,0 +1,136 @@
+// IResourceInfo.java
+// -------------------------------------
+// part of YACY
+// (C) by Michael Peter Christen; mc@anomic.de
+// first published on http://www.anomic.de
+// Frankfurt, Germany, 2006
+//
+// This file is contributed by Martin Thelian
+//
+// $LastChangedDate: 2006-02-20 23:57:42 +0100 (Mo, 20 Feb 2006) $
+// $LastChangedRevision: 1715 $
+// $LastChangedBy: theli $
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+//
+// Using this software in any meaning (reading, learning, copying, compiling,
+// running) means that you agree that the Author(s) is (are) not responsible
+// for cost, loss of data or any harm that may be caused directly or indirectly
+// by usage of this software or this documentation. The usage of this software
+// is on your own risk. The installation and usage (starting/running) of this
+// software may allow other people or application to access your computer and
+// any attached devices and is highly dependent on the configuration of the
+// software which must be done by the user of the software; the author(s) is
+// (are) also not responsible for proper configuration and usage of the
+// software, even if provoked by documentation provided together with
+// the software.
+//
+// Any changes to this file according to the GPL as documented in the file
+// gpl.txt aside this file in the shipment you received can be done to the
+// lines that follows this copyright notice here, but changes must not be
+// done inside the copyright notice above. A re-distribution must contain
+// the intact and unchanged copyright notice.
+// Contributions and changes to the program code must be marked as such.
+
+
+
+package de.anomic.plasma.cache;
+
+import java.util.Date;
+import java.util.Map;
+
+import de.anomic.net.URL;
+
+public interface IResourceInfo {
+
+ /**
+ * Return the resource information as map
+ * @return
+ */
+ public Map getMap();
+
+ /**
+ * Returns the URL of this content
+ * @return
+ */
+ public URL getUrl();
+
+ /**
+ * Returns the referer URL of this URL
+ * @return referer URL
+ */
+ public URL getRefererUrl();
+
+ /**
+ * Returns the mimetype of the cached object
+ * @return mimetype
+ */
+ public String getMimeType();
+
+ /**
+ * Returns the modification date of the cached object
+ * @return the modification date
+ */
+ public Date getModificationDate();
+
+ /**
+ * Returns the url hash of the content URL
+ * @return
+ */
+ public String getUrlHash();
+
+ /**
+ * Specifies if the resource was requested with an
+ * if-modified-since date
+ * @return
+ */
+ public Date ifModifiedSince();
+
+ /**
+ * Specifies if the resource was requested with
+ * client specific information (e.g. cookies for http)
+ * @return
+ */
+ public boolean requestWithCookie();
+
+ /**
+ * Specifies if the request prohibits indexing
+ * @return
+ */
+ public boolean requestProhibitsIndexing();
+
+ /**
+ * Determines if a resource that was downloaded by the crawler
+ * is allowed to be indexed.
+ *
+ * @return an error string describing the reason why the
+ * resource should not be indexed or null if indexing is allowed
+ */
+ public String shallIndexCacheForCrawler();
+
+ /**
+ * Determines if a resource that was downloaded by the proxy
+ * is allowed to be indexed.
+ *
+ * @return an error string describing the reason why the
+ * resource should not be indexed or null if indexing is allowed
+ */
+ public String shallIndexCacheForProxy();
+
+ public String shallStoreCacheForProxy();
+ public boolean shallUseCacheForProxy();
+
+ public boolean validResponseStatus(String responseStatus);
+}
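Aside: a sketch of how a caller is expected to consult the interface above for the indexing decision. The wrapper class and method name are invented; the IResourceInfo methods are exactly the ones declared in this file:

import de.anomic.plasma.cache.IResourceInfo;

class CachePolicySketch {
    /** Returns null if indexing is allowed, otherwise a denial reason (same contract as the interface). */
    static String indexingDenialReason(IResourceInfo info, boolean fetchedByProxy) {
        if (info.requestProhibitsIndexing()) return "request_prohibits_indexing";
        return fetchedByProxy ? info.shallIndexCacheForProxy()
                              : info.shallIndexCacheForCrawler();
    }
}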
diff --git a/source/de/anomic/plasma/cache/ResourceInfoFactory.java b/source/de/anomic/plasma/cache/ResourceInfoFactory.java
new file mode 100644
index 000000000..75c0a2d07
--- /dev/null
+++ b/source/de/anomic/plasma/cache/ResourceInfoFactory.java
@@ -0,0 +1,86 @@
+// ResourceInfoFactory.java
+// -------------------------------------
+// part of YACY
+// (C) by Michael Peter Christen; mc@anomic.de
+// first published on http://www.anomic.de
+// Frankfurt, Germany, 2006
+//
+// This file is contributed by Martin Thelian
+//
+// $LastChangedDate: 2006-02-20 23:57:42 +0100 (Mo, 20 Feb 2006) $
+// $LastChangedRevision: 1715 $
+// $LastChangedBy: theli $
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+//
+// Using this software in any meaning (reading, learning, copying, compiling,
+// running) means that you agree that the Author(s) is (are) not responsible
+// for cost, loss of data or any harm that may be caused directly or indirectly
+// by usage of this software or this documentation. The usage of this software
+// is on your own risk. The installation and usage (starting/running) of this
+// software may allow other people or application to access your computer and
+// any attached devices and is highly dependent on the configuration of the
+// software which must be done by the user of the software; the author(s) is
+// (are) also not responsible for proper configuration and usage of the
+// software, even if provoked by documentation provided together with
+// the software.
+//
+// Any changes to this file according to the GPL as documented in the file
+// gpl.txt aside this file in the shipment you received can be done to the
+// lines that follows this copyright notice here, but changes must not be
+// done inside the copyright notice above. A re-distribution must contain
+// the intact and unchanged copyright notice.
+// Contributions and changes to the program code must be marked as such.
+
+
+
+package de.anomic.plasma.cache;
+
+import java.lang.reflect.Constructor;
+import java.util.Map;
+
+import de.anomic.net.URL;
+
+public class ResourceInfoFactory {
+ public IResourceInfo buildResourceInfoObj(
+ URL resourceURL,
+ Map resourceMetadata
+ ) throws Exception {
+
+ String protocString = resourceURL.getProtocol();
+
+ // the full qualified class name
+ String className = this.getClass().getPackage().getName() + "." + protocString + ".ResourceInfo";
+
+ // loading class by name
+ Class moduleClass = Class.forName(className);
+
+ // getting the constructor
+ Constructor classConstructor = moduleClass.getConstructor( new Class[] {
+ URL.class,
+ Map.class
+ } );
+
+ // instantiating class
+ IResourceInfo infoObject = (IResourceInfo) classConstructor.newInstance(new Object[] {
+ resourceURL,
+ resourceMetadata
+ });
+
+ // return the newly created object
+ return infoObject;
+
+ }
+}
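Aside: the factory resolves the implementation class by protocol name via reflection, so a URL with protocol "http" is served by de.anomic.plasma.cache.http.ResourceInfo (added below), and further protocols only need a matching sub-package. A usage sketch; the wrapper class, the example URL and the header contents are invented, and it is assumed that de.anomic.net.URL can be built from a URL string (the patch itself does this for the Referer header):

import java.util.HashMap;
import java.util.Map;

import de.anomic.net.URL;
import de.anomic.plasma.cache.IResourceInfo;
import de.anomic.plasma.cache.ResourceInfoFactory;

class FactoryUsageSketch {
    static IResourceInfo example() throws Exception {
        Map headers = new HashMap();
        headers.put("Content-Type", "text/html; charset=UTF-8");   // assumed header key/value
        URL url = new URL("http://www.anomic.de/index.html");
        // protocol "http" -> class de.anomic.plasma.cache.http.ResourceInfo
        return new ResourceInfoFactory().buildResourceInfoObj(url, headers);
    }
}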
diff --git a/source/de/anomic/plasma/cache/http/ResourceInfo.java b/source/de/anomic/plasma/cache/http/ResourceInfo.java
new file mode 100644
index 000000000..3b1c2d4b3
--- /dev/null
+++ b/source/de/anomic/plasma/cache/http/ResourceInfo.java
@@ -0,0 +1,467 @@
+// ResourceInfo.java
+// -------------------------------------
+// part of YACY
+// (C) by Michael Peter Christen; mc@anomic.de
+// first published on http://www.anomic.de
+// Frankfurt, Germany, 2006
+//
+// This file is contributed by Martin Thelian
+//
+// $LastChangedDate: 2006-02-20 23:57:42 +0100 (Mo, 20 Feb 2006) $
+// $LastChangedRevision: 1715 $
+// $LastChangedBy: theli $
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+//
+// Using this software in any meaning (reading, learning, copying, compiling,
+// running) means that you agree that the Author(s) is (are) not responsible
+// for cost, loss of data or any harm that may be caused directly or indirectly
+// by usage of this software or this documentation. The usage of this software
+// is on your own risk. The installation and usage (starting/running) of this
+// software may allow other people or application to access your computer and
+// any attached devices and is highly dependent on the configuration of the
+// software which must be done by the user of the software; the author(s) is
+// (are) also not responsible for proper configuration and usage of the
+// software, even if provoked by documentation provided together with
+// the software.
+//
+// Any changes to this file according to the GPL as documented in the file
+// gpl.txt aside this file in the shipment you received can be done to the
+// lines that follows this copyright notice here, but changes must not be
+// done inside the copyright notice above. A re-distribution must contain
+// the intact and unchanged copyright notice.
+// Contributions and changes to the program code must be marked as such.
+
+
+
+package de.anomic.plasma.cache.http;
+
+import java.util.Date;
+import java.util.Map;
+
+import de.anomic.http.httpHeader;
+import de.anomic.index.indexURL;
+import de.anomic.net.URL;
+import de.anomic.plasma.plasmaHTCache;
+import de.anomic.plasma.cache.ResourceInfoFactory;
+import de.anomic.plasma.cache.IResourceInfo;
+import de.anomic.server.serverDate;
+
+public class ResourceInfo implements IResourceInfo {
+ private URL url;
+ private String urlHash;
+ private httpHeader responseHeader;
+ private httpHeader requestHeader;
+
+ /**
+ * Constructor used by the {@link ResourceInfoFactory}
+ * @param objectURL
+ * @param objectInfo
+ */
+ public ResourceInfo(URL objectURL, Map objectInfo) {
+ if (objectURL == null) throw new NullPointerException();
+ if (objectInfo == null) throw new NullPointerException();
+
+ // generating the url hash
+ this.url = objectURL;
+ this.urlHash = indexURL.urlHash(this.url.toNormalform());
+
+ // create the http header object
+ this.responseHeader = new httpHeader(null, objectInfo);
+ }
+
+ public ResourceInfo(URL objectURL, httpHeader requestHeaders, httpHeader responseHeaders) {
+ if (objectURL == null) throw new NullPointerException();
+ if (responseHeaders == null) throw new NullPointerException();
+
+ // generating the url hash
+ this.url = objectURL;
+ this.urlHash = indexURL.urlHash(this.url.toNormalform());
+
+ this.requestHeader = requestHeaders;
+ this.responseHeader = responseHeaders;
+ }
+
+ public Map getMap() {
+ return this.responseHeader;
+ }
+
+ /**
+ * @see de.anomic.plasma.cache.IResourceInfo#getMimeType()
+ */
+ public String getMimeType() {
+ if (this.responseHeader == null) return null;
+
+ String mimeType = this.responseHeader.mime();
+ mimeType = mimeType.trim().toLowerCase();
+
+ int pos = mimeType.indexOf(';');
+ return ((pos < 0) ? mimeType : mimeType.substring(0, pos));
+ }
+
+ /**
+ * @see de.anomic.plasma.cache.IResourceInfo#getModificationDate()
+ */
+ public Date getModificationDate() {
+ Date docDate = null;
+
+ if (this.responseHeader != null) {
+ docDate = this.responseHeader.lastModified();
+ if (docDate == null) docDate = this.responseHeader.date();
+ }
+ if (docDate == null) docDate = new Date(serverDate.correctedUTCTime());
+
+ return docDate;
+ }
+
+ public URL getRefererUrl() {
+ if (this.requestHeader == null) return null;
+ try {
+ return new URL((String) this.requestHeader.get(httpHeader.REFERER, ""));
+ } catch (Exception e) {
+ return null;
+ }
+ }
+
+ /**
+ * @see de.anomic.plasma.cache.IResourceInfo#getUrl()
+ */
+ public URL getUrl() {
+ return this.url;
+ }
+
+ /**
+ * @see de.anomic.plasma.cache.IResourceInfo#getUrlHash()
+ */
+ public String getUrlHash() {
+ return this.urlHash;
+ }
+
+ /**
+ * @see de.anomic.plasma.cache.IResourceInfo#shallIndexCacheForCrawler()
+ */
+ public String shallIndexCacheForCrawler() {
+ String mimeType = this.getMimeType();
+ if (plasmaHTCache.isPicture(mimeType)) { return "Media_Content_(Picture)"; }
+ if (!plasmaHTCache.isText(mimeType)) { return "Media_Content_(not_text)"; }
+ return null;
+ }
+
+ /**
+ * @see de.anomic.plasma.cache.IResourceInfo#shallIndexCacheForProxy()
+ */
+ public String shallIndexCacheForProxy() {
+ // -set-cookie in response
+ // the set-cookie from the server does not indicate that the content is special
+ // thus we do not care about it here for indexing
+
+ // a picture cannot be indexed
+ String mimeType = this.getMimeType();
+ if (plasmaHTCache.isPicture(mimeType)) {
+ return "Media_Content_(Picture)";
+ }
+ if (!plasmaHTCache.isText(mimeType)) {
+ return "Media_Content_(not_text)";
+ }
+
+ // -if-modified-since in request
+ // if the page is fresh at the very moment we can index it
+ Date ifModifiedSince = getModificationDate();
+ if ((ifModifiedSince != null) && (this.responseHeader.containsKey(httpHeader.LAST_MODIFIED))) {
+ // parse date
+ Date d = this.responseHeader.lastModified();
+ if (d == null) {
+ d = new Date(serverDate.correctedUTCTime());
+ }
+ // finally, we shall treat the cache as stale if the modification time is after the if-.. time
+ if (d.after(ifModifiedSince)) {
+ //System.out.println("***not indexed because if-modified-since");
+ return "Stale_(Last-Modified>Modified-Since)";
+ }
+ }
+
+ // -pragma in cached response
+ if (this.responseHeader.containsKey(httpHeader.PRAGMA) &&
+ ((String) this.responseHeader.get(httpHeader.PRAGMA)).toUpperCase().equals("NO-CACHE")) {
+ return "Denied_(pragma_no_cache)";
+ }
+
+ // see for documentation also:
+ // http://www.web-caching.com/cacheability.html
+
+ // look for freshness information
+
+ // -expires in cached response
+ // the expires value gives us a very easy hint when the cache is stale
+ // sometimes, the expires date is set to the past to prevent that a page is cached
+ // we use that information to see if we should index it
+ final Date expires = this.responseHeader.expires();
+ if (expires != null && expires.before(new Date(serverDate.correctedUTCTime()))) {
+ return "Stale_(Expired)";
+ }
+
+ // -lastModified in cached response
+ // this information is too weak to use it to prevent indexing
+ // even if we can apply a TTL heuristic for cache usage
+
+ // -cache-control in cached response
+ // the cache-control has many value options.
+ String cacheControl = (String) this.responseHeader.get(httpHeader.CACHE_CONTROL);
+ if (cacheControl != null) {
+ cacheControl = cacheControl.trim().toUpperCase();
+ /* we have the following cases for cache-control:
+ "public" -- can be indexed
+ "private", "no-cache", "no-store" -- cannot be indexed
+ "max-age=" -- stale/fresh dependent on date
+ */
+ if (cacheControl.startsWith("PRIVATE") ||
+ cacheControl.startsWith("NO-CACHE") ||
+ cacheControl.startsWith("NO-STORE")) {
+ // easy case
+ return "Stale_(denied_by_cache-control=" + cacheControl + ")";
+// } else if (cacheControl.startsWith("PUBLIC")) {
+// // ok, do nothing
+ } else if (cacheControl.startsWith("MAX-AGE=")) {
+ // we need also the load date
+ final Date date = this.responseHeader.date();
+ if (date == null) {
+ return "Stale_(no_date_given_in_response)";
+ }
+ try {
+ final long ttl = 1000 * Long.parseLong(cacheControl.substring(8)); // milliseconds to live
+ if (serverDate.correctedUTCTime() - date.getTime() > ttl) {
+ //System.out.println("***not indexed because cache-control");
+ return "Stale_(expired_by_cache-control)";
+ }
+ } catch (Exception e) {
+ return "Error_(" + e.getMessage() + ")";
+ }
+ }
+ }
+ return null;
+ }
+
+ public String shallStoreCacheForProxy() {
+ if (this.requestHeader != null) {
+ // -authorization cases in request
+ // authorization makes pages very individual, and therefore we cannot use the
+ // content in the cache
+ if (this.requestHeader.containsKey(httpHeader.AUTHORIZATION)) { return "personalized"; }
+ // -ranges in request and response
+ // we do not cache partial content
+ if (this.requestHeader.containsKey(httpHeader.RANGE)) { return "partial"; }
+ }
+
+ if (this.responseHeader != null) {
+ // -ranges in request and response
+ // we do not cache partial content
+ if (this.responseHeader.containsKey(httpHeader.CONTENT_RANGE)) { return "partial"; }
+
+ // -if-modified-since in request
+ // we do not care about if-modified-since, because this case only occurs if the
+ // cache file does not exist, and we need as much info as possible for the indexing
+
+ // -cookies in request
+ // we do not care about cookies, because that would prevent loading more pages
+ // from one domain once a request resulted in a client-side stored cookie
+
+ // -set-cookie in response
+ // we do not care about cookies in responses, because that info comes along
+ // any/many pages from a server and does not express the validity of the page
+ // in modes of life-time/expiration or individuality
+
+ // -pragma in response
+ // if we have a pragma non-cache, we don't cache. usually if this is wanted from
+ // the server, it makes sense
+ String cacheControl = (String) this.responseHeader.get(httpHeader.PRAGMA);
+ if (cacheControl != null && cacheControl.trim().toUpperCase().equals("NO-CACHE")) { return "controlled_no_cache"; }
+
+ // -expires in response
+ // we do not care about expires, because at the time this is called the data is
+ // obvious valid and that header info is used in the indexing later on
+
+ // -cache-control in response
+ // the cache-control has many value options.
+ cacheControl = (String) this.responseHeader.get(httpHeader.CACHE_CONTROL);
+ if (cacheControl != null) {
+ cacheControl = cacheControl.trim().toUpperCase();
+ if (cacheControl.startsWith("MAX-AGE=")) {
+ // we need also the load date
+ Date date = this.responseHeader.date();
+ if (date == null) return "stale_no_date_given_in_response";
+ try {
+ long ttl = 1000 * Long.parseLong(cacheControl.substring(8)); // milliseconds to live
+ if (serverDate.correctedUTCTime() - date.getTime() > ttl) {
+ //System.out.println("***not indexed because cache-control");
+ return "stale_expired";
+ }
+ } catch (Exception e) {
+ return "stale_error_(" + e.getMessage() + ")";
+ }
+ }
+ }
+ }
+ return null;
+ }
+
+ public boolean shallUseCacheForProxy() {
+
+ String cacheControl;
+ if (this.requestHeader != null) {
+ // -authorization cases in request
+ if (this.requestHeader.containsKey(httpHeader.AUTHORIZATION)) { return false; }
+
+ // -ranges in request
+ // we do not cache partial content
+ if (this.requestHeader.containsKey(httpHeader.RANGE)) { return false; }
+
+ // if the client requests an un-cached copy of the resource ...
+ cacheControl = (String) this.requestHeader.get(httpHeader.PRAGMA);
+ if (cacheControl != null && cacheControl.trim().toUpperCase().equals("NO-CACHE")) { return false; }
+
+ cacheControl = (String) this.requestHeader.get(httpHeader.CACHE_CONTROL);
+ if (cacheControl != null) {
+ cacheControl = cacheControl.trim().toUpperCase();
+ if (cacheControl.startsWith("NO-CACHE") || cacheControl.startsWith("MAX-AGE=0")) { return false; }
+ }
+ }
+
+ // -if-modified-since in request
+ // The entity has to be transferred only if it has
+ // been modified since the date given by the If-Modified-Since header.
+ if (this.requestHeader.containsKey(httpHeader.IF_MODIFIED_SINCE)) {
+ // checking this makes only sense if the cached response contains
+ // a Last-Modified field. If the field does not exist, we go the safe way
+ if (!this.responseHeader.containsKey(httpHeader.LAST_MODIFIED)) { return false; }
+ // parse date
+ Date d1, d2;
+ d2 = this.responseHeader.lastModified(); if (d2 == null) { d2 = new Date(serverDate.correctedUTCTime()); }
+ d1 = this.requestHeader.ifModifiedSince(); if (d1 == null) { d1 = new Date(serverDate.correctedUTCTime()); }
+ // finally, we shall treat the cache as stale if the modification time is after the if-.. time
+ if (d2.after(d1)) { return false; }
+ }
+
+ String mimeType = this.getMimeType();
+ if (!plasmaHTCache.isPicture(mimeType)) {
+ // -cookies in request
+ // unfortunately, we should reload in case of a cookie
+ // but we think that pictures can still be considered as fresh
+ // -set-cookie in cached response
+ // this is a similar case as for COOKIE.
+ if (this.requestHeader.containsKey(httpHeader.COOKIE) ||
+ this.responseHeader.containsKey(httpHeader.SET_COOKIE) ||
+ this.responseHeader.containsKey(httpHeader.SET_COOKIE2)) {
+ return false; // too strong
+ }
+ }
+
+ // -pragma in cached response
+ // logically, we would not need to care about no-cache pragmas in cached response headers,
+ // because they cannot exist since they are not written to the cache.
+ // So this IF should always fail..
+ cacheControl = (String) this.responseHeader.get(httpHeader.PRAGMA);
+ if (cacheControl != null && cacheControl.trim().toUpperCase().equals("NO-CACHE")) { return false; }
+
+ // see for documentation also:
+ // http://www.web-caching.com/cacheability.html
+ // http://vancouver-webpages.com/CacheNow/
+
+ // look for freshness information
+ // if we don't have any freshness indication, we treat the file as stale.
+ // no handle for freshness control:
+
+ // -expires in cached response
+ // the expires value gives us a very easy hint when the cache is stale
+ Date expires = this.responseHeader.expires();
+ if (expires != null) {
+// System.out.println("EXPIRES-TEST: expires=" + expires + ", NOW=" + serverDate.correctedGMTDate() + ", url=" + url);
+ if (expires.before(new Date(serverDate.correctedUTCTime()))) { return false; }
+ }
+ Date lastModified = this.responseHeader.lastModified();
+ cacheControl = (String) this.responseHeader.get(httpHeader.CACHE_CONTROL);
+ if (cacheControl == null && lastModified == null && expires == null) { return false; }
+
+ // -lastModified in cached response
+ // we can apply a TTL (Time To Live) heuristic here. We call the time delta between the last read
+ // of the file and the last modified date as the age of the file. If we consider the file as
+ // middle-aged then, the maximum TTL would be cache-creation plus age.
+ // This would be a TTL factor of 100% we want no more than 10% TTL, so that a 10 month old cache
+ // file may only be treated as fresh for one more month, not more.
+ Date date = this.responseHeader.date();
+ if (lastModified != null) {
+ if (date == null) { date = new Date(serverDate.correctedUTCTime()); }
+ long age = date.getTime() - lastModified.getTime();
+ if (age < 0) { return false; }
+ // TTL (Time-To-Live) is age/10 = (d2.getTime() - d1.getTime()) / 10
+ // the actual living-time is serverDate.correctedGMTDate().getTime() - d2.getTime()
+ // therefore the cache is stale, if serverDate.correctedGMTDate().getTime() - d2.getTime() > age/10
+ if (serverDate.correctedUTCTime() - date.getTime() > age / 10) { return false; }
+ }
+
+ // -cache-control in cached response
+ // the cache-control has many value options.
+ if (cacheControl != null) {
+ cacheControl = cacheControl.trim().toUpperCase();
+ if (cacheControl.startsWith("PRIVATE") ||
+ cacheControl.startsWith("NO-CACHE") ||
+ cacheControl.startsWith("NO-STORE")) {
+ // easy case
+ return false;
+// } else if (cacheControl.startsWith("PUBLIC")) {
+// // ok, do nothing
+ } else if (cacheControl.startsWith("MAX-AGE=")) {
+ // we need also the load date
+ if (date == null) { return false; }
+ try {
+ final long ttl = 1000 * Long.parseLong(cacheControl.substring(8)); // milliseconds to live
+ if (serverDate.correctedUTCTime() - date.getTime() > ttl) {
+ return false;
+ }
+ } catch (Exception e) {
+ return false;
+ }
+ }
+ }
+ return true;
+ }
+
+ public boolean validResponseStatus(String responseStatus) {
+ return responseStatus.startsWith("200") ||
+ responseStatus.startsWith("203");
+ }
+
+ public Date ifModifiedSince() {
+ return (this.requestHeader == null) ? null : this.requestHeader.ifModifiedSince();
+ }
+
+ public boolean requestWithCookie() {
+ return (this.requestHeader == null) ? false : this.requestHeader.containsKey(httpHeader.COOKIE);
+ }
+
+ public boolean requestProhibitsIndexing() {
+ return (this.requestHeader == null)
+ ? false
+ : this.requestHeader.containsKey(httpHeader.X_YACY_INDEX_CONTROL) &&
+ ((String)this.requestHeader.get(httpHeader.X_YACY_INDEX_CONTROL)).toUpperCase().equals("NO-INDEX");
+ }
+
+ public httpHeader getRequestHeader() {
+ return this.requestHeader;
+ }
+
+ public httpHeader getResponseHeader() {
+ return this.responseHeader;
+ }
+}
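Aside: two freshness rules recur in the class above, Cache-Control max-age measured against the response Date, and a TTL heuristic that keeps a page "fresh" for one tenth of its age (Date minus Last-Modified). A small sketch with plain JDK types so the arithmetic is easy to check; class and method names are illustrative only:

import java.util.Date;

class FreshnessSketch {
    /** Cache-Control: max-age=N, stale once (now - Date header) exceeds N seconds. */
    static boolean freshByMaxAge(Date responseDate, long maxAgeSeconds, long nowMillis) {
        return nowMillis - responseDate.getTime() <= maxAgeSeconds * 1000L;
    }
    /** Age/10 heuristic: age = Date header minus Last-Modified; stale once the cached
     *  copy has lived longer than a tenth of that age. */
    static boolean freshByTtlHeuristic(Date responseDate, Date lastModified, long nowMillis) {
        long age = responseDate.getTime() - lastModified.getTime();
        if (age < 0) return false;
        return nowMillis - responseDate.getTime() <= age / 10;
    }
}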
diff --git a/source/de/anomic/plasma/crawler/http/CrawlWorker.java b/source/de/anomic/plasma/crawler/http/CrawlWorker.java
index 5eaafd77b..dcddcdc1f 100644
--- a/source/de/anomic/plasma/crawler/http/CrawlWorker.java
+++ b/source/de/anomic/plasma/crawler/http/CrawlWorker.java
@@ -64,6 +64,8 @@ import de.anomic.plasma.plasmaCrawlLoader;
import de.anomic.plasma.plasmaHTCache;
import de.anomic.plasma.plasmaParser;
import de.anomic.plasma.plasmaSwitchboard;
+import de.anomic.plasma.cache.IResourceInfo;
+import de.anomic.plasma.cache.http.ResourceInfo;
import de.anomic.plasma.crawler.AbstractCrawlWorker;
import de.anomic.plasma.crawler.plasmaCrawlerPool;
import de.anomic.plasma.urlPattern.plasmaURLPattern;
@@ -129,15 +131,15 @@ public final class CrawlWorker extends AbstractCrawlWorker {
return load(DEFAULT_CRAWLING_RETRY_COUNT);
}
- protected plasmaHTCache.Entry createCacheEntry(Date requestDate, httpHeader requestHeader, httpc.response response) {
+ protected plasmaHTCache.Entry createCacheEntry(URL requestUrl, Date requestDate, httpHeader requestHeader, httpc.response response) {
+ IResourceInfo resourceInfo = new ResourceInfo(requestUrl,requestHeader,response.responseHeader);
return this.cacheManager.newEntry(
requestDate,
this.depth,
this.url,
- this.name,
- requestHeader,
- response.status,
- response.responseHeader,
+ this.name,
+ response.status,
+ resourceInfo,
this.initiator,
this.profile
);
@@ -197,7 +199,7 @@ public final class CrawlWorker extends AbstractCrawlWorker {
// the transfer is ok
// create a new cache entry
- htCache = createCacheEntry(requestDate, requestHeader, res);
+ htCache = createCacheEntry(this.url,requestDate, requestHeader, res);
// aborting download if content is to long ...
if (htCache.cacheFile().getAbsolutePath().length() > serverSystem.maxPathLength) {
diff --git a/source/de/anomic/plasma/plasmaCrawlStacker.java b/source/de/anomic/plasma/plasmaCrawlStacker.java
index 0b444dc50..103962769 100644
--- a/source/de/anomic/plasma/plasmaCrawlStacker.java
+++ b/source/de/anomic/plasma/plasmaCrawlStacker.java
@@ -295,7 +295,7 @@ public final class plasmaCrawlStacker {
}
// check if ip is local ip address
- checkInterruption();
+ checkInterruption(); // TODO: this is protocol specific
InetAddress hostAddress = httpc.dnsResolve(nexturl.getHost());
if (hostAddress == null) {
// if a http proxy is configured name resolution may not work
diff --git a/source/de/anomic/plasma/plasmaHTCache.java b/source/de/anomic/plasma/plasmaHTCache.java
index c0b40bca4..99b3bf474 100644
--- a/source/de/anomic/plasma/plasmaHTCache.java
+++ b/source/de/anomic/plasma/plasmaHTCache.java
@@ -54,14 +54,12 @@
package de.anomic.plasma;
import de.anomic.http.httpc;
-import de.anomic.http.httpHeader;
import de.anomic.index.indexEntryAttribute;
import de.anomic.index.indexURL;
import de.anomic.kelondro.kelondroDyn;
import de.anomic.kelondro.kelondroMap;
import de.anomic.kelondro.kelondroMScoreCluster;
import de.anomic.server.logging.serverLog;
-import de.anomic.server.serverDate;
import de.anomic.server.serverFileUtils;
import de.anomic.server.serverInstantThread;
import de.anomic.server.serverSystem;
@@ -73,6 +71,9 @@ import java.io.IOException;
import java.net.InetAddress;
import java.net.MalformedURLException;
import de.anomic.net.URL;
+import de.anomic.plasma.cache.IResourceInfo;
+import de.anomic.plasma.cache.ResourceInfoFactory;
+
import java.util.Date;
import java.util.HashSet;
import java.util.Iterator;
@@ -87,7 +88,7 @@ public final class plasmaHTCache {
private static final int stackLimit = 150; // if we exceed that limit, we do not check idle
public static final long oneday = 1000 * 60 * 60 * 24; // milliseconds of a day
- private kelondroMap responseHeaderDB = null;
+ kelondroMap responseHeaderDB = null;
private final LinkedList cacheStack;
private final TreeMap cacheAge; // a - relation
public long curCacheSize;
@@ -96,11 +97,16 @@ public final class plasmaHTCache {
public final serverLog log;
public static final HashSet filesInUse = new HashSet(); // can we delete this file
+ private ResourceInfoFactory objFactory;
+
public plasmaHTCache(File htCachePath, long maxCacheSize, int bufferkb, long preloadTime) {
// this.switchboard = switchboard;
this.log = new serverLog("HTCACHE");
this.cachePath = htCachePath;
+
+ // create the object factory
+ this.objFactory = new ResourceInfoFactory();
// reset old HTCache ?
String[] list = this.cachePath.list();
@@ -229,10 +235,6 @@ public final class plasmaHTCache {
}
}
- public void storeHeader(String urlHash, httpHeader responseHeader) throws IOException {
- this.responseHeaderDB.set(urlHash, responseHeader);
- }
-
/**
* This method changes the HTCache size.
* @param new cache size in bytes
@@ -249,7 +251,7 @@ public final class plasmaHTCache {
return (this.curCacheSize >= this.maxCacheSize) ? 0 : this.maxCacheSize - this.curCacheSize;
}
- public boolean writeFile(URL url, byte[] array) {
+ public boolean writeResourceContent(URL url, byte[] array) {
if (array == null) return false;
File file = getCachePath(url);
try {
@@ -445,10 +447,24 @@ public final class plasmaHTCache {
return prefix + s.substring(0, p);
}
- public httpHeader getCachedResponse(String urlHash) throws IOException {
+ /**
+ * Returns an object containing metadata about a cached resource
+ * @param url the url of the resource
+ * @return an {@link IResourceInfo info object}
+ * @throws Exception if the info object could not be created, e.g. if the protocol is not supported
+ */
+ public IResourceInfo loadResourceInfo(URL url) throws Exception {
+
+ // getting the URL hash
+ String urlHash = indexURL.urlHash(url.toNormalform());
+
+ // loading data from database
Map hdb = this.responseHeaderDB.get(urlHash);
if (hdb == null) return null;
- return new httpHeader(null, hdb);
+
+ // generate the cached object
+ IResourceInfo cachedObj = this.objFactory.buildResourceInfoObj(url, hdb);
+ return cachedObj;
}
public boolean full() {
@@ -459,18 +475,17 @@ public final class plasmaHTCache {
return (this.cacheStack.size() == 0);
}
- public static boolean isPicture(httpHeader response) {
- Object ct = response.get(httpHeader.CONTENT_TYPE);
- if (ct == null) return false;
- return ((String)ct).toUpperCase().startsWith("IMAGE");
+ public static boolean isPicture(String mimeType) {
+ if (mimeType == null) return false;
+ return mimeType.toUpperCase().startsWith("IMAGE");
}
- public static boolean isText(httpHeader response) {
+ public static boolean isText(String mimeType) {
// Object ct = response.get(httpHeader.CONTENT_TYPE);
// if (ct == null) return false;
// String t = ((String)ct).toLowerCase();
// return ((t.startsWith("text")) || (t.equals("application/xhtml+xml")));
- return plasmaParser.supportedMimeTypesContains(response.mime());
+ return plasmaParser.supportedMimeTypesContains(mimeType);
}
public static boolean noIndexingURL(String urlString) {
@@ -568,9 +583,8 @@ public final class plasmaHTCache {
}
if (port < 0) {
return new File(this.cachePath, protocol + "/" + host + path);
- } else {
- return new File(this.cachePath, protocol + "/" + host + "!" + port + path);
}
+ return new File(this.cachePath, protocol + "/" + host + "!" + port + path);
}
/**
@@ -663,7 +677,7 @@ public final class plasmaHTCache {
return null;
}
- public byte[] loadResource(URL url) {
+ public byte[] loadResourceContent(URL url) {
// load the url as resource from the cache
File f = getCachePath(url);
if (f.exists()) try {
@@ -690,12 +704,30 @@ public final class plasmaHTCache {
(ls.indexOf("memberlist.php?sid=") >= 0));
}
- public Entry newEntry(Date initDate, int depth, URL url, String name,
- httpHeader requestHeader,
- String responseStatus, httpHeader responseHeader,
- String initiator,
- plasmaCrawlProfile.entry profile) {
- return new Entry(initDate, depth, url, name, requestHeader, responseStatus, responseHeader, initiator, profile);
+ public Entry newEntry(
+ Date initDate,
+ int depth,
+ URL url,
+ String name,
+ //httpHeader requestHeader,
+ String responseStatus,
+ //httpHeader responseHeader,
+ IResourceInfo docInfo,
+ String initiator,
+ plasmaCrawlProfile.entry profile
+ ) {
+ return new Entry(
+ initDate,
+ depth,
+ url,
+ name,
+ //requestHeader,
+ responseStatus,
+ //responseHeader,
+ docInfo,
+ initiator,
+ profile
+ );
}
public final class Entry {
@@ -703,9 +735,9 @@ public final class plasmaHTCache {
// the class objects
private Date initDate; // the date when the request happened; will be used as a key
private int depth; // the depth of prefetching
- private httpHeader requestHeader; // we carry also the header to prevent too many file system access
- private String responseStatus;
- private httpHeader responseHeader; // we carry also the header to prevent too many file system access
+// private httpHeader requestHeader; // we carry also the header to prevent too many file system access
+// private httpHeader responseHeader; // we carry also the header to prevent too many file system access
+ private String responseStatus;
private File cacheFile; // the cache file
private byte[] cacheArray; // or the cache as byte-array
private URL url;
@@ -718,6 +750,11 @@ public final class plasmaHTCache {
private String language;
private plasmaCrawlProfile.entry profile;
private String initiator;
+
+ /**
+ * protocol-specific information about the resource
+ */
+ private IResourceInfo resInfo;
protected Object clone() throws CloneNotSupportedException {
return new Entry(
@@ -725,9 +762,10 @@ public final class plasmaHTCache {
this.depth,
this.url,
this.name,
- this.requestHeader,
+ //this.requestHeader,
this.responseStatus,
- this.responseHeader,
+ //this.responseHeader,
+ this.resInfo,
this.initiator,
this.profile
);
@@ -737,15 +775,21 @@ public final class plasmaHTCache {
int depth,
URL url,
String name,
- httpHeader requestHeader,
- String responseStatus,
- httpHeader responseHeader,
+ //httpHeader requestHeader,
+ String responseStatus,
+ //httpHeader responseHeader,
+ IResourceInfo resourceInfo,
String initiator,
plasmaCrawlProfile.entry profile
) {
-
+ if (resourceInfo == null){
+ System.out.println("Content information object is null. " + url);
+ System.exit(0);
+ }
+ this.resInfo = resourceInfo;
+
+
// normalize url
-// serverLog.logFine("PLASMA", "Entry: URL=" + url.toString());
this.nomalizedURLString = url.toNormalform();
try {
@@ -761,28 +805,17 @@ public final class plasmaHTCache {
// assigned:
this.initDate = initDate;
this.depth = depth;
- this.requestHeader = requestHeader;
+ //this.requestHeader = requestHeader;
this.responseStatus = responseStatus;
- this.responseHeader = responseHeader;
+ //this.responseHeader = responseHeader;
this.profile = profile;
this.initiator = (initiator == null) ? null : ((initiator.length() == 0) ? null : initiator);
- // calculated:
- if (responseHeader == null) {
- try {
- throw new RuntimeException("RESPONSE HEADER = NULL");
- } catch (Exception e) {
- System.out.println("RESPONSE HEADER = NULL in " + url);
- e.printStackTrace();
- System.exit(0);
- }
-
- this.lastModified = new Date(serverDate.correctedUTCTime());
- } else {
- this.lastModified = responseHeader.lastModified();
- if (this.lastModified == null) this.lastModified = new Date(serverDate.correctedUTCTime()); // does not exist in header
- }
- this.doctype = indexEntryAttribute.docType(responseHeader.mime());
+ // getting the last modified date
+ this.lastModified = resourceInfo.getModificationDate();
+
+ // getting the doctype
+ this.doctype = indexEntryAttribute.docType(resourceInfo.getMimeType());
if (this.doctype == indexEntryAttribute.DT_UNKNOWN) this.doctype = indexEntryAttribute.docType(url);
this.language = indexEntryAttribute.language(url);
@@ -822,12 +855,7 @@ public final class plasmaHTCache {
}
public URL referrerURL() {
- if (this.requestHeader == null) return null;
- try {
- return new URL((String) this.requestHeader.get(httpHeader.REFERER, ""));
- } catch (Exception e) {
- return null;
- }
+ return (this.resInfo==null)?null:this.resInfo.getRefererUrl();
}
public File cacheFile() {
@@ -846,27 +874,36 @@ public final class plasmaHTCache {
// return this.requestHeader;
// }
- public httpHeader responseHeader() {
- return this.responseHeader;
+// public httpHeader responseHeader() {
+// return this.responseHeader;
+// }
+
+ public IResourceInfo getDocumentInfo() {
+ return this.resInfo;
}
+ public boolean writeResourceInfo() throws IOException {
+ assert(this.nomalizedURLHash != null) : "URL Hash is null";
+ if (this.resInfo == null) return false;
+
+ plasmaHTCache.this.responseHeaderDB.set(this.nomalizedURLHash, this.resInfo.getMap());
+ return true;
+ }
+
public String getMimeType() {
- return (this.responseHeader == null) ? null : this.responseHeader.mime();
+ return (this.resInfo == null) ? null : this.resInfo.getMimeType();
}
public Date ifModifiedSince() {
- return (this.requestHeader == null) ? null : this.requestHeader.ifModifiedSince();
+ return (this.resInfo == null) ? null : this.resInfo.ifModifiedSince();
}
public boolean requestWithCookie() {
- return (this.requestHeader == null) ? false : this.requestHeader.containsKey(httpHeader.COOKIE);
+ return (this.resInfo == null) ? false : this.resInfo.requestWithCookie();
}
public boolean requestProhibitsIndexing() {
- return (this.requestHeader == null)
- ? false
- : this.requestHeader.containsKey(httpHeader.X_YACY_INDEX_CONTROL) &&
- ((String)this.requestHeader.get(httpHeader.X_YACY_INDEX_CONTROL)).toUpperCase().equals("NO-INDEX");
+ return (this.resInfo == null) ? false : this.resInfo.requestProhibitsIndexing();
}
/*
@@ -878,9 +915,10 @@ public final class plasmaHTCache {
// the following three methods for cache read/write granting shall be as loose as possible
// but also as strict as necessary to enable caching of most items
+ /**
+     * @return null if the answer is true; if the answer is false, the reason is returned as a String
+ */
public String shallStoreCacheForProxy() {
- // returns NULL if the answer is TRUE
- // in case of FALSE, the reason as String is returned
// check profile (disabled: we will check this in the plasmaSwitchboard)
//if (!this.profile.storeHTCache()) { return "storage_not_wanted"; }
@@ -889,8 +927,11 @@ public final class plasmaHTCache {
// if the storage was requested by prefetching, the request map is null
// check status code
- if (!(this.responseStatus.startsWith("200") ||
- this.responseStatus.startsWith("203"))) { return "bad_status_" + this.responseStatus.substring(0,3); }
+ if ((this.resInfo != null) && (!this.resInfo.validResponseStatus(this.responseStatus))) {
+ return "bad_status_" + this.responseStatus.substring(0,3);
+ }
+// if (!(this.responseStatus.startsWith("200") ||
+// this.responseStatus.startsWith("203"))) { return "bad_status_" + this.responseStatus.substring(0,3); }
// check storage location
// sometimes a file name is equal to a path name in the same directory;
@@ -905,62 +946,10 @@ public final class plasmaHTCache {
if (isPOST(this.nomalizedURLString) && !this.profile.crawlingQ()) { return "dynamic_post"; }
if (isCGI(this.nomalizedURLString)) { return "dynamic_cgi"; }
- if (this.requestHeader != null) {
- // -authorization cases in request
- // authorization makes pages very individual, and therefore we cannot use the
- // content in the cache
- if (this.requestHeader.containsKey(httpHeader.AUTHORIZATION)) { return "personalized"; }
- // -ranges in request and response
- // we do not cache partial content
- if (this.requestHeader.containsKey(httpHeader.RANGE)) { return "partial"; }
- }
- // -ranges in request and response
- // we do not cache partial content
- if (this.responseHeader != null && this.responseHeader.containsKey(httpHeader.CONTENT_RANGE)) { return "partial"; }
-
- // -if-modified-since in request
- // we do not care about if-modified-since, because this case only occurres if the
- // cache file does not exist, and we need as much info as possible for the indexing
-
- // -cookies in request
- // we do not care about cookies, because that would prevent loading more pages
- // from one domain once a request resulted in a client-side stored cookie
-
- // -set-cookie in response
- // we do not care about cookies in responses, because that info comes along
- // any/many pages from a server and does not express the validity of the page
- // in modes of life-time/expiration or individuality
-
- // -pragma in response
- // if we have a pragma non-cache, we don't cache. usually if this is wanted from
- // the server, it makes sense
- String cacheControl = (String) this.responseHeader.get(httpHeader.PRAGMA);
- if (cacheControl != null && cacheControl.trim().toUpperCase().equals("NO-CACHE")) { return "controlled_no_cache"; }
-
- // -expires in response
- // we do not care about expires, because at the time this is called the data is
- // obvious valid and that header info is used in the indexing later on
-
- // -cache-control in response
- // the cache-control has many value options.
- cacheControl = (String) this.responseHeader.get(httpHeader.CACHE_CONTROL);
- if (cacheControl != null) {
- cacheControl = cacheControl.trim().toUpperCase();
- if (cacheControl.startsWith("MAX-AGE=")) {
- // we need also the load date
- Date date = this.responseHeader.date();
- if (date == null) return "stale_no_date_given_in_response";
- try {
- long ttl = 1000 * Long.parseLong(cacheControl.substring(8)); // milliseconds to live
- if (serverDate.correctedUTCTime() - date.getTime() > ttl) {
- //System.out.println("***not indexed because cache-control");
- return "stale_expired";
- }
- } catch (Exception e) {
- return "stale_error_" + e.getMessage() + ")";
- }
- }
+ if (this.resInfo != null) {
+ return this.resInfo.shallStoreCacheForProxy();
}
+
return null;
}
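
Note: the request/response header checks removed above (authorization, ranges, Pragma no-cache, Cache-Control max-age) are now expected to live inside the resource-info implementation. A condensed sketch of how an HTTP-backed shallStoreCacheForProxy() could mirror exactly those removed checks; the requestHeader/responseHeader fields are assumptions of the sketch, not part of this hunk.

    // Sketch only: HTTP-specific storage veto, mirroring the checks removed above.
    public String shallStoreCacheForProxy() {
        if (this.requestHeader != null) {
            // authorization makes the content user-specific, so it must not be cached
            if (this.requestHeader.containsKey(httpHeader.AUTHORIZATION)) return "personalized";
            // partial content is never cached
            if (this.requestHeader.containsKey(httpHeader.RANGE)) return "partial";
        }
        if (this.responseHeader != null) {
            if (this.responseHeader.containsKey(httpHeader.CONTENT_RANGE)) return "partial";
            String pragma = (String) this.responseHeader.get(httpHeader.PRAGMA);
            if (pragma != null && pragma.trim().equalsIgnoreCase("NO-CACHE")) return "controlled_no_cache";
            String cacheControl = (String) this.responseHeader.get(httpHeader.CACHE_CONTROL);
            if (cacheControl != null) {
                cacheControl = cacheControl.trim().toUpperCase();
                if (cacheControl.startsWith("MAX-AGE=")) {
                    // max-age needs the load date to decide freshness
                    Date date = this.responseHeader.date();
                    if (date == null) return "stale_no_date_given_in_response";
                    try {
                        long ttl = 1000 * Long.parseLong(cacheControl.substring(8)); // seconds to live -> milliseconds
                        if (serverDate.correctedUTCTime() - date.getTime() > ttl) return "stale_expired";
                    } catch (NumberFormatException e) {
                        return "stale_error_" + e.getMessage();
                    }
                }
            }
        }
        return null; // null means: storing is allowed
    }
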
@@ -971,146 +960,17 @@ public final class plasmaHTCache {
public boolean shallUseCacheForProxy() {
// System.out.println("SHALL READ CACHE: requestHeader = " + requestHeader.toString() + ", responseHeader = " + responseHeader.toString());
- String cacheControl;
- if (this.requestHeader != null) {
- // -authorization cases in request
- if (this.requestHeader.containsKey(httpHeader.AUTHORIZATION)) { return false; }
-
- // -ranges in request
- // we do not cache partial content
- if (this.requestHeader.containsKey(httpHeader.RANGE)) { return false; }
-
- // if the client requests a un-cached copy of the resource ...
- cacheControl = (String) this.requestHeader.get(httpHeader.PRAGMA);
- if (cacheControl != null && cacheControl.trim().toUpperCase().equals("NO-CACHE")) { return false; }
-
- cacheControl = (String) this.requestHeader.get(httpHeader.CACHE_CONTROL);
- if (cacheControl != null) {
- cacheControl = cacheControl.trim().toUpperCase();
- if (cacheControl.startsWith("NO-CACHE") || cacheControl.startsWith("MAX-AGE=0")) { return false; }
- }
- }
-
// -CGI access in request
// CGI access makes the page very individual, and therefore not usable in caches
if (isPOST(this.nomalizedURLString)) { return false; }
if (isCGI(this.nomalizedURLString)) { return false; }
-
- // -if-modified-since in request
- // The entity has to be transferred only if it has
- // been modified since the date given by the If-Modified-Since header.
- if (this.requestHeader.containsKey(httpHeader.IF_MODIFIED_SINCE)) {
- // checking this makes only sense if the cached response contains
- // a Last-Modified field. If the field does not exist, we go the safe way
- if (!this.responseHeader.containsKey(httpHeader.LAST_MODIFIED)) { return false; }
- // parse date
- Date d1, d2;
- d2 = this.responseHeader.lastModified(); if (d2 == null) { d2 = new Date(serverDate.correctedUTCTime()); }
- d1 = this.requestHeader.ifModifiedSince(); if (d1 == null) { d1 = new Date(serverDate.correctedUTCTime()); }
- // finally, we shall treat the cache as stale if the modification time is after the if-.. time
- if (d2.after(d1)) { return false; }
- }
-
- if (!isPicture(this.responseHeader)) {
- // -cookies in request
- // unfortunately, we should reload in case of a cookie
- // but we think that pictures can still be considered as fresh
- // -set-cookie in cached response
- // this is a similar case as for COOKIE.
- if (this.requestHeader.containsKey(httpHeader.COOKIE) ||
- this.responseHeader.containsKey(httpHeader.SET_COOKIE) ||
- this.responseHeader.containsKey(httpHeader.SET_COOKIE2)) {
- return false; // too strong
- }
- }
-
- // -pragma in cached response
- // logically, we would not need to care about no-cache pragmas in cached response headers,
- // because they cannot exist since they are not written to the cache.
- // So this IF should always fail..
- cacheControl = (String) this.responseHeader.get(httpHeader.PRAGMA);
- if (cacheControl != null && cacheControl.trim().toUpperCase().equals("NO-CACHE")) { return false; }
-
- // see for documentation also:
- // http://www.web-caching.com/cacheability.html
- // http://vancouver-webpages.com/CacheNow/
-
- // look for freshnes information
- // if we don't have any freshnes indication, we treat the file as stale.
- // no handle for freshness control:
-
- // -expires in cached response
- // the expires value gives us a very easy hint when the cache is stale
- Date expires = this.responseHeader.expires();
- if (expires != null) {
-// System.out.println("EXPIRES-TEST: expires=" + expires + ", NOW=" + serverDate.correctedGMTDate() + ", url=" + url);
- if (expires.before(new Date(serverDate.correctedUTCTime()))) { return false; }
- }
- Date lastModified = this.responseHeader.lastModified();
- cacheControl = (String) this.responseHeader.get(httpHeader.CACHE_CONTROL);
- if (cacheControl == null && lastModified == null && expires == null) { return false; }
-
- // -lastModified in cached response
- // we can apply a TTL (Time To Live) heuristic here. We call the time delta between the last read
- // of the file and the last modified date as the age of the file. If we consider the file as
- // middel-aged then, the maximum TTL would be cache-creation plus age.
- // This would be a TTL factor of 100% we want no more than 10% TTL, so that a 10 month old cache
- // file may only be treated as fresh for one more month, not more.
- Date date = this.responseHeader.date();
- if (lastModified != null) {
- if (date == null) { date = new Date(serverDate.correctedUTCTime()); }
- long age = date.getTime() - lastModified.getTime();
- if (age < 0) { return false; }
- // TTL (Time-To-Live) is age/10 = (d2.getTime() - d1.getTime()) / 10
- // the actual living-time is serverDate.correctedGMTDate().getTime() - d2.getTime()
- // therefore the cache is stale, if serverDate.correctedGMTDate().getTime() - d2.getTime() > age/10
- if (serverDate.correctedUTCTime() - date.getTime() > age / 10) { return false; }
- }
-
- // -cache-control in cached response
- // the cache-control has many value options.
- if (cacheControl != null) {
- cacheControl = cacheControl.trim().toUpperCase();
- if (cacheControl.startsWith("PRIVATE") ||
- cacheControl.startsWith("NO-CACHE") ||
- cacheControl.startsWith("NO-STORE")) {
- // easy case
- return false;
-// } else if (cacheControl.startsWith("PUBLIC")) {
-// // ok, do nothing
- } else if (cacheControl.startsWith("MAX-AGE=")) {
- // we need also the load date
- if (date == null) { return false; }
- try {
- final long ttl = 1000 * Long.parseLong(cacheControl.substring(8)); // milliseconds to live
- if (serverDate.correctedUTCTime() - date.getTime() > ttl) {
- return false;
- }
- } catch (Exception e) {
- return false;
- }
- }
+
+ if (this.resInfo != null) {
+ return this.resInfo.shallUseCacheForProxy();
}
+
return true;
}
} // class Entry
-
- /*
- public static void main(String[] args) {
- //String[] s = TimeZone.getAvailableIDs();
- //for (int i = 0; i < s.length; i++) System.out.println("ZONE=" + s[i]);
- Calendar c = GregorianCalendar.getInstance();
- int zoneOffset = c.get(Calendar.ZONE_OFFSET)/(60*60*1000);
- int DSTOffset = c.get(Calendar.DST_OFFSET)/(60*60*1000);
- System.out.println("This Offset = " + (zoneOffset + DSTOffset));
- for (int i = 0; i < 12; i++) {
- c = new GregorianCalendar(TimeZone.getTimeZone("Etc/GMT-" + i));
- //c.setTimeZone(TimeZone.getTimeZone("Etc/GMT+0"));
- System.out.println("Zone offset: "+
- c.get(Calendar.ZONE_OFFSET)/(60*60*1000));
- System.out.println(c.get(GregorianCalendar.HOUR) + ", " + c.getTime() + ", " + c.getTimeInMillis());
- }
- }
- **/
}
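
Note: the Entry code in this file only touches a small, protocol-neutral surface of de.anomic.plasma.cache.IResourceInfo. The sketch below lists just the methods that the hunks in this patch actually call; the real interface may declare more.

    import java.util.Date;
    import java.util.Map;
    import de.anomic.net.URL;

    // Sketch only: the IResourceInfo surface exercised by this patch.
    public interface IResourceInfo {
        Map getMap();                               // raw header/metadata map, used for persistence
        Date getModificationDate();                 // last-modified date, with a fallback to "now"
        String getMimeType();
        URL getRefererUrl();
        Date ifModifiedSince();
        boolean requestWithCookie();
        boolean requestProhibitsIndexing();
        boolean validResponseStatus(String responseStatus);
        String shallStoreCacheForProxy();           // null = store, otherwise the rejection reason
        boolean shallUseCacheForProxy();
        String shallIndexCacheForProxy();           // used by plasmaSwitchboardQueue below
    }
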
diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java
index 676f3e047..f6c0fe2b3 100644
--- a/source/de/anomic/plasma/plasmaSnippetCache.java
+++ b/source/de/anomic/plasma/plasmaSnippetCache.java
@@ -44,6 +44,7 @@ package de.anomic.plasma;
import java.io.IOException;
import de.anomic.net.URL;
+import de.anomic.plasma.cache.IResourceInfo;
import de.anomic.plasma.crawler.http.CrawlWorker;
import java.util.Enumeration;
@@ -167,15 +168,15 @@ public class plasmaSnippetCache {
// if the snippet is not in the cache, we can try to get it from the htcache
byte[] resource = null;
- httpHeader header = null;
+ IResourceInfo docInfo = null;
try {
- resource = cacheManager.loadResource(url);
+ resource = this.cacheManager.loadResourceContent(url);
if ((fetchOnline) && (resource == null)) {
plasmaHTCache.Entry entry = loadResourceFromWeb(url, 5000);
if (entry != null) {
- header = entry.responseHeader();
+ docInfo = entry.getDocumentInfo();
}
- resource = cacheManager.loadResource(url);
+ resource = this.cacheManager.loadResourceContent(url);
source = SOURCE_WEB;
}
} catch (IOException e) {
@@ -185,7 +186,7 @@ public class plasmaSnippetCache {
//System.out.println("cannot load document for URL " + url);
return new result(null, ERROR_RESOURCE_LOADING, "error loading resource from web, cacheManager returned NULL");
}
- plasmaParserDocument document = parseDocument(url, resource, header);
+ plasmaParserDocument document = parseDocument(url, resource, docInfo);
if (document == null) return new result(null, ERROR_PARSER_FAILED, "parser error/failed"); // cannot be parsed
//System.out.println("loaded document for URL " + url);
@@ -360,18 +361,18 @@ public class plasmaSnippetCache {
return parseDocument(url, resource, null);
}
- public plasmaParserDocument parseDocument(URL url, byte[] resource, httpHeader header) {
+ public plasmaParserDocument parseDocument(URL url, byte[] resource, IResourceInfo docInfo) {
try {
if (resource == null) return null;
// try to get the header from the htcache directory
- if (header == null) {
+ if (docInfo == null) {
try {
- header = this.cacheManager.getCachedResponse(indexURL.urlHash(url));
- } catch (IOException e) {}
+ docInfo = this.cacheManager.loadResourceInfo(url);
+ } catch (Exception e) {}
}
- if (header == null) {
+ if (docInfo == null) {
String filename = this.cacheManager.getCachePath(url).getName();
int p = filename.lastIndexOf('.');
if ( // if no extension is available
@@ -394,8 +395,8 @@ public class plasmaSnippetCache {
}
return null;
}
- if (plasmaParser.supportedMimeTypesContains(header.mime())) {
- return this.parser.parseSource(url, header.mime(), resource);
+ if (plasmaParser.supportedMimeTypesContains(docInfo.getMimeType())) {
+ return this.parser.parseSource(url, docInfo.getMimeType(), resource);
}
return null;
} catch (InterruptedException e) {
@@ -407,10 +408,10 @@ public class plasmaSnippetCache {
public byte[] getResource(URL url, boolean fetchOnline, int socketTimeout) {
// load the url as resource from the web
try {
- byte[] resource = cacheManager.loadResource(url);
+ byte[] resource = cacheManager.loadResourceContent(url);
if ((fetchOnline) && (resource == null)) {
loadResourceFromWeb(url, (socketTimeout < 0) ? -1 : socketTimeout);
- resource = cacheManager.loadResource(url);
+ resource = cacheManager.loadResourceContent(url);
}
return resource;
} catch (IOException e) {
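
Note: after this change, snippet clients fetch cached content and cached metadata through two separate cacheManager calls. A hedged usage sketch of that pattern; cacheManager, snippetCache and url are assumed to be in scope.

    // Sketch only: load cached content and metadata separately, then parse.
    try {
        byte[] resource = cacheManager.loadResourceContent(url);     // raw bytes, or null if not cached
        IResourceInfo docInfo = cacheManager.loadResourceInfo(url);  // metadata, may be null
        plasmaParserDocument document = snippetCache.parseDocument(url, resource, docInfo);
        if (document == null) {
            // resource missing or parser failed; treat as "no snippet available"
        }
    } catch (Exception e) {
        // I/O or metadata errors: treat the resource as unavailable
    }
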
diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java
index fea42ed2d..49f3e15f9 100644
--- a/source/de/anomic/plasma/plasmaSwitchboard.java
+++ b/source/de/anomic/plasma/plasmaSwitchboard.java
@@ -829,7 +829,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
/* =========================================================================
* LOCAL IP ADDRESS CHECK
*
- * check if ip is local ip address
+     * check if ip is local ip address // TODO: remove this protocol-specific code here
* ========================================================================= */
InetAddress hostAddress = httpc.dnsResolve(entry.url().getHost());
if (hostAddress == null) {
@@ -856,9 +856,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
(entry.profile().storeHTCache()) ||
(doIndexing && isSupportedContent)
) {
- // store response header
- if (entry.responseHeader() != null) {
- this.cacheManager.storeHeader(entry.urlHash(), entry.responseHeader());
+ // store response header
+ if (entry.writeResourceInfo()) {
this.log.logInfo("WROTE HEADER for " + entry.cacheFile());
}
@@ -868,7 +867,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
} else {
String error = entry.shallStoreCacheForProxy();
if (error == null) {
- this.cacheManager.writeFile(entry.url(), entry.cacheArray());
+ this.cacheManager.writeResourceContent(entry.url(), entry.cacheArray());
this.log.logFine("WROTE FILE (" + entry.cacheArray().length + " bytes) for " + entry.cacheFile());
} else {
this.log.logFine("WRITE OF FILE " + entry.cacheFile() + " FORBIDDEN: " + error);
diff --git a/source/de/anomic/plasma/plasmaSwitchboardQueue.java b/source/de/anomic/plasma/plasmaSwitchboardQueue.java
index eaa0e5c9c..eda6f0e90 100644
--- a/source/de/anomic/plasma/plasmaSwitchboardQueue.java
+++ b/source/de/anomic/plasma/plasmaSwitchboardQueue.java
@@ -44,28 +44,27 @@
package de.anomic.plasma;
-import de.anomic.http.httpHeader;
+import java.io.File;
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.util.ArrayList;
+import java.util.Date;
+
import de.anomic.index.indexURL;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroException;
-import de.anomic.kelondro.kelondroStack;
import de.anomic.kelondro.kelondroRow;
+import de.anomic.kelondro.kelondroStack;
+import de.anomic.net.URL;
+import de.anomic.plasma.cache.IResourceInfo;
import de.anomic.server.logging.serverLog;
-import de.anomic.server.serverDate;
import de.anomic.yacy.yacySeedDB;
-import java.io.File;
-import java.io.IOException;
-import java.net.MalformedURLException;
-import de.anomic.net.URL;
-import java.util.ArrayList;
-import java.util.Date;
-
public class plasmaSwitchboardQueue {
private kelondroStack sbQueueStack;
private plasmaCrawlProfile profiles;
- private plasmaHTCache htCache;
+ plasmaHTCache htCache;
private plasmaCrawlLURL lurls;
private File sbQueueStackPath;
@@ -191,7 +190,7 @@ public class plasmaSwitchboardQueue {
// computed values
private plasmaCrawlProfile.entry profileEntry;
- private httpHeader responseHeader;
+ private IResourceInfo contentInfo;
private URL referrerURL;
public Entry(URL url, String referrer, Date ifModifiedSince, boolean requestWithCookie,
@@ -206,7 +205,7 @@ public class plasmaSwitchboardQueue {
this.anchorName = (anchorName==null)?"":anchorName.trim();
this.profileEntry = null;
- this.responseHeader = null;
+ this.contentInfo = null;
this.referrerURL = null;
}
@@ -227,7 +226,7 @@ public class plasmaSwitchboardQueue {
this.anchorName = row.getColString(7, "UTF-8");
this.profileEntry = null;
- this.responseHeader = null;
+ this.contentInfo = null;
this.referrerURL = null;
}
@@ -248,7 +247,7 @@ public class plasmaSwitchboardQueue {
this.anchorName = (row[7] == null) ? null : (new String(row[7], "UTF-8")).trim();
this.profileEntry = null;
- this.responseHeader = null;
+ this.contentInfo = null;
this.referrerURL = null;
}
@@ -306,32 +305,24 @@ public class plasmaSwitchboardQueue {
return profileEntry;
}
- private httpHeader responseHeader() {
- if (responseHeader == null) try {
- responseHeader = htCache.getCachedResponse(indexURL.urlHash(url));
- } catch (IOException e) {
+ private IResourceInfo getCachedObjectInfo() {
+ if (this.contentInfo == null) try {
+ this.contentInfo = plasmaSwitchboardQueue.this.htCache.loadResourceInfo(this.url);
+ } catch (Exception e) {
serverLog.logSevere("PLASMA", "responseHeader: failed to get header", e);
return null;
}
- return responseHeader;
+ return this.contentInfo;
}
public String getMimeType() {
- httpHeader headers = this.responseHeader();
- return (headers == null) ? null : headers.mime();
+ IResourceInfo info = this.getCachedObjectInfo();
+ return (info == null) ? null : info.getMimeType();
}
public Date getModificationDate() {
- Date docDate = null;
-
- httpHeader headers = this.responseHeader();
- if (headers != null) {
- docDate = headers.lastModified();
- if (docDate == null) docDate = headers.date();
- }
- if (docDate == null) docDate = new Date();
-
- return docDate;
+ IResourceInfo info = this.getCachedObjectInfo();
+ return (info == null) ? new Date() : info.getModificationDate();
}
public URL referrerURL() {
@@ -360,6 +351,8 @@ public class plasmaSwitchboardQueue {
* this method returns null if the answer is 'YES'!
* if the answer is 'NO' (do not index), it returns a string with the reason
* to reject the crawling demand in clear text
+ *
+ * This function is used by plasmaSwitchboard#processResourceStack
*/
public final String shallIndexCacheForProxy() {
if (profile() == null) {
@@ -402,91 +395,8 @@ public class plasmaSwitchboardQueue {
return "Dynamic_(Requested_With_Cookie)";
}
- // -set-cookie in response
- // the set-cookie from the server does not indicate that the content is special
- // thus we do not care about it here for indexing
- if (responseHeader() != null) {
- // a picture cannot be indexed
- if (plasmaHTCache.isPicture(responseHeader())) {
- return "Media_Content_(Picture)";
- }
- if (!plasmaHTCache.isText(responseHeader())) {
- return "Media_Content_(not_text)";
- }
-
- // -if-modified-since in request
- // if the page is fresh at the very moment we can index it
- if ((ifModifiedSince != null) && (responseHeader().containsKey(httpHeader.LAST_MODIFIED))) {
- // parse date
- Date d = responseHeader().lastModified();
- if (d == null) {
- d = new Date(serverDate.correctedUTCTime());
- }
- // finally, we shall treat the cache as stale if the modification time is after the if-.. time
- if (d.after(ifModifiedSince)) {
- //System.out.println("***not indexed because if-modified-since");
- return "Stale_(Last-Modified>Modified-Since)";
- }
- }
-
- // -pragma in cached response
- if (responseHeader().containsKey(httpHeader.PRAGMA) &&
- ((String) responseHeader().get(httpHeader.PRAGMA)).toUpperCase().equals("NO-CACHE")) {
- return "Denied_(pragma_no_cache)";
- }
-
- // see for documentation also:
- // http://www.web-caching.com/cacheability.html
-
- // look for freshnes information
-
- // -expires in cached response
- // the expires value gives us a very easy hint when the cache is stale
- // sometimes, the expires date is set to the past to prevent that a page is cached
- // we use that information to see if we should index it
- final Date expires = responseHeader().expires();
- if (expires != null && expires.before(new Date(serverDate.correctedUTCTime()))) {
- return "Stale_(Expired)";
- }
-
- // -lastModified in cached response
- // this information is too weak to use it to prevent indexing
- // even if we can apply a TTL heuristic for cache usage
-
- // -cache-control in cached response
- // the cache-control has many value options.
- String cacheControl = (String) responseHeader.get(httpHeader.CACHE_CONTROL);
- if (cacheControl != null) {
- cacheControl = cacheControl.trim().toUpperCase();
- /* we have the following cases for cache-control:
- "public" -- can be indexed
- "private", "no-cache", "no-store" -- cannot be indexed
- "max-age=" -- stale/fresh dependent on date
- */
- if (cacheControl.startsWith("PRIVATE") ||
- cacheControl.startsWith("NO-CACHE") ||
- cacheControl.startsWith("NO-STORE")) {
- // easy case
- return "Stale_(denied_by_cache-control=" + cacheControl + ")";
-// } else if (cacheControl.startsWith("PUBLIC")) {
-// // ok, do nothing
- } else if (cacheControl.startsWith("MAX-AGE=")) {
- // we need also the load date
- final Date date = responseHeader().date();
- if (date == null) {
- return "Stale_(no_date_given_in_response)";
- }
- try {
- final long ttl = 1000 * Long.parseLong(cacheControl.substring(8)); // milliseconds to live
- if (serverDate.correctedUTCTime() - date.getTime() > ttl) {
- //System.out.println("***not indexed because cache-control");
- return "Stale_(expired_by_cache-control)";
- }
- } catch (Exception e) {
- return "Error_(" + e.getMessage() + ")";
- }
- }
- }
+ if (getCachedObjectInfo() != null) {
+ return this.getCachedObjectInfo().shallIndexCacheForProxy();
}
return null;
}
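
Note: shallIndexCacheForProxy() now reduces to the CGI/cookie checks plus a single delegation to the cached resource info. A hedged sketch of how an indexing caller might consume the queue entry after this change; queueEntry and log are illustrative names.

    // Sketch only: decide whether a queued entry may be indexed, then read its metadata.
    String rejectReason = queueEntry.shallIndexCacheForProxy();   // null means "may be indexed"
    if (rejectReason == null) {
        String mime = queueEntry.getMimeType();                   // null if no cached resource info exists
        Date docDate = queueEntry.getModificationDate();          // falls back to new Date() without info
        // ... pass mime and docDate on to parsing/indexing ...
    } else {
        log.logFine("not indexed: " + rejectReason);
    }
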
@@ -496,6 +406,8 @@ public class plasmaSwitchboardQueue {
* this method returns null if the answer is 'YES'!
* if the answer is 'NO' (do not index), it returns a string with the reason
* to reject the crawling demand in clear text
+ *
+ * This function is used by plasmaSwitchboard#processResourceStack
*/
public final String shallIndexCacheForCrawler() {
if (profile() == null) {
@@ -520,9 +432,9 @@ public class plasmaSwitchboardQueue {
// we checked that in shallStoreCache
// a picture cannot be indexed
- if (responseHeader() != null) {
- if (plasmaHTCache.isPicture(responseHeader())) { return "Media_Content_(Picture)"; }
- if (!plasmaHTCache.isText(responseHeader())) { return "Media_Content_(not_text)"; }
+ if (getCachedObjectInfo() != null) {
+ String status = this.getCachedObjectInfo().shallIndexCacheForProxy();
+ if (status != null) return status;
}
if (plasmaHTCache.noIndexingURL(nURL)) { return "Media_Content_(forbidden)"; }