git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2495 6c8d7289-2bf4-0310-a012-ef5d649a1542

pull/1/head
theli 19 years ago
parent 4825bfaaf3
commit dae763d8e3

@ -59,6 +59,7 @@ import de.anomic.index.indexURL;
import de.anomic.plasma.plasmaHTCache; import de.anomic.plasma.plasmaHTCache;
import de.anomic.plasma.plasmaParserDocument; import de.anomic.plasma.plasmaParserDocument;
import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.cache.IResourceInfo;
import de.anomic.server.serverCore; import de.anomic.server.serverCore;
import de.anomic.server.serverFileUtils; import de.anomic.server.serverFileUtils;
import de.anomic.server.serverObjects; import de.anomic.server.serverObjects;
@ -102,8 +103,8 @@ public class CacheAdmin_p {
info.ensureCapacity(40000); info.ensureCapacity(40000);
try { try {
final httpHeader fileheader = switchboard.cacheManager.getCachedResponse(indexURL.urlHash(url)); final IResourceInfo resInfo = switchboard.cacheManager.loadResourceInfo(url);
info.append("<b>HTTP Header:</b><br>").append(formatHeader(fileheader)).append("<br>"); info.append("<b>HTTP Header:</b><br>").append(formatHeader(resInfo.getMap())).append("<br>");
final String ff = file.toString(); final String ff = file.toString();
final int dotpos = ff.lastIndexOf('.'); final int dotpos = ff.lastIndexOf('.');
final String ext = (dotpos >= 0) ? ff.substring(dotpos + 1).toLowerCase() : ""; final String ext = (dotpos >= 0) ? ff.substring(dotpos + 1).toLowerCase() : "";
@ -198,7 +199,7 @@ public class CacheAdmin_p {
return new String(s); return new String(s);
} }
private static String formatHeader(httpHeader header) { private static String formatHeader(Map header) {
final StringBuffer result = new StringBuffer(2048); final StringBuffer result = new StringBuffer(2048);
if (header == null) { if (header == null) {
result.append("- no header in header cache -<br>"); result.append("- no header in header cache -<br>");

@ -56,6 +56,8 @@ Invalid URL
Unable to download resource content. Unable to download resource content.
:: <!-- 5 --> :: <!-- 5 -->
Unable to parse resource content. Unable to parse resource content.
:: <!-- 6 -->
Unsupported protocol.
#(/error)# #(/error)#
</font> </font>
</p> </p>

@ -4,23 +4,23 @@
//(C) by Michael Peter Christen; mc@anomic.de //(C) by Michael Peter Christen; mc@anomic.de
//first published on http://www.anomic.de //first published on http://www.anomic.de
//Frankfurt, Germany, 2004 //Frankfurt, Germany, 2004
//
//last major change: 12.07.2004 //last major change: 12.07.2004
//
//This program is free software; you can redistribute it and/or modify //This program is free software; you can redistribute it and/or modify
//it under the terms of the GNU General Public License as published by //it under the terms of the GNU General Public License as published by
//the Free Software Foundation; either version 2 of the License, or //the Free Software Foundation; either version 2 of the License, or
//(at your option) any later version. //(at your option) any later version.
//
//This program is distributed in the hope that it will be useful, //This program is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of //but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
//GNU General Public License for more details. //GNU General Public License for more details.
//
//You should have received a copy of the GNU General Public License //You should have received a copy of the GNU General Public License
//along with this program; if not, write to the Free Software //along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA //Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
//Using this software in any meaning (reading, learning, copying, compiling, //Using this software in any meaning (reading, learning, copying, compiling,
//running) means that you agree that the Author(s) is (are) not responsible //running) means that you agree that the Author(s) is (are) not responsible
//for cost, loss of data or any harm that may be caused directly or indirectly //for cost, loss of data or any harm that may be caused directly or indirectly
@ -32,7 +32,7 @@
//(are) also not responsible for proper configuration and usage of the //(are) also not responsible for proper configuration and usage of the
//software, even if provoked by documentation provided together with //software, even if provoked by documentation provided together with
//the software. //the software.
//
//Any changes to this file according to the GPL as documented in the file //Any changes to this file according to the GPL as documented in the file
//gpl.txt aside this file in the shipment you received can be done to the //gpl.txt aside this file in the shipment you received can be done to the
//lines that follows this copyright notice here, but changes must not be //lines that follows this copyright notice here, but changes must not be
@ -56,6 +56,7 @@ import de.anomic.http.httpc;
import de.anomic.plasma.plasmaHTCache; import de.anomic.plasma.plasmaHTCache;
import de.anomic.plasma.plasmaParserDocument; import de.anomic.plasma.plasmaParserDocument;
import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.cache.IResourceInfo;
import de.anomic.plasma.plasmaCrawlLURL.Entry; import de.anomic.plasma.plasmaCrawlLURL.Entry;
import de.anomic.server.serverObjects; import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch; import de.anomic.server.serverSwitch;
@ -123,35 +124,55 @@ public class ViewFile {
// loading the resource content as byte array // loading the resource content as byte array
byte[] resource = null; byte[] resource = null;
httpHeader resHeader = null; IResourceInfo resInfo = null;
String resMime = null; String resMime = null;
try { try {
resource = sb.cacheManager.loadResource(url); // trying to load the resource body
resource = sb.cacheManager.loadResourceContent(url);
// if the resource body was not cached we try to load it from web
if (resource == null) { if (resource == null) {
plasmaHTCache.Entry entry = sb.snippetCache.loadResourceFromWeb(url, 5000); plasmaHTCache.Entry entry = sb.snippetCache.loadResourceFromWeb(url, 5000);
if (entry != null) { if (entry != null) {
resHeader = entry.responseHeader(); resInfo = entry.getDocumentInfo();
resource = sb.cacheManager.loadResourceContent(url);
} }
resource = sb.cacheManager.loadResource(url);
if (resource == null) { if (resource == null) {
prop.put("error",4); prop.put("error",4);
prop.put("viewMode",VIEW_MODE_NO_TEXT); prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop; return prop;
} }
} }
if (resHeader == null) {
resHeader = sb.cacheManager.getCachedResponse(urlEntry.hash()); // try to load resource metadata
if (resHeader == null) { if (resInfo == null) {
resHeader = httpc.whead(url,url.getHost(),5000,null,null,sb.remoteProxyConfig);
if (resource == null) { // try to load the metadata from cache
try {
resInfo = sb.cacheManager.loadResourceInfo(urlEntry.url());
} catch (Exception e) { /* ignore this */}
// if the metadata were not cached try to load it from web
if (resInfo == null) {
String protocol = url.getProtocol();
if (!((protocol.equals("http") || protocol.equals("https")))) {
prop.put("error",6);
prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop;
}
httpHeader responseHeader = httpc.whead(url,url.getHost(),5000,null,null,sb.remoteProxyConfig);
if (responseHeader == null) {
prop.put("error",4); prop.put("error",4);
prop.put("viewMode",VIEW_MODE_NO_TEXT); prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop; return prop;
} }
resMime = resHeader.mime(); resMime = responseHeader.mime();
} }
} else {
resMime = resInfo.getMimeType();
} }
} catch (IOException e) { } catch (IOException e) {
if (url == null) { if (url == null) {
@ -173,7 +194,7 @@ public class ViewFile {
prop.put("viewMode_plainText",content); prop.put("viewMode_plainText",content);
} else if (viewMode.equals("parsed") || viewMode.equals("sentences") || viewMode.equals("iframe")) { } else if (viewMode.equals("parsed") || viewMode.equals("sentences") || viewMode.equals("iframe")) {
// parsing the resource content // parsing the resource content
plasmaParserDocument document = sb.snippetCache.parseDocument(url, resource,resHeader); plasmaParserDocument document = sb.snippetCache.parseDocument(url, resource,resInfo);
if (document == null) { if (document == null) {
prop.put("error",5); prop.put("error",5);
prop.put("viewMode",VIEW_MODE_NO_TEXT); prop.put("viewMode",VIEW_MODE_NO_TEXT);

@ -96,6 +96,8 @@ import de.anomic.index.indexURL;
import de.anomic.plasma.plasmaHTCache; import de.anomic.plasma.plasmaHTCache;
import de.anomic.plasma.plasmaParser; import de.anomic.plasma.plasmaParser;
import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.cache.IResourceInfo;
import de.anomic.plasma.cache.http.ResourceInfo;
import de.anomic.plasma.urlPattern.plasmaURLPattern; import de.anomic.plasma.urlPattern.plasmaURLPattern;
import de.anomic.server.serverCore; import de.anomic.server.serverCore;
import de.anomic.server.serverFileUtils; import de.anomic.server.serverFileUtils;
@ -413,8 +415,8 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt
// decide wether to use a cache entry or connect to the network // decide wether to use a cache entry or connect to the network
File cacheFile = cacheManager.getCachePath(url); File cacheFile = cacheManager.getCachePath(url);
String urlHash = indexURL.urlHash(url); ResourceInfo cachedResInfo = (ResourceInfo) cacheManager.loadResourceInfo(url);
httpHeader cachedResponseHeader = cacheManager.getCachedResponse(urlHash); httpHeader cachedResponseHeader = (cachedResInfo == null)?null:cachedResInfo.getResponseHeader();
boolean cacheExists = ((cacheFile.isFile()) && (cachedResponseHeader != null)); boolean cacheExists = ((cacheFile.isFile()) && (cachedResponseHeader != null));
// why are files unzipped upon arrival? why not zip all files in cache? // why are files unzipped upon arrival? why not zip all files in cache?
@ -445,9 +447,10 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt
0, // crawling depth 0, // crawling depth
url, // url url, // url
"", // name of the url is unknown "", // name of the url is unknown
requestHeader, // request headers //requestHeader, // request headers
"200 OK", // request status "200 OK", // request status
cachedResponseHeader, // response headers //cachedResponseHeader, // response headers
cachedResInfo,
null, // initiator null, // initiator
switchboard.defaultProxyProfile // profile switchboard.defaultProxyProfile // profile
); );
@ -580,14 +583,16 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt
// reserver cache entry // reserver cache entry
Date requestDate = new Date(((Long)conProp.get(httpHeader.CONNECTION_PROP_REQUEST_START)).longValue()); Date requestDate = new Date(((Long)conProp.get(httpHeader.CONNECTION_PROP_REQUEST_START)).longValue());
IResourceInfo resInfo = new ResourceInfo(url,requestHeader,res.responseHeader);
plasmaHTCache.Entry cacheEntry = cacheManager.newEntry( plasmaHTCache.Entry cacheEntry = cacheManager.newEntry(
requestDate, requestDate,
0, 0,
url, url,
"", "",
requestHeader, //requestHeader,
res.status, res.status,
res.responseHeader, //res.responseHeader,
resInfo,
null, null,
switchboard.defaultProxyProfile switchboard.defaultProxyProfile
); );

@ -64,6 +64,8 @@ import de.anomic.http.httpc;
import de.anomic.plasma.plasmaHTCache; import de.anomic.plasma.plasmaHTCache;
import de.anomic.plasma.plasmaParser; import de.anomic.plasma.plasmaParser;
import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.cache.IResourceInfo;
import de.anomic.plasma.cache.http.ResourceInfo;
import de.anomic.server.serverCore; import de.anomic.server.serverCore;
import de.anomic.server.serverFileUtils; import de.anomic.server.serverFileUtils;
import de.anomic.server.serverHandler; import de.anomic.server.serverHandler;
@ -385,14 +387,14 @@ public class icapd implements serverHandler {
* ========================================================================= */ * ========================================================================= */
// generating a htcache entry object // generating a htcache entry object
IResourceInfo resInfo = new ResourceInfo(httpRequestURL,httpReqHeader,httpResHeader);
plasmaHTCache.Entry cacheEntry = cacheManager.newEntry( plasmaHTCache.Entry cacheEntry = cacheManager.newEntry(
new Date(), new Date(),
0, 0,
httpRequestURL, httpRequestURL,
"", "",
httpReqHeader,
httpRespStatusLine, httpRespStatusLine,
httpResHeader, resInfo,
null, null,
switchboard.defaultProxyProfile switchboard.defaultProxyProfile
); );

@ -0,0 +1,136 @@
// IResourceInfo.java
// -------------------------------------
// part of YACY
// (C) by Michael Peter Christen; mc@anomic.de
// first published on http://www.anomic.de
// Frankfurt, Germany, 2006
//
// This file ist contributed by Martin Thelian
//
// $LastChangedDate: 2006-02-20 23:57:42 +0100 (Mo, 20 Feb 2006) $
// $LastChangedRevision: 1715 $
// $LastChangedBy: theli $
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this softare or this documentation. The usage of this software
// is on your own risk. The installation and usage (starting/running) of this
// software may allow other people or application to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
//
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follows this copyright notice here, but changes must not be
// done inside the copyright notive above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.
package de.anomic.plasma.cache;
import java.util.Date;
import java.util.Map;
import de.anomic.net.URL;
/**
 * Metadata of a cached resource together with the decisions
 * (index / store / use cache) that are derived from that metadata.
 */
public interface IResourceInfo {

    /**
     * Provides the resource information in map form.
     *
     * @return the resource information as {@link Map}
     */
    Map getMap();

    /**
     * Provides the URL of this content.
     *
     * @return the URL of the resource
     */
    URL getUrl();

    /**
     * Provides the referer URL of this URL.
     *
     * @return referer URL
     */
    URL getRefererUrl();

    /**
     * Provides the mimetype of the cached object.
     *
     * @return mimetype
     */
    String getMimeType();

    /**
     * Provides the modification date of the cached object.
     *
     * @return the modification date
     */
    Date getModificationDate();

    /**
     * Provides the url hash of the content URL.
     *
     * @return the url hash
     */
    String getUrlHash();

    /**
     * Tells whether the resource was requested with an
     * if-modified-since date.
     *
     * @return the if-modified-since date of the request
     *         (NOTE(review): presumably <code>null</code> if the request
     *         carried no such date - confirm against the implementations)
     */
    Date ifModifiedSince();

    /**
     * Tells whether the resource was requested with client specific
     * information (e.g. cookies for http).
     *
     * @return <code>true</code> if the request carried such information
     */
    boolean requestWithCookie();

    /**
     * Tells whether the request prohibits indexing.
     *
     * @return <code>true</code> if indexing is prohibited by the request
     */
    boolean requestProhibitsIndexing();

    /**
     * Decides whether a resource that was downloaded by the crawler
     * is allowed to be indexed.
     *
     * @return an error string describing the reason why the
     * resource should not be indexed or <code>null</code> if indexing is allowed
     */
    String shallIndexCacheForCrawler();

    /**
     * Decides whether a resource that was downloaded by the proxy
     * is allowed to be indexed.
     *
     * @return an error string describing the reason why the
     * resource should not be indexed or <code>null</code> if indexing is allowed
     */
    String shallIndexCacheForProxy();

    /**
     * Decides whether a resource that was downloaded by the proxy
     * may be stored in the cache.
     *
     * @return NOTE(review): presumably a string naming the reason why the
     * resource must not be stored, or <code>null</code> if storing is
     * allowed (analogous to the <code>shallIndexCache...</code> methods) - confirm
     */
    String shallStoreCacheForProxy();

    /**
     * Decides whether the proxy may answer a request with the cached
     * copy of the resource.
     *
     * @return NOTE(review): presumably <code>true</code> if the cached copy
     * may be used - confirm against the implementations
     */
    boolean shallUseCacheForProxy();

    /**
     * Checks a response status.
     *
     * @param responseStatus the response status to check
     * @return NOTE(review): presumably <code>true</code> if the status is
     * acceptable for the protocol of the resource - confirm
     */
    boolean validResponseStatus(String responseStatus);
}

@ -0,0 +1,86 @@
// RespourceInfoFactory.java
// -------------------------------------
// part of YACY
// (C) by Michael Peter Christen; mc@anomic.de
// first published on http://www.anomic.de
// Frankfurt, Germany, 2006
//
// This file ist contributed by Martin Thelian
//
// $LastChangedDate: 2006-02-20 23:57:42 +0100 (Mo, 20 Feb 2006) $
// $LastChangedRevision: 1715 $
// $LastChangedBy: theli $
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this softare or this documentation. The usage of this software
// is on your own risk. The installation and usage (starting/running) of this
// software may allow other people or application to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
//
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follows this copyright notice here, but changes must not be
// done inside the copyright notive above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.
package de.anomic.plasma.cache;
import java.lang.reflect.Constructor;
import java.util.Map;
import de.anomic.net.URL;
/**
 * Factory for protocol specific {@link IResourceInfo} objects.
 *
 * The implementation class is located by naming convention:
 * <code>&lt;package of this factory&gt;.&lt;protocol&gt;.ResourceInfo</code>.
 */
public class ResourceInfoFactory {

    /**
     * Creates the {@link IResourceInfo} implementation that belongs to the
     * protocol of the given URL by calling its <code>(URL, Map)</code>
     * constructor reflectively.
     *
     * @param resourceURL the URL of the resource; its protocol selects the
     *        implementation class
     * @param resourceMetadata the metadata handed over to the constructor
     * @return the newly created resource info object
     * @throws Exception if the class for the protocol cannot be found, has
     *         no matching constructor or cannot be instantiated
     */
    public IResourceInfo buildResourceInfoObj(
            URL resourceURL,
            Map resourceMetadata
    ) throws Exception {

        String protocString = resourceURL.getProtocol();

        // Derive the package prefix from the class name instead of using
        // getClass().getPackage(): the latter may return null, depending on
        // the class loader, which would end in a bare NullPointerException.
        String factoryClassName = this.getClass().getName();
        int lastDot = factoryClassName.lastIndexOf('.');
        String packagePrefix = (lastDot < 0) ? "" : factoryClassName.substring(0, lastDot + 1);

        // the full qualified class name
        String className = packagePrefix + protocString + ".ResourceInfo";

        // loading class by name
        Class moduleClass = Class.forName(className);

        // getting the constructor
        Constructor classConstructor = moduleClass.getConstructor(new Class[] {
                URL.class,
                Map.class
        });

        // instantiating class
        IResourceInfo infoObject = (IResourceInfo) classConstructor.newInstance(new Object[] {
                resourceURL,
                resourceMetadata
        });

        // return the newly created object
        return infoObject;
    }
}

@ -0,0 +1,467 @@
// ResourceInfo.java
// -------------------------------------
// part of YACY
// (C) by Michael Peter Christen; mc@anomic.de
// first published on http://www.anomic.de
// Frankfurt, Germany, 2006
//
// This file ist contributed by Martin Thelian
//
// $LastChangedDate: 2006-02-20 23:57:42 +0100 (Mo, 20 Feb 2006) $
// $LastChangedRevision: 1715 $
// $LastChangedBy: theli $
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this softare or this documentation. The usage of this software
// is on your own risk. The installation and usage (starting/running) of this
// software may allow other people or application to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
//
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follows this copyright notice here, but changes must not be
// done inside the copyright notive above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.
package de.anomic.plasma.cache.http;
import java.util.Date;
import java.util.Map;
import de.anomic.http.httpHeader;
import de.anomic.index.indexURL;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaHTCache;
import de.anomic.plasma.cache.ResourceInfoFactory;
import de.anomic.plasma.cache.IResourceInfo;
import de.anomic.server.serverDate;
public class ResourceInfo implements IResourceInfo {
// URL of the resource described by this object; assigned by both constructors
private URL url;
// hash of the URL, computed by the constructors via indexURL.urlHash(url.toNormalform())
private String urlHash;
// http response header of the resource; both constructors leave it non-null
private httpHeader responseHeader;
// http request header; only assigned by the (URL, httpHeader, httpHeader) constructor,
// which accepts null for it, so this field may be null
private httpHeader requestHeader;
/**
 * Creates a resource info object from a metadata map. This is the
 * constructor used by the {@link ResourceInfoFactory}. The request
 * header is not set by this constructor.
 *
 * @param objectURL the URL of the resource, must not be <code>null</code>
 * @param objectInfo the resource metadata, must not be <code>null</code>
 * @throws NullPointerException if one of the arguments is <code>null</code>
 */
public ResourceInfo(URL objectURL, Map objectInfo) {
    if (objectURL == null || objectInfo == null) {
        throw new NullPointerException();
    }

    // remember the url and derive the url hash from its normal form
    this.url = objectURL;
    this.urlHash = indexURL.urlHash(objectURL.toNormalform());

    // wrap the metadata into a http header object
    this.responseHeader = new httpHeader(null, objectInfo);
}
/**
 * Creates a resource info object from the headers of a http request and
 * its response.
 *
 * @param objectURL the URL of the resource, must not be <code>null</code>
 * @param requestHeaders the request headers; not checked, may be <code>null</code>
 * @param responseHeaders the response headers, must not be <code>null</code>
 * @throws NullPointerException if the URL or the response headers are <code>null</code>
 */
public ResourceInfo(URL objectURL, httpHeader requestHeaders, httpHeader responseHeaders) {
    if (objectURL == null || responseHeaders == null) {
        throw new NullPointerException();
    }

    // remember the url and derive the url hash from its normal form
    this.url = objectURL;
    this.urlHash = indexURL.urlHash(objectURL.toNormalform());

    // keep both header objects as they are
    this.responseHeader = responseHeaders;
    this.requestHeader = requestHeaders;
}
/**
 * Hands out the response header object, which serves as the metadata map
 * of this resource. No copy is made.
 *
 * @see de.anomic.plasma.cache.IResourceInfo#getMap()
 */
public Map getMap() {
    final Map metadata = this.responseHeader;
    return metadata;
}
/**
 * Returns the bare mime type of the resource: the value reported by the
 * response header, without parameters (everything from the first
 * <code>';'</code> on is cut off), trimmed and in lower case.
 *
 * @return the mime type or <code>null</code> if no response header or no
 *         mime type is available
 * @see de.anomic.plasma.cache.IResourceInfo#getMimeType()
 */
public String getMimeType() {
    if (this.responseHeader == null) return null;

    String mimeType = this.responseHeader.mime();
    if (mimeType == null) return null;

    // cut off the parameters first, so that whitespace in front of the
    // ';' (e.g. "text/html ; charset=...") gets trimmed away as well
    final int pos = mimeType.indexOf(';');
    if (pos >= 0) mimeType = mimeType.substring(0, pos);

    // mime types are ASCII tokens; lower-case them independent of the
    // default locale (a turkish locale would turn 'I' into a dotless 'i')
    return mimeType.trim().toLowerCase(java.util.Locale.ENGLISH);
}
/**
 * Determines the modification date of the resource. The first available
 * value of the following list is used: the last-modified date of the
 * response header, the date of the response header, the corrected
 * current UTC time.
 *
 * @return the modification date, never <code>null</code>
 * @see de.anomic.plasma.cache.IResourceInfo#getModificationDate()
 */
public Date getModificationDate() {
    if (this.responseHeader != null) {
        final Date lastModified = this.responseHeader.lastModified();
        if (lastModified != null) return lastModified;

        final Date responseDate = this.responseHeader.date();
        if (responseDate != null) return responseDate;
    }

    // nothing usable in the header: fall back to "now"
    return new Date(serverDate.correctedUTCTime());
}
/**
 * Builds the referer URL from the referer field of the request header.
 *
 * @return the referer URL or <code>null</code> if there is no request
 *         header or if no URL could be created from the referer value
 * @see de.anomic.plasma.cache.IResourceInfo#getRefererUrl()
 */
public URL getRefererUrl() {
    URL referer = null;
    if (this.requestHeader != null) {
        try {
            final String refererValue = (String) this.requestHeader.get(httpHeader.REFERER, "");
            referer = new URL(refererValue);
        } catch (Exception e) {
            // the value could not be turned into an URL: report "no referer"
            referer = null;
        }
    }
    return referer;
}
/**
 * Returns the URL this object was constructed with.
 *
 * @see de.anomic.plasma.cache.IResourceInfo#getUrl()
 */
public URL getUrl() {
    return url;
}
/**
 * Returns the url hash that was computed from the URL at construction time.
 *
 * @see de.anomic.plasma.cache.IResourceInfo#getUrlHash()
 */
public String getUrlHash() {
    return urlHash;
}
/**
 * Decides by the mime type whether a crawled resource may be indexed:
 * pictures and everything that is not text are rejected.
 *
 * @return the reason why the resource must not be indexed or
 *         <code>null</code> if indexing is allowed
 * @see de.anomic.plasma.cache.IResourceInfo#shallIndexCacheForCrawler()
 */
public String shallIndexCacheForCrawler() {
    final String mime = getMimeType();

    String denyReason = null;
    if (plasmaHTCache.isPicture(mime)) {
        denyReason = "Media_Content_(Picture)";
    } else if (!plasmaHTCache.isText(mime)) {
        denyReason = "Media_Content_(not_text)";
    }
    return denyReason;
}
/**
 * Decides whether a resource that was loaded through the proxy may be
 * indexed. The checks are done in this order: mime type, if-modified-since
 * of the request versus last-modified of the response, pragma, expires
 * and cache-control of the response.
 *
 * @return a string describing the reason why the resource must not be
 *         indexed or <code>null</code> if indexing is allowed
 * @see de.anomic.plasma.cache.IResourceInfo#shallIndexCacheForProxy()
 */
public String shallIndexCacheForProxy() {
    // -set-cookie in response
    // the set-cookie from the server does not indicate that the content is special
    // thus we do not care about it here for indexing

    // a picture cannot be indexed
    final String mimeType = this.getMimeType();
    if (plasmaHTCache.isPicture(mimeType)) {
        return "Media_Content_(Picture)";
    }
    if (!plasmaHTCache.isText(mimeType)) {
        return "Media_Content_(not_text)";
    }

    // -if-modified-since in request
    // if the page is fresh at the very moment we can index it
    // The date has to be taken from the REQUEST header (which may be missing);
    // comparing against the modification date of the response would compare
    // the last-modified value with itself.
    final Date ifModifiedSince = (this.requestHeader == null) ? null : this.requestHeader.ifModifiedSince();
    if ((ifModifiedSince != null) && (this.responseHeader.containsKey(httpHeader.LAST_MODIFIED))) {
        // parse date
        Date d = this.responseHeader.lastModified();
        if (d == null) {
            d = new Date(serverDate.correctedUTCTime());
        }
        // finally, we shall treat the cache as stale if the modification time is after the if-.. time
        if (d.after(ifModifiedSince)) {
            return "Stale_(Last-Modified>Modified-Since)";
        }
    }

    // -pragma in cached response
    // header values are ASCII tokens: compare them trimmed and upper-cased
    // independent of the default locale
    final String pragma = (String) this.responseHeader.get(httpHeader.PRAGMA);
    if (pragma != null && pragma.trim().toUpperCase(java.util.Locale.ENGLISH).equals("NO-CACHE")) {
        return "Denied_(pragma_no_cache)";
    }

    // see for documentation also:
    // http://www.web-caching.com/cacheability.html

    // look for freshnes information

    // -expires in cached response
    // the expires value gives us a very easy hint when the cache is stale
    // sometimes, the expires date is set to the past to prevent that a page is cached
    // we use that information to see if we should index it
    final Date expires = this.responseHeader.expires();
    if (expires != null && expires.before(new Date(serverDate.correctedUTCTime()))) {
        return "Stale_(Expired)";
    }

    // -lastModified in cached response
    // this information is too weak to use it to prevent indexing
    // even if we can apply a TTL heuristic for cache usage

    // -cache-control in cached response
    // the cache-control has many value options.
    String cacheControl = (String) this.responseHeader.get(httpHeader.CACHE_CONTROL);
    if (cacheControl != null) {
        // locale independent: with a turkish default locale "private"
        // would not become "PRIVATE"
        cacheControl = cacheControl.trim().toUpperCase(java.util.Locale.ENGLISH);
        /* we have the following cases for cache-control:
           "public" -- can be indexed
           "private", "no-cache", "no-store" -- cannot be indexed
           "max-age=<delta-seconds>" -- stale/fresh dependent on date
         */
        if (cacheControl.startsWith("PRIVATE") ||
            cacheControl.startsWith("NO-CACHE") ||
            cacheControl.startsWith("NO-STORE")) {
            // easy case
            return "Stale_(denied_by_cache-control=" + cacheControl + ")";
        } else if (cacheControl.startsWith("MAX-AGE=")) {
            // we need also the load date
            final Date date = this.responseHeader.date();
            if (date == null) {
                return "Stale_(no_date_given_in_response)";
            }
            try {
                // further directives may follow the value ("max-age=60, public"):
                // only the part up to the first ',' is the number
                final int valueEnd = cacheControl.indexOf(',');
                final String maxAge = (valueEnd < 0) ? cacheControl.substring(8) : cacheControl.substring(8, valueEnd);
                final long ttl = 1000 * Long.parseLong(maxAge.trim()); // milliseconds to live
                if (serverDate.correctedUTCTime() - date.getTime() > ttl) {
                    return "Stale_(expired_by_cache-control)";
                }
            } catch (Exception e) {
                return "Error_(" + e.getMessage() + ")";
            }
        }
    }
    return null;
}
/**
 * Decides whether a resource that was loaded through the proxy may be
 * stored in the cache. Personalized requests (authorization), partial
 * content (ranges), a no-cache pragma in the response and an expired
 * max-age cache-control prevent storing.
 *
 * @return a short string naming the reason why the resource must not be
 *         stored or <code>null</code> if storing is allowed
 */
public String shallStoreCacheForProxy() {
    if (this.requestHeader != null) {
        // -authorization cases in request
        // authorization makes pages very individual, and therefore we cannot use the
        // content in the cache
        if (this.requestHeader.containsKey(httpHeader.AUTHORIZATION)) { return "personalized"; }

        // -ranges in request
        // we do not cache partial content
        if (this.requestHeader.containsKey(httpHeader.RANGE)) { return "partial"; }
    }

    if (this.responseHeader != null) {
        // -ranges in response
        // we do not cache partial content
        if (this.responseHeader.containsKey(httpHeader.CONTENT_RANGE)) { return "partial"; }

        // -if-modified-since in request
        // we do not care about if-modified-since, because this case only occurres if the
        // cache file does not exist, and we need as much info as possible for the indexing

        // -cookies in request
        // we do not care about cookies, because that would prevent loading more pages
        // from one domain once a request resulted in a client-side stored cookie

        // -set-cookie in response
        // we do not care about cookies in responses, because that info comes along
        // any/many pages from a server and does not express the validity of the page
        // in modes of life-time/expiration or individuality

        // -pragma in response
        // if we have a pragma non-cache, we don't cache. usually if this is wanted from
        // the server, it makes sense
        final String pragma = (String) this.responseHeader.get(httpHeader.PRAGMA);
        if (pragma != null && pragma.trim().toUpperCase(java.util.Locale.ENGLISH).equals("NO-CACHE")) { return "controlled_no_cache"; }

        // -expires in response
        // we do not care about expires, because at the time this is called the data is
        // obvious valid and that header info is used in the indexing later on

        // -cache-control in response
        // the cache-control has many value options.
        String cacheControl = (String) this.responseHeader.get(httpHeader.CACHE_CONTROL);
        if (cacheControl != null) {
            // header values are ASCII tokens: upper-case independent of the default locale
            cacheControl = cacheControl.trim().toUpperCase(java.util.Locale.ENGLISH);
            if (cacheControl.startsWith("MAX-AGE=")) {
                // we need also the load date
                final Date date = this.responseHeader.date();
                if (date == null) return "stale_no_date_given_in_response";
                try {
                    // further directives may follow the value ("max-age=60, public"):
                    // only the part up to the first ',' is the number
                    final int valueEnd = cacheControl.indexOf(',');
                    final String maxAge = (valueEnd < 0) ? cacheControl.substring(8) : cacheControl.substring(8, valueEnd);
                    final long ttl = 1000 * Long.parseLong(maxAge.trim()); // milliseconds to live
                    if (serverDate.correctedUTCTime() - date.getTime() > ttl) {
                        return "stale_expired";
                    }
                } catch (Exception e) {
                    return "stale_error_" + e.getMessage() + ")";
                }
            }
        }
    }
    return null;
}
/**
 * Decides whether the cached copy described by the stored request/response
 * headers may be delivered by the proxy instead of reloading the resource.
 *
 * <p>Every test below is a veto: the first one that fires makes the method
 * return {@code false}. Only if no test fires is the cache considered usable.
 *
 * <p>The request header is optional (it is null-checked here and in the
 * accessor methods of this class); when it is missing, all request-side tests
 * are skipped instead of failing with a {@code NullPointerException}.
 * NOTE(review): the response header is dereferenced without a null check,
 * exactly as before - confirm that it can never be null at this point.
 *
 * <p>Header values are upper-cased with a fixed locale so that directives
 * containing the letter 'i' (e.g. {@code private}) are still recognised when
 * the JVM default locale has special casing rules (e.g. Turkish).
 *
 * @return {@code true} if the cached resource may be used,
 *         {@code false} if it has to be treated as stale or not cacheable
 */
public boolean shallUseCacheForProxy() {
    String cacheControl;

    if (this.requestHeader != null) {
        // -authorization cases in request
        if (this.requestHeader.containsKey(httpHeader.AUTHORIZATION)) { return false; }

        // -ranges in request
        // we do not cache partial content
        if (this.requestHeader.containsKey(httpHeader.RANGE)) { return false; }

        // if the client requests an un-cached copy of the resource ...
        cacheControl = (String) this.requestHeader.get(httpHeader.PRAGMA);
        if (cacheControl != null
                && cacheControl.trim().toUpperCase(java.util.Locale.ENGLISH).equals("NO-CACHE")) {
            return false;
        }

        cacheControl = (String) this.requestHeader.get(httpHeader.CACHE_CONTROL);
        if (cacheControl != null) {
            cacheControl = cacheControl.trim().toUpperCase(java.util.Locale.ENGLISH);
            if (cacheControl.startsWith("NO-CACHE") || cacheControl.startsWith("MAX-AGE=0")) { return false; }
        }

        // -if-modified-since in request
        // The entity has to be transferred only if it has
        // been modified since the date given by the If-Modified-Since header.
        if (this.requestHeader.containsKey(httpHeader.IF_MODIFIED_SINCE)) {
            // checking this makes only sense if the cached response contains
            // a Last-Modified field. If the field does not exist, we go the safe way
            if (!this.responseHeader.containsKey(httpHeader.LAST_MODIFIED)) { return false; }

            // parse both dates; an unparseable date is replaced by the current time
            Date cachedModified = this.responseHeader.lastModified();
            if (cachedModified == null) { cachedModified = new Date(serverDate.correctedUTCTime()); }
            Date requestedSince = this.requestHeader.ifModifiedSince();
            if (requestedSince == null) { requestedSince = new Date(serverDate.correctedUTCTime()); }

            // the cache is stale if the modification time is after the if-modified-since time
            if (cachedModified.after(requestedSince)) { return false; }
        }
    }

    String mimeType = this.getMimeType();
    if (!plasmaHTCache.isPicture(mimeType)) {
        // -cookies in request
        // unfortunately, we should reload in case of a cookie
        // but we think that pictures can still be considered as fresh
        // -set-cookie in cached response
        // this is a similar case as for COOKIE.
        final boolean cookieInRequest =
                (this.requestHeader != null) && this.requestHeader.containsKey(httpHeader.COOKIE);
        if (cookieInRequest ||
            this.responseHeader.containsKey(httpHeader.SET_COOKIE) ||
            this.responseHeader.containsKey(httpHeader.SET_COOKIE2)) {
            return false; // too strong
        }
    }

    // -pragma in cached response
    // logically, we would not need to care about no-cache pragmas in cached response headers,
    // because they cannot exist since they are not written to the cache.
    // So this IF should always fail..
    cacheControl = (String) this.responseHeader.get(httpHeader.PRAGMA);
    if (cacheControl != null
            && cacheControl.trim().toUpperCase(java.util.Locale.ENGLISH).equals("NO-CACHE")) {
        return false;
    }

    // see for documentation also:
    // http://www.web-caching.com/cacheability.html
    // http://vancouver-webpages.com/CacheNow/

    // look for freshness information
    // if we don't have any freshness indication, we treat the file as stale.

    // -expires in cached response
    // the expires value gives us a very easy hint when the cache is stale
    Date expires = this.responseHeader.expires();
    if (expires != null) {
        if (expires.before(new Date(serverDate.correctedUTCTime()))) { return false; }
    }

    Date lastModified = this.responseHeader.lastModified();
    cacheControl = (String) this.responseHeader.get(httpHeader.CACHE_CONTROL);
    // no handle for freshness control at all: treat as stale
    if (cacheControl == null && lastModified == null && expires == null) { return false; }

    // -lastModified in cached response
    // we can apply a TTL (Time To Live) heuristic here. We call the time delta between the
    // response date and the last-modified date the age of the file. We grant no more than
    // 10% of that age as TTL, so that a file that was 10 months old when it was loaded
    // may only be treated as fresh for one more month, not more.
    Date date = this.responseHeader.date();
    if (lastModified != null) {
        if (date == null) { date = new Date(serverDate.correctedUTCTime()); }
        long age = date.getTime() - lastModified.getTime();
        // a modification date after the response date is inconsistent; do not trust the cache
        if (age < 0) { return false; }
        // the time lived so far is (now - date); the cache is stale if that exceeds age / 10
        if (serverDate.correctedUTCTime() - date.getTime() > age / 10) { return false; }
    }

    // -cache-control in cached response
    // the cache-control has many value options.
    if (cacheControl != null) {
        cacheControl = cacheControl.trim().toUpperCase(java.util.Locale.ENGLISH);
        if (cacheControl.startsWith("PRIVATE") ||
            cacheControl.startsWith("NO-CACHE") ||
            cacheControl.startsWith("NO-STORE")) {
            // easy case
            return false;
        } else if (cacheControl.startsWith("MAX-AGE=")) {
            // we need also the load date
            if (date == null) { return false; }
            try {
                // everything after "MAX-AGE=" must be a plain number of seconds
                final long ttl = 1000 * Long.parseLong(cacheControl.substring(8)); // milliseconds to live
                if (serverDate.correctedUTCTime() - date.getTime() > ttl) {
                    return false;
                }
            } catch (NumberFormatException e) {
                // unparseable max-age (e.g. followed by further directives): go the safe way
                return false;
            }
        }
    }
    return true;
}
/**
 * Checks whether a response status line denotes a status accepted by this class.
 *
 * @param responseStatus the status string; it is tested by prefix only
 * @return {@code true} if the status starts with "200" or with "203"
 */
public boolean validResponseStatus(String responseStatus) {
    if (responseStatus.startsWith("200")) {
        return true;
    }
    return responseStatus.startsWith("203");
}
/**
 * Returns the If-Modified-Since date as reported by the request header.
 *
 * @return the value of {@code requestHeader.ifModifiedSince()}, or
 *         {@code null} if there is no request header
 */
public Date ifModifiedSince() {
    if (this.requestHeader == null) {
        return null;
    }
    return this.requestHeader.ifModifiedSince();
}
/**
 * Tells whether the request carried a cookie header.
 *
 * @return {@code true} if a request header exists and contains the
 *         {@code httpHeader.COOKIE} key; {@code false} otherwise
 */
public boolean requestWithCookie() {
    return this.requestHeader != null
        && this.requestHeader.containsKey(httpHeader.COOKIE);
}
/**
 * Tells whether the request explicitly forbids indexing of the resource.
 *
 * <p>The comparison with "NO-INDEX" ignores case and does not depend on the
 * JVM default locale, so a value such as "no-index" is also recognised under
 * locales with special rules for the letter 'i' (e.g. Turkish). A
 * {@code null} value stored under the key counts as "not prohibited".
 *
 * @return {@code true} if a request header exists, contains the
 *         {@code httpHeader.X_YACY_INDEX_CONTROL} key and its value equals
 *         "NO-INDEX" ignoring case; {@code false} otherwise
 */
public boolean requestProhibitsIndexing() {
    if (this.requestHeader == null) {
        return false;
    }
    if (!this.requestHeader.containsKey(httpHeader.X_YACY_INDEX_CONTROL)) {
        return false;
    }
    final String indexControl = (String) this.requestHeader.get(httpHeader.X_YACY_INDEX_CONTROL);
    // constant first: null-safe; equalsIgnoreCase: independent of the default locale
    return "NO-INDEX".equalsIgnoreCase(indexControl);
}
/**
 * Gives access to the request header held by this object.
 *
 * @return the stored request header; the other methods of this class
 *         null-check it, so callers should expect {@code null} as well
 */
public httpHeader getRequestHeader() {
    return requestHeader;
}
/**
 * Gives access to the response header held by this object.
 *
 * @return the stored response header
 */
public httpHeader getResponseHeader() {
    return responseHeader;
}
}

@ -64,6 +64,8 @@ import de.anomic.plasma.plasmaCrawlLoader;
import de.anomic.plasma.plasmaHTCache; import de.anomic.plasma.plasmaHTCache;
import de.anomic.plasma.plasmaParser; import de.anomic.plasma.plasmaParser;
import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.cache.IResourceInfo;
import de.anomic.plasma.cache.http.ResourceInfo;
import de.anomic.plasma.crawler.AbstractCrawlWorker; import de.anomic.plasma.crawler.AbstractCrawlWorker;
import de.anomic.plasma.crawler.plasmaCrawlerPool; import de.anomic.plasma.crawler.plasmaCrawlerPool;
import de.anomic.plasma.urlPattern.plasmaURLPattern; import de.anomic.plasma.urlPattern.plasmaURLPattern;
@ -129,15 +131,15 @@ public final class CrawlWorker extends AbstractCrawlWorker {
return load(DEFAULT_CRAWLING_RETRY_COUNT); return load(DEFAULT_CRAWLING_RETRY_COUNT);
} }
protected plasmaHTCache.Entry createCacheEntry(Date requestDate, httpHeader requestHeader, httpc.response response) { protected plasmaHTCache.Entry createCacheEntry(URL requestUrl, Date requestDate, httpHeader requestHeader, httpc.response response) {
IResourceInfo resourceInfo = new ResourceInfo(requestUrl,requestHeader,response.responseHeader);
return this.cacheManager.newEntry( return this.cacheManager.newEntry(
requestDate, requestDate,
this.depth, this.depth,
this.url, this.url,
this.name, this.name,
requestHeader,
response.status, response.status,
response.responseHeader, resourceInfo,
this.initiator, this.initiator,
this.profile this.profile
); );
@ -197,7 +199,7 @@ public final class CrawlWorker extends AbstractCrawlWorker {
// the transfer is ok // the transfer is ok
// create a new cache entry // create a new cache entry
htCache = createCacheEntry(requestDate, requestHeader, res); htCache = createCacheEntry(this.url,requestDate, requestHeader, res);
// aborting download if content is to long ... // aborting download if content is to long ...
if (htCache.cacheFile().getAbsolutePath().length() > serverSystem.maxPathLength) { if (htCache.cacheFile().getAbsolutePath().length() > serverSystem.maxPathLength) {

@ -295,7 +295,7 @@ public final class plasmaCrawlStacker {
} }
// check if ip is local ip address // check if ip is local ip address
checkInterruption(); checkInterruption(); // TODO: this is protocol specific
InetAddress hostAddress = httpc.dnsResolve(nexturl.getHost()); InetAddress hostAddress = httpc.dnsResolve(nexturl.getHost());
if (hostAddress == null) { if (hostAddress == null) {
// if a http proxy is configured name resolution may not work // if a http proxy is configured name resolution may not work

@ -54,14 +54,12 @@
package de.anomic.plasma; package de.anomic.plasma;
import de.anomic.http.httpc; import de.anomic.http.httpc;
import de.anomic.http.httpHeader;
import de.anomic.index.indexEntryAttribute; import de.anomic.index.indexEntryAttribute;
import de.anomic.index.indexURL; import de.anomic.index.indexURL;
import de.anomic.kelondro.kelondroDyn; import de.anomic.kelondro.kelondroDyn;
import de.anomic.kelondro.kelondroMap; import de.anomic.kelondro.kelondroMap;
import de.anomic.kelondro.kelondroMScoreCluster; import de.anomic.kelondro.kelondroMScoreCluster;
import de.anomic.server.logging.serverLog; import de.anomic.server.logging.serverLog;
import de.anomic.server.serverDate;
import de.anomic.server.serverFileUtils; import de.anomic.server.serverFileUtils;
import de.anomic.server.serverInstantThread; import de.anomic.server.serverInstantThread;
import de.anomic.server.serverSystem; import de.anomic.server.serverSystem;
@ -73,6 +71,9 @@ import java.io.IOException;
import java.net.InetAddress; import java.net.InetAddress;
import java.net.MalformedURLException; import java.net.MalformedURLException;
import de.anomic.net.URL; import de.anomic.net.URL;
import de.anomic.plasma.cache.IResourceInfo;
import de.anomic.plasma.cache.ResourceInfoFactory;
import java.util.Date; import java.util.Date;
import java.util.HashSet; import java.util.HashSet;
import java.util.Iterator; import java.util.Iterator;
@ -87,7 +88,7 @@ public final class plasmaHTCache {
private static final int stackLimit = 150; // if we exceed that limit, we do not check idle private static final int stackLimit = 150; // if we exceed that limit, we do not check idle
public static final long oneday = 1000 * 60 * 60 * 24; // milliseconds of a day public static final long oneday = 1000 * 60 * 60 * 24; // milliseconds of a day
private kelondroMap responseHeaderDB = null; kelondroMap responseHeaderDB = null;
private final LinkedList cacheStack; private final LinkedList cacheStack;
private final TreeMap cacheAge; // a <date+hash, cache-path> - relation private final TreeMap cacheAge; // a <date+hash, cache-path> - relation
public long curCacheSize; public long curCacheSize;
@ -96,12 +97,17 @@ public final class plasmaHTCache {
public final serverLog log; public final serverLog log;
public static final HashSet filesInUse = new HashSet(); // can we delete this file public static final HashSet filesInUse = new HashSet(); // can we delete this file
private ResourceInfoFactory objFactory;
public plasmaHTCache(File htCachePath, long maxCacheSize, int bufferkb, long preloadTime) { public plasmaHTCache(File htCachePath, long maxCacheSize, int bufferkb, long preloadTime) {
// this.switchboard = switchboard; // this.switchboard = switchboard;
this.log = new serverLog("HTCACHE"); this.log = new serverLog("HTCACHE");
this.cachePath = htCachePath; this.cachePath = htCachePath;
// create the object factory
this.objFactory = new ResourceInfoFactory();
// reset old HTCache ? // reset old HTCache ?
String[] list = this.cachePath.list(); String[] list = this.cachePath.list();
if (list != null) { if (list != null) {
@ -229,10 +235,6 @@ public final class plasmaHTCache {
} }
} }
public void storeHeader(String urlHash, httpHeader responseHeader) throws IOException {
this.responseHeaderDB.set(urlHash, responseHeader);
}
/** /**
* This method changes the HTCache size.<br> * This method changes the HTCache size.<br>
* @param new cache size in bytes * @param new cache size in bytes
@ -249,7 +251,7 @@ public final class plasmaHTCache {
return (this.curCacheSize >= this.maxCacheSize) ? 0 : this.maxCacheSize - this.curCacheSize; return (this.curCacheSize >= this.maxCacheSize) ? 0 : this.maxCacheSize - this.curCacheSize;
} }
public boolean writeFile(URL url, byte[] array) { public boolean writeResourceContent(URL url, byte[] array) {
if (array == null) return false; if (array == null) return false;
File file = getCachePath(url); File file = getCachePath(url);
try { try {
@ -445,10 +447,24 @@ public final class plasmaHTCache {
return prefix + s.substring(0, p); return prefix + s.substring(0, p);
} }
public httpHeader getCachedResponse(String urlHash) throws IOException { /**
* Returns an object containing metadata about a cached resource
* @param url the url of the resource
* @return an {@link IResourceInfo info object}
* @throws Exception of the info object could not be created, e.g. if the protocol is not supported
*/
public IResourceInfo loadResourceInfo(URL url) throws Exception {
// getting the URL hash
String urlHash = indexURL.urlHash(url.toNormalform());
// loading data from database
Map hdb = this.responseHeaderDB.get(urlHash); Map hdb = this.responseHeaderDB.get(urlHash);
if (hdb == null) return null; if (hdb == null) return null;
return new httpHeader(null, hdb);
// generate the cached object
IResourceInfo cachedObj = this.objFactory.buildResourceInfoObj(url, hdb);
return cachedObj;
} }
public boolean full() { public boolean full() {
@ -459,18 +475,17 @@ public final class plasmaHTCache {
return (this.cacheStack.size() == 0); return (this.cacheStack.size() == 0);
} }
public static boolean isPicture(httpHeader response) { public static boolean isPicture(String mimeType) {
Object ct = response.get(httpHeader.CONTENT_TYPE); if (mimeType == null) return false;
if (ct == null) return false; return mimeType.toUpperCase().startsWith("IMAGE");
return ((String)ct).toUpperCase().startsWith("IMAGE");
} }
public static boolean isText(httpHeader response) { public static boolean isText(String mimeType) {
// Object ct = response.get(httpHeader.CONTENT_TYPE); // Object ct = response.get(httpHeader.CONTENT_TYPE);
// if (ct == null) return false; // if (ct == null) return false;
// String t = ((String)ct).toLowerCase(); // String t = ((String)ct).toLowerCase();
// return ((t.startsWith("text")) || (t.equals("application/xhtml+xml"))); // return ((t.startsWith("text")) || (t.equals("application/xhtml+xml")));
return plasmaParser.supportedMimeTypesContains(response.mime()); return plasmaParser.supportedMimeTypesContains(mimeType);
} }
public static boolean noIndexingURL(String urlString) { public static boolean noIndexingURL(String urlString) {
@ -568,9 +583,8 @@ public final class plasmaHTCache {
} }
if (port < 0) { if (port < 0) {
return new File(this.cachePath, protocol + "/" + host + path); return new File(this.cachePath, protocol + "/" + host + path);
} else {
return new File(this.cachePath, protocol + "/" + host + "!" + port + path);
} }
return new File(this.cachePath, protocol + "/" + host + "!" + port + path);
} }
/** /**
@ -663,7 +677,7 @@ public final class plasmaHTCache {
return null; return null;
} }
public byte[] loadResource(URL url) { public byte[] loadResourceContent(URL url) {
// load the url as resource from the cache // load the url as resource from the cache
File f = getCachePath(url); File f = getCachePath(url);
if (f.exists()) try { if (f.exists()) try {
@ -690,12 +704,30 @@ public final class plasmaHTCache {
(ls.indexOf("memberlist.php?sid=") >= 0)); (ls.indexOf("memberlist.php?sid=") >= 0));
} }
public Entry newEntry(Date initDate, int depth, URL url, String name, public Entry newEntry(
httpHeader requestHeader, Date initDate,
String responseStatus, httpHeader responseHeader, int depth,
URL url,
String name,
//httpHeader requestHeader,
String responseStatus,
//httpHeader responseHeader,
IResourceInfo docInfo,
String initiator, String initiator,
plasmaCrawlProfile.entry profile) { plasmaCrawlProfile.entry profile
return new Entry(initDate, depth, url, name, requestHeader, responseStatus, responseHeader, initiator, profile); ) {
return new Entry(
initDate,
depth,
url,
name,
//requestHeader,
responseStatus,
//responseHeader,
docInfo,
initiator,
profile
);
} }
public final class Entry { public final class Entry {
@ -703,9 +735,9 @@ public final class plasmaHTCache {
// the class objects // the class objects
private Date initDate; // the date when the request happened; will be used as a key private Date initDate; // the date when the request happened; will be used as a key
private int depth; // the depth of prefetching private int depth; // the depth of prefetching
private httpHeader requestHeader; // we carry also the header to prevent too many file system access // private httpHeader requestHeader; // we carry also the header to prevent too many file system access
// private httpHeader responseHeader; // we carry also the header to prevent too many file system access
private String responseStatus; private String responseStatus;
private httpHeader responseHeader; // we carry also the header to prevent too many file system access
private File cacheFile; // the cache file private File cacheFile; // the cache file
private byte[] cacheArray; // or the cache as byte-array private byte[] cacheArray; // or the cache as byte-array
private URL url; private URL url;
@ -719,15 +751,21 @@ public final class plasmaHTCache {
private plasmaCrawlProfile.entry profile; private plasmaCrawlProfile.entry profile;
private String initiator; private String initiator;
/**
* protocolspecific information about the resource
*/
private IResourceInfo resInfo;
protected Object clone() throws CloneNotSupportedException { protected Object clone() throws CloneNotSupportedException {
return new Entry( return new Entry(
this.initDate, this.initDate,
this.depth, this.depth,
this.url, this.url,
this.name, this.name,
this.requestHeader, //this.requestHeader,
this.responseStatus, this.responseStatus,
this.responseHeader, //this.responseHeader,
this.resInfo,
this.initiator, this.initiator,
this.profile this.profile
); );
@ -737,15 +775,21 @@ public final class plasmaHTCache {
int depth, int depth,
URL url, URL url,
String name, String name,
httpHeader requestHeader, //httpHeader requestHeader,
String responseStatus, String responseStatus,
httpHeader responseHeader, //httpHeader responseHeader,
IResourceInfo resourceInfo,
String initiator, String initiator,
plasmaCrawlProfile.entry profile plasmaCrawlProfile.entry profile
) { ) {
if (resourceInfo == null){
System.out.println("Content information object is null. " + url);
System.exit(0);
}
this.resInfo = resourceInfo;
// normalize url // normalize url
// serverLog.logFine("PLASMA", "Entry: URL=" + url.toString());
this.nomalizedURLString = url.toNormalform(); this.nomalizedURLString = url.toNormalform();
try { try {
@ -761,28 +805,17 @@ public final class plasmaHTCache {
// assigned: // assigned:
this.initDate = initDate; this.initDate = initDate;
this.depth = depth; this.depth = depth;
this.requestHeader = requestHeader; //this.requestHeader = requestHeader;
this.responseStatus = responseStatus; this.responseStatus = responseStatus;
this.responseHeader = responseHeader; //this.responseHeader = responseHeader;
this.profile = profile; this.profile = profile;
this.initiator = (initiator == null) ? null : ((initiator.length() == 0) ? null : initiator); this.initiator = (initiator == null) ? null : ((initiator.length() == 0) ? null : initiator);
// calculated: // getting the last modified date
if (responseHeader == null) { this.lastModified = resourceInfo.getModificationDate();
try {
throw new RuntimeException("RESPONSE HEADER = NULL");
} catch (Exception e) {
System.out.println("RESPONSE HEADER = NULL in " + url);
e.printStackTrace();
System.exit(0);
}
this.lastModified = new Date(serverDate.correctedUTCTime()); // getting the doctype
} else { this.doctype = indexEntryAttribute.docType(resourceInfo.getMimeType());
this.lastModified = responseHeader.lastModified();
if (this.lastModified == null) this.lastModified = new Date(serverDate.correctedUTCTime()); // does not exist in header
}
this.doctype = indexEntryAttribute.docType(responseHeader.mime());
if (this.doctype == indexEntryAttribute.DT_UNKNOWN) this.doctype = indexEntryAttribute.docType(url); if (this.doctype == indexEntryAttribute.DT_UNKNOWN) this.doctype = indexEntryAttribute.docType(url);
this.language = indexEntryAttribute.language(url); this.language = indexEntryAttribute.language(url);
@ -822,12 +855,7 @@ public final class plasmaHTCache {
} }
public URL referrerURL() { public URL referrerURL() {
if (this.requestHeader == null) return null; return (this.resInfo==null)?null:this.resInfo.getRefererUrl();
try {
return new URL((String) this.requestHeader.get(httpHeader.REFERER, ""));
} catch (Exception e) {
return null;
}
} }
public File cacheFile() { public File cacheFile() {
@ -846,27 +874,36 @@ public final class plasmaHTCache {
// return this.requestHeader; // return this.requestHeader;
// } // }
public httpHeader responseHeader() { // public httpHeader responseHeader() {
return this.responseHeader; // return this.responseHeader;
// }
public IResourceInfo getDocumentInfo() {
return this.resInfo;
}
public boolean writeResourceInfo() throws IOException {
assert(this.nomalizedURLHash != null) : "URL Hash is null";
if (this.resInfo == null) return false;
plasmaHTCache.this.responseHeaderDB.set(this.nomalizedURLHash, this.resInfo.getMap());
return true;
} }
public String getMimeType() { public String getMimeType() {
return (this.responseHeader == null) ? null : this.responseHeader.mime(); return (this.resInfo == null) ? null : this.resInfo.getMimeType();
} }
public Date ifModifiedSince() { public Date ifModifiedSince() {
return (this.requestHeader == null) ? null : this.requestHeader.ifModifiedSince(); return (this.resInfo == null) ? null : this.resInfo.ifModifiedSince();
} }
public boolean requestWithCookie() { public boolean requestWithCookie() {
return (this.requestHeader == null) ? false : this.requestHeader.containsKey(httpHeader.COOKIE); return (this.resInfo == null) ? false : this.resInfo.requestWithCookie();
} }
public boolean requestProhibitsIndexing() { public boolean requestProhibitsIndexing() {
return (this.requestHeader == null) return (this.resInfo == null) ? false : this.resInfo.requestProhibitsIndexing();
? false
: this.requestHeader.containsKey(httpHeader.X_YACY_INDEX_CONTROL) &&
((String)this.requestHeader.get(httpHeader.X_YACY_INDEX_CONTROL)).toUpperCase().equals("NO-INDEX");
} }
/* /*
@ -878,9 +915,10 @@ public final class plasmaHTCache {
// the following three methods for cache read/write granting shall be as loose as possible // the following three methods for cache read/write granting shall be as loose as possible
// but also as strict as necessary to enable caching of most items // but also as strict as necessary to enable caching of most items
/**
* @return NULL if the answer is TRUE, in case of FALSE, the reason as String is returned
*/
public String shallStoreCacheForProxy() { public String shallStoreCacheForProxy() {
// returns NULL if the answer is TRUE
// in case of FALSE, the reason as String is returned
// check profile (disabled: we will check this in the plasmaSwitchboard) // check profile (disabled: we will check this in the plasmaSwitchboard)
//if (!this.profile.storeHTCache()) { return "storage_not_wanted"; } //if (!this.profile.storeHTCache()) { return "storage_not_wanted"; }
@ -889,8 +927,11 @@ public final class plasmaHTCache {
// if the storage was requested by prefetching, the request map is null // if the storage was requested by prefetching, the request map is null
// check status code // check status code
if (!(this.responseStatus.startsWith("200") || if ((this.resInfo != null) && (!this.resInfo.validResponseStatus(this.responseStatus))) {
this.responseStatus.startsWith("203"))) { return "bad_status_" + this.responseStatus.substring(0,3); } return "bad_status_" + this.responseStatus.substring(0,3);
}
// if (!(this.responseStatus.startsWith("200") ||
// this.responseStatus.startsWith("203"))) { return "bad_status_" + this.responseStatus.substring(0,3); }
// check storage location // check storage location
// sometimes a file name is equal to a path name in the same directory; // sometimes a file name is equal to a path name in the same directory;
@ -905,62 +946,10 @@ public final class plasmaHTCache {
if (isPOST(this.nomalizedURLString) && !this.profile.crawlingQ()) { return "dynamic_post"; } if (isPOST(this.nomalizedURLString) && !this.profile.crawlingQ()) { return "dynamic_post"; }
if (isCGI(this.nomalizedURLString)) { return "dynamic_cgi"; } if (isCGI(this.nomalizedURLString)) { return "dynamic_cgi"; }
if (this.requestHeader != null) { if (this.resInfo != null) {
// -authorization cases in request return this.resInfo.shallStoreCacheForProxy();
// authorization makes pages very individual, and therefore we cannot use the
// content in the cache
if (this.requestHeader.containsKey(httpHeader.AUTHORIZATION)) { return "personalized"; }
// -ranges in request and response
// we do not cache partial content
if (this.requestHeader.containsKey(httpHeader.RANGE)) { return "partial"; }
}
// -ranges in request and response
// we do not cache partial content
if (this.responseHeader != null && this.responseHeader.containsKey(httpHeader.CONTENT_RANGE)) { return "partial"; }
// -if-modified-since in request
// we do not care about if-modified-since, because this case only occurres if the
// cache file does not exist, and we need as much info as possible for the indexing
// -cookies in request
// we do not care about cookies, because that would prevent loading more pages
// from one domain once a request resulted in a client-side stored cookie
// -set-cookie in response
// we do not care about cookies in responses, because that info comes along
// any/many pages from a server and does not express the validity of the page
// in modes of life-time/expiration or individuality
// -pragma in response
// if we have a pragma non-cache, we don't cache. usually if this is wanted from
// the server, it makes sense
String cacheControl = (String) this.responseHeader.get(httpHeader.PRAGMA);
if (cacheControl != null && cacheControl.trim().toUpperCase().equals("NO-CACHE")) { return "controlled_no_cache"; }
// -expires in response
// we do not care about expires, because at the time this is called the data is
// obvious valid and that header info is used in the indexing later on
// -cache-control in response
// the cache-control has many value options.
cacheControl = (String) this.responseHeader.get(httpHeader.CACHE_CONTROL);
if (cacheControl != null) {
cacheControl = cacheControl.trim().toUpperCase();
if (cacheControl.startsWith("MAX-AGE=")) {
// we need also the load date
Date date = this.responseHeader.date();
if (date == null) return "stale_no_date_given_in_response";
try {
long ttl = 1000 * Long.parseLong(cacheControl.substring(8)); // milliseconds to live
if (serverDate.correctedUTCTime() - date.getTime() > ttl) {
//System.out.println("***not indexed because cache-control");
return "stale_expired";
}
} catch (Exception e) {
return "stale_error_" + e.getMessage() + ")";
}
}
} }
return null; return null;
} }
@ -971,146 +960,17 @@ public final class plasmaHTCache {
public boolean shallUseCacheForProxy() { public boolean shallUseCacheForProxy() {
// System.out.println("SHALL READ CACHE: requestHeader = " + requestHeader.toString() + ", responseHeader = " + responseHeader.toString()); // System.out.println("SHALL READ CACHE: requestHeader = " + requestHeader.toString() + ", responseHeader = " + responseHeader.toString());
String cacheControl;
if (this.requestHeader != null) {
// -authorization cases in request
if (this.requestHeader.containsKey(httpHeader.AUTHORIZATION)) { return false; }
// -ranges in request
// we do not cache partial content
if (this.requestHeader.containsKey(httpHeader.RANGE)) { return false; }
// if the client requests a un-cached copy of the resource ...
cacheControl = (String) this.requestHeader.get(httpHeader.PRAGMA);
if (cacheControl != null && cacheControl.trim().toUpperCase().equals("NO-CACHE")) { return false; }
cacheControl = (String) this.requestHeader.get(httpHeader.CACHE_CONTROL);
if (cacheControl != null) {
cacheControl = cacheControl.trim().toUpperCase();
if (cacheControl.startsWith("NO-CACHE") || cacheControl.startsWith("MAX-AGE=0")) { return false; }
}
}
// -CGI access in request // -CGI access in request
// CGI access makes the page very individual, and therefore not usable in caches // CGI access makes the page very individual, and therefore not usable in caches
if (isPOST(this.nomalizedURLString)) { return false; } if (isPOST(this.nomalizedURLString)) { return false; }
if (isCGI(this.nomalizedURLString)) { return false; } if (isCGI(this.nomalizedURLString)) { return false; }
// -if-modified-since in request if (this.resInfo != null) {
// The entity has to be transferred only if it has return this.resInfo.shallUseCacheForProxy();
// been modified since the date given by the If-Modified-Since header.
if (this.requestHeader.containsKey(httpHeader.IF_MODIFIED_SINCE)) {
// checking this makes only sense if the cached response contains
// a Last-Modified field. If the field does not exist, we go the safe way
if (!this.responseHeader.containsKey(httpHeader.LAST_MODIFIED)) { return false; }
// parse date
Date d1, d2;
d2 = this.responseHeader.lastModified(); if (d2 == null) { d2 = new Date(serverDate.correctedUTCTime()); }
d1 = this.requestHeader.ifModifiedSince(); if (d1 == null) { d1 = new Date(serverDate.correctedUTCTime()); }
// finally, we shall treat the cache as stale if the modification time is after the if-.. time
if (d2.after(d1)) { return false; }
}
if (!isPicture(this.responseHeader)) {
// -cookies in request
// unfortunately, we should reload in case of a cookie
// but we think that pictures can still be considered as fresh
// -set-cookie in cached response
// this is a similar case as for COOKIE.
if (this.requestHeader.containsKey(httpHeader.COOKIE) ||
this.responseHeader.containsKey(httpHeader.SET_COOKIE) ||
this.responseHeader.containsKey(httpHeader.SET_COOKIE2)) {
return false; // too strong
}
}
// -pragma in cached response
// logically, we would not need to care about no-cache pragmas in cached response headers,
// because they cannot exist since they are not written to the cache.
// So this IF should always fail..
cacheControl = (String) this.responseHeader.get(httpHeader.PRAGMA);
if (cacheControl != null && cacheControl.trim().toUpperCase().equals("NO-CACHE")) { return false; }
// see for documentation also:
// http://www.web-caching.com/cacheability.html
// http://vancouver-webpages.com/CacheNow/
// look for freshnes information
// if we don't have any freshnes indication, we treat the file as stale.
// no handle for freshness control:
// -expires in cached response
// the expires value gives us a very easy hint when the cache is stale
Date expires = this.responseHeader.expires();
if (expires != null) {
// System.out.println("EXPIRES-TEST: expires=" + expires + ", NOW=" + serverDate.correctedGMTDate() + ", url=" + url);
if (expires.before(new Date(serverDate.correctedUTCTime()))) { return false; }
}
Date lastModified = this.responseHeader.lastModified();
cacheControl = (String) this.responseHeader.get(httpHeader.CACHE_CONTROL);
if (cacheControl == null && lastModified == null && expires == null) { return false; }
// -lastModified in cached response
// we can apply a TTL (Time To Live) heuristic here. We call the time delta between the last read
// of the file and the last modified date as the age of the file. If we consider the file as
// middel-aged then, the maximum TTL would be cache-creation plus age.
// This would be a TTL factor of 100% we want no more than 10% TTL, so that a 10 month old cache
// file may only be treated as fresh for one more month, not more.
Date date = this.responseHeader.date();
if (lastModified != null) {
if (date == null) { date = new Date(serverDate.correctedUTCTime()); }
long age = date.getTime() - lastModified.getTime();
if (age < 0) { return false; }
// TTL (Time-To-Live) is age/10 = (d2.getTime() - d1.getTime()) / 10
// the actual living-time is serverDate.correctedGMTDate().getTime() - d2.getTime()
// therefore the cache is stale, if serverDate.correctedGMTDate().getTime() - d2.getTime() > age/10
if (serverDate.correctedUTCTime() - date.getTime() > age / 10) { return false; }
}
// -cache-control in cached response
// the cache-control has many value options.
if (cacheControl != null) {
cacheControl = cacheControl.trim().toUpperCase();
if (cacheControl.startsWith("PRIVATE") ||
cacheControl.startsWith("NO-CACHE") ||
cacheControl.startsWith("NO-STORE")) {
// easy case
return false;
// } else if (cacheControl.startsWith("PUBLIC")) {
// // ok, do nothing
} else if (cacheControl.startsWith("MAX-AGE=")) {
// we need also the load date
if (date == null) { return false; }
try {
final long ttl = 1000 * Long.parseLong(cacheControl.substring(8)); // milliseconds to live
if (serverDate.correctedUTCTime() - date.getTime() > ttl) {
return false;
}
} catch (Exception e) {
return false;
}
}
} }
return true; return true;
} }
} // class Entry } // class Entry
/*
public static void main(String[] args) {
//String[] s = TimeZone.getAvailableIDs();
//for (int i = 0; i < s.length; i++) System.out.println("ZONE=" + s[i]);
Calendar c = GregorianCalendar.getInstance();
int zoneOffset = c.get(Calendar.ZONE_OFFSET)/(60*60*1000);
int DSTOffset = c.get(Calendar.DST_OFFSET)/(60*60*1000);
System.out.println("This Offset = " + (zoneOffset + DSTOffset));
for (int i = 0; i < 12; i++) {
c = new GregorianCalendar(TimeZone.getTimeZone("Etc/GMT-" + i));
//c.setTimeZone(TimeZone.getTimeZone("Etc/GMT+0"));
System.out.println("Zone offset: "+
c.get(Calendar.ZONE_OFFSET)/(60*60*1000));
System.out.println(c.get(GregorianCalendar.HOUR) + ", " + c.getTime() + ", " + c.getTimeInMillis());
}
}
**/
} }

@ -44,6 +44,7 @@ package de.anomic.plasma;
import java.io.IOException; import java.io.IOException;
import de.anomic.net.URL; import de.anomic.net.URL;
import de.anomic.plasma.cache.IResourceInfo;
import de.anomic.plasma.crawler.http.CrawlWorker; import de.anomic.plasma.crawler.http.CrawlWorker;
import java.util.Enumeration; import java.util.Enumeration;
@ -167,15 +168,15 @@ public class plasmaSnippetCache {
// if the snippet is not in the cache, we can try to get it from the htcache // if the snippet is not in the cache, we can try to get it from the htcache
byte[] resource = null; byte[] resource = null;
httpHeader header = null; IResourceInfo docInfo = null;
try { try {
resource = cacheManager.loadResource(url); resource = this.cacheManager.loadResourceContent(url);
if ((fetchOnline) && (resource == null)) { if ((fetchOnline) && (resource == null)) {
plasmaHTCache.Entry entry = loadResourceFromWeb(url, 5000); plasmaHTCache.Entry entry = loadResourceFromWeb(url, 5000);
if (entry != null) { if (entry != null) {
header = entry.responseHeader(); docInfo = entry.getDocumentInfo();
} }
resource = cacheManager.loadResource(url); resource = this.cacheManager.loadResourceContent(url);
source = SOURCE_WEB; source = SOURCE_WEB;
} }
} catch (IOException e) { } catch (IOException e) {
@ -185,7 +186,7 @@ public class plasmaSnippetCache {
//System.out.println("cannot load document for URL " + url); //System.out.println("cannot load document for URL " + url);
return new result(null, ERROR_RESOURCE_LOADING, "error loading resource from web, cacheManager returned NULL"); return new result(null, ERROR_RESOURCE_LOADING, "error loading resource from web, cacheManager returned NULL");
} }
plasmaParserDocument document = parseDocument(url, resource, header); plasmaParserDocument document = parseDocument(url, resource, docInfo);
if (document == null) return new result(null, ERROR_PARSER_FAILED, "parser error/failed"); // cannot be parsed if (document == null) return new result(null, ERROR_PARSER_FAILED, "parser error/failed"); // cannot be parsed
//System.out.println("loaded document for URL " + url); //System.out.println("loaded document for URL " + url);
@ -360,18 +361,18 @@ public class plasmaSnippetCache {
return parseDocument(url, resource, null); return parseDocument(url, resource, null);
} }
public plasmaParserDocument parseDocument(URL url, byte[] resource, httpHeader header) { public plasmaParserDocument parseDocument(URL url, byte[] resource, IResourceInfo docInfo) {
try { try {
if (resource == null) return null; if (resource == null) return null;
// try to get the header from the htcache directory // try to get the header from the htcache directory
if (header == null) { if (docInfo == null) {
try { try {
header = this.cacheManager.getCachedResponse(indexURL.urlHash(url)); docInfo = this.cacheManager.loadResourceInfo(url);
} catch (IOException e) {} } catch (Exception e) {}
} }
if (header == null) { if (docInfo == null) {
String filename = this.cacheManager.getCachePath(url).getName(); String filename = this.cacheManager.getCachePath(url).getName();
int p = filename.lastIndexOf('.'); int p = filename.lastIndexOf('.');
if ( // if no extension is available if ( // if no extension is available
@ -394,8 +395,8 @@ public class plasmaSnippetCache {
} }
return null; return null;
} }
if (plasmaParser.supportedMimeTypesContains(header.mime())) { if (plasmaParser.supportedMimeTypesContains(docInfo.getMimeType())) {
return this.parser.parseSource(url, header.mime(), resource); return this.parser.parseSource(url, docInfo.getMimeType(), resource);
} }
return null; return null;
} catch (InterruptedException e) { } catch (InterruptedException e) {
@ -407,10 +408,10 @@ public class plasmaSnippetCache {
public byte[] getResource(URL url, boolean fetchOnline, int socketTimeout) { public byte[] getResource(URL url, boolean fetchOnline, int socketTimeout) {
// load the url as resource from the web // load the url as resource from the web
try { try {
byte[] resource = cacheManager.loadResource(url); byte[] resource = cacheManager.loadResourceContent(url);
if ((fetchOnline) && (resource == null)) { if ((fetchOnline) && (resource == null)) {
loadResourceFromWeb(url, (socketTimeout < 0) ? -1 : socketTimeout); loadResourceFromWeb(url, (socketTimeout < 0) ? -1 : socketTimeout);
resource = cacheManager.loadResource(url); resource = cacheManager.loadResourceContent(url);
} }
return resource; return resource;
} catch (IOException e) { } catch (IOException e) {

@ -829,7 +829,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
/* ========================================================================= /* =========================================================================
* LOCAL IP ADDRESS CHECK * LOCAL IP ADDRESS CHECK
* *
* check if ip is local ip address * check if ip is local ip address // TODO: remove this protocol-specific code here
* ========================================================================= */ * ========================================================================= */
InetAddress hostAddress = httpc.dnsResolve(entry.url().getHost()); InetAddress hostAddress = httpc.dnsResolve(entry.url().getHost());
if (hostAddress == null) { if (hostAddress == null) {
@ -857,8 +857,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
(doIndexing && isSupportedContent) (doIndexing && isSupportedContent)
) { ) {
// store response header // store response header
if (entry.responseHeader() != null) { if (entry.writeResourceInfo()) {
this.cacheManager.storeHeader(entry.urlHash(), entry.responseHeader());
this.log.logInfo("WROTE HEADER for " + entry.cacheFile()); this.log.logInfo("WROTE HEADER for " + entry.cacheFile());
} }
@ -868,7 +867,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
} else { } else {
String error = entry.shallStoreCacheForProxy(); String error = entry.shallStoreCacheForProxy();
if (error == null) { if (error == null) {
this.cacheManager.writeFile(entry.url(), entry.cacheArray()); this.cacheManager.writeResourceContent(entry.url(), entry.cacheArray());
this.log.logFine("WROTE FILE (" + entry.cacheArray().length + " bytes) for " + entry.cacheFile()); this.log.logFine("WROTE FILE (" + entry.cacheArray().length + " bytes) for " + entry.cacheFile());
} else { } else {
this.log.logFine("WRITE OF FILE " + entry.cacheFile() + " FORBIDDEN: " + error); this.log.logFine("WRITE OF FILE " + entry.cacheFile() + " FORBIDDEN: " + error);

@ -44,28 +44,27 @@
package de.anomic.plasma; package de.anomic.plasma;
import de.anomic.http.httpHeader; import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Date;
import de.anomic.index.indexURL; import de.anomic.index.indexURL;
import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroException; import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroStack;
import de.anomic.kelondro.kelondroRow; import de.anomic.kelondro.kelondroRow;
import de.anomic.kelondro.kelondroStack;
import de.anomic.net.URL;
import de.anomic.plasma.cache.IResourceInfo;
import de.anomic.server.logging.serverLog; import de.anomic.server.logging.serverLog;
import de.anomic.server.serverDate;
import de.anomic.yacy.yacySeedDB; import de.anomic.yacy.yacySeedDB;
import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import de.anomic.net.URL;
import java.util.ArrayList;
import java.util.Date;
public class plasmaSwitchboardQueue { public class plasmaSwitchboardQueue {
private kelondroStack sbQueueStack; private kelondroStack sbQueueStack;
private plasmaCrawlProfile profiles; private plasmaCrawlProfile profiles;
private plasmaHTCache htCache; plasmaHTCache htCache;
private plasmaCrawlLURL lurls; private plasmaCrawlLURL lurls;
private File sbQueueStackPath; private File sbQueueStackPath;
@ -191,7 +190,7 @@ public class plasmaSwitchboardQueue {
// computed values // computed values
private plasmaCrawlProfile.entry profileEntry; private plasmaCrawlProfile.entry profileEntry;
private httpHeader responseHeader; private IResourceInfo contentInfo;
private URL referrerURL; private URL referrerURL;
public Entry(URL url, String referrer, Date ifModifiedSince, boolean requestWithCookie, public Entry(URL url, String referrer, Date ifModifiedSince, boolean requestWithCookie,
@ -206,7 +205,7 @@ public class plasmaSwitchboardQueue {
this.anchorName = (anchorName==null)?"":anchorName.trim(); this.anchorName = (anchorName==null)?"":anchorName.trim();
this.profileEntry = null; this.profileEntry = null;
this.responseHeader = null; this.contentInfo = null;
this.referrerURL = null; this.referrerURL = null;
} }
@ -227,7 +226,7 @@ public class plasmaSwitchboardQueue {
this.anchorName = row.getColString(7, "UTF-8"); this.anchorName = row.getColString(7, "UTF-8");
this.profileEntry = null; this.profileEntry = null;
this.responseHeader = null; this.contentInfo = null;
this.referrerURL = null; this.referrerURL = null;
} }
@ -248,7 +247,7 @@ public class plasmaSwitchboardQueue {
this.anchorName = (row[7] == null) ? null : (new String(row[7], "UTF-8")).trim(); this.anchorName = (row[7] == null) ? null : (new String(row[7], "UTF-8")).trim();
this.profileEntry = null; this.profileEntry = null;
this.responseHeader = null; this.contentInfo = null;
this.referrerURL = null; this.referrerURL = null;
} }
@ -306,32 +305,24 @@ public class plasmaSwitchboardQueue {
return profileEntry; return profileEntry;
} }
private httpHeader responseHeader() { private IResourceInfo getCachedObjectInfo() {
if (responseHeader == null) try { if (this.contentInfo == null) try {
responseHeader = htCache.getCachedResponse(indexURL.urlHash(url)); this.contentInfo = plasmaSwitchboardQueue.this.htCache.loadResourceInfo(this.url);
} catch (IOException e) { } catch (Exception e) {
serverLog.logSevere("PLASMA", "responseHeader: failed to get header", e); serverLog.logSevere("PLASMA", "responseHeader: failed to get header", e);
return null; return null;
} }
return responseHeader; return this.contentInfo;
} }
public String getMimeType() { public String getMimeType() {
httpHeader headers = this.responseHeader(); IResourceInfo info = this.getCachedObjectInfo();
return (headers == null) ? null : headers.mime(); return (info == null) ? null : info.getMimeType();
} }
public Date getModificationDate() { public Date getModificationDate() {
Date docDate = null; IResourceInfo info = this.getCachedObjectInfo();
return (info == null) ? new Date() : info.getModificationDate();
httpHeader headers = this.responseHeader();
if (headers != null) {
docDate = headers.lastModified();
if (docDate == null) docDate = headers.date();
}
if (docDate == null) docDate = new Date();
return docDate;
} }
public URL referrerURL() { public URL referrerURL() {
@ -360,6 +351,8 @@ public class plasmaSwitchboardQueue {
* this method returns null if the answer is 'YES'! * this method returns null if the answer is 'YES'!
* if the answer is 'NO' (do not index), it returns a string with the reason * if the answer is 'NO' (do not index), it returns a string with the reason
* to reject the crawling demand in clear text * to reject the crawling demand in clear text
*
* This function is used by plasmaSwitchboard#processResourceStack
*/ */
public final String shallIndexCacheForProxy() { public final String shallIndexCacheForProxy() {
if (profile() == null) { if (profile() == null) {
@ -402,91 +395,8 @@ public class plasmaSwitchboardQueue {
return "Dynamic_(Requested_With_Cookie)"; return "Dynamic_(Requested_With_Cookie)";
} }
// -set-cookie in response if (getCachedObjectInfo() != null) {
// the set-cookie from the server does not indicate that the content is special return this.getCachedObjectInfo().shallIndexCacheForProxy();
// thus we do not care about it here for indexing
if (responseHeader() != null) {
// a picture cannot be indexed
if (plasmaHTCache.isPicture(responseHeader())) {
return "Media_Content_(Picture)";
}
if (!plasmaHTCache.isText(responseHeader())) {
return "Media_Content_(not_text)";
}
// -if-modified-since in request
// if the page is fresh at the very moment we can index it
if ((ifModifiedSince != null) && (responseHeader().containsKey(httpHeader.LAST_MODIFIED))) {
// parse date
Date d = responseHeader().lastModified();
if (d == null) {
d = new Date(serverDate.correctedUTCTime());
}
// finally, we shall treat the cache as stale if the modification time is after the if-.. time
if (d.after(ifModifiedSince)) {
//System.out.println("***not indexed because if-modified-since");
return "Stale_(Last-Modified>Modified-Since)";
}
}
// -pragma in cached response
if (responseHeader().containsKey(httpHeader.PRAGMA) &&
((String) responseHeader().get(httpHeader.PRAGMA)).toUpperCase().equals("NO-CACHE")) {
return "Denied_(pragma_no_cache)";
}
// see for documentation also:
// http://www.web-caching.com/cacheability.html
// look for freshnes information
// -expires in cached response
// the expires value gives us a very easy hint when the cache is stale
// sometimes, the expires date is set to the past to prevent that a page is cached
// we use that information to see if we should index it
final Date expires = responseHeader().expires();
if (expires != null && expires.before(new Date(serverDate.correctedUTCTime()))) {
return "Stale_(Expired)";
}
// -lastModified in cached response
// this information is too weak to use it to prevent indexing
// even if we can apply a TTL heuristic for cache usage
// -cache-control in cached response
// the cache-control has many value options.
String cacheControl = (String) responseHeader.get(httpHeader.CACHE_CONTROL);
if (cacheControl != null) {
cacheControl = cacheControl.trim().toUpperCase();
/* we have the following cases for cache-control:
"public" -- can be indexed
"private", "no-cache", "no-store" -- cannot be indexed
"max-age=<delta-seconds>" -- stale/fresh dependent on date
*/
if (cacheControl.startsWith("PRIVATE") ||
cacheControl.startsWith("NO-CACHE") ||
cacheControl.startsWith("NO-STORE")) {
// easy case
return "Stale_(denied_by_cache-control=" + cacheControl + ")";
// } else if (cacheControl.startsWith("PUBLIC")) {
// // ok, do nothing
} else if (cacheControl.startsWith("MAX-AGE=")) {
// we need also the load date
final Date date = responseHeader().date();
if (date == null) {
return "Stale_(no_date_given_in_response)";
}
try {
final long ttl = 1000 * Long.parseLong(cacheControl.substring(8)); // milliseconds to live
if (serverDate.correctedUTCTime() - date.getTime() > ttl) {
//System.out.println("***not indexed because cache-control");
return "Stale_(expired_by_cache-control)";
}
} catch (Exception e) {
return "Error_(" + e.getMessage() + ")";
}
}
}
} }
return null; return null;
} }
@ -496,6 +406,8 @@ public class plasmaSwitchboardQueue {
* this method returns null if the answer is 'YES'! * this method returns null if the answer is 'YES'!
* if the answer is 'NO' (do not index), it returns a string with the reason * if the answer is 'NO' (do not index), it returns a string with the reason
* to reject the crawling demand in clear text * to reject the crawling demand in clear text
*
* This function is used by plasmaSwitchboard#processResourceStack
*/ */
public final String shallIndexCacheForCrawler() { public final String shallIndexCacheForCrawler() {
if (profile() == null) { if (profile() == null) {
@ -520,9 +432,9 @@ public class plasmaSwitchboardQueue {
// we checked that in shallStoreCache // we checked that in shallStoreCache
// a picture cannot be indexed // a picture cannot be indexed
if (responseHeader() != null) { if (getCachedObjectInfo() != null) {
if (plasmaHTCache.isPicture(responseHeader())) { return "Media_Content_(Picture)"; } String status = this.getCachedObjectInfo().shallIndexCacheForProxy();
if (!plasmaHTCache.isText(responseHeader())) { return "Media_Content_(not_text)"; } if (status != null) return status;
} }
if (plasmaHTCache.noIndexingURL(nURL)) { return "Media_Content_(forbidden)"; } if (plasmaHTCache.noIndexingURL(nURL)) { return "Media_Content_(forbidden)"; }

Loading…
Cancel
Save