git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2495 6c8d7289-2bf4-0310-a012-ef5d649a1542

pull/1/head
theli 19 years ago
parent 4825bfaaf3
commit dae763d8e3

@ -59,6 +59,7 @@ import de.anomic.index.indexURL;
import de.anomic.plasma.plasmaHTCache;
import de.anomic.plasma.plasmaParserDocument;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.cache.IResourceInfo;
import de.anomic.server.serverCore;
import de.anomic.server.serverFileUtils;
import de.anomic.server.serverObjects;
@ -102,8 +103,8 @@ public class CacheAdmin_p {
info.ensureCapacity(40000);
try {
final httpHeader fileheader = switchboard.cacheManager.getCachedResponse(indexURL.urlHash(url));
info.append("<b>HTTP Header:</b><br>").append(formatHeader(fileheader)).append("<br>");
final IResourceInfo resInfo = switchboard.cacheManager.loadResourceInfo(url);
info.append("<b>HTTP Header:</b><br>").append(formatHeader(resInfo.getMap())).append("<br>");
final String ff = file.toString();
final int dotpos = ff.lastIndexOf('.');
final String ext = (dotpos >= 0) ? ff.substring(dotpos + 1).toLowerCase() : "";
@ -198,7 +199,7 @@ public class CacheAdmin_p {
return new String(s);
}
private static String formatHeader(httpHeader header) {
private static String formatHeader(Map header) {
final StringBuffer result = new StringBuffer(2048);
if (header == null) {
result.append("- no header in header cache -<br>");

@ -56,6 +56,8 @@ Invalid URL
Unable to download resource content.
:: <!-- 5 -->
Unable to parse resource content.
:: <!-- 6 -->
Unsupported protocol.
#(/error)#
</font>
</p>

@ -4,23 +4,23 @@
//(C) by Michael Peter Christen; mc@anomic.de
//first published on http://www.anomic.de
//Frankfurt, Germany, 2004
//
//last major change: 12.07.2004
//
//This program is free software; you can redistribute it and/or modify
//it under the terms of the GNU General Public License as published by
//the Free Software Foundation; either version 2 of the License, or
//(at your option) any later version.
//
//This program is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
//GNU General Public License for more details.
//
//You should have received a copy of the GNU General Public License
//along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
//Using this software in any meaning (reading, learning, copying, compiling,
//running) means that you agree that the Author(s) is (are) not responsible
//for cost, loss of data or any harm that may be caused directly or indirectly
@ -32,7 +32,7 @@
//(are) also not responsible for proper configuration and usage of the
//software, even if provoked by documentation provided together with
//the software.
//
//Any changes to this file according to the GPL as documented in the file
//gpl.txt aside this file in the shipment you received can be done to the
//lines that follows this copyright notice here, but changes must not be
@ -56,6 +56,7 @@ import de.anomic.http.httpc;
import de.anomic.plasma.plasmaHTCache;
import de.anomic.plasma.plasmaParserDocument;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.cache.IResourceInfo;
import de.anomic.plasma.plasmaCrawlLURL.Entry;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
@ -92,147 +93,167 @@ public class ViewFile {
e1.printStackTrace();
}
if (post != null) {
// getting the url hash from which the content should be loaded
String urlHash = post.get("urlHash","");
if (urlHash.equals("")) {
prop.put("error",1);
prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop;
}
String viewMode = post.get("viewMode","sentences");
if (post != null) {
// getting the url hash from which the content should be loaded
String urlHash = post.get("urlHash","");
if (urlHash.equals("")) {
prop.put("error",1);
prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop;
}
// getting the urlEntry that belongs to the url hash
Entry urlEntry = null;
try {
urlEntry = sb.urlPool.loadedURL.getEntry(urlHash, null);
} catch (IOException e) {
prop.put("error",2);
prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop;
}
String viewMode = post.get("viewMode","sentences");
// gettin the url that belongs to the entry
URL url = urlEntry.url();
if (url == null) {
prop.put("error",3);
prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop;
}
// getting the urlEntry that belongs to the url hash
Entry urlEntry = null;
try {
urlEntry = sb.urlPool.loadedURL.getEntry(urlHash, null);
} catch (IOException e) {
prop.put("error",2);
prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop;
}
// loading the resource content as byte array
byte[] resource = null;
httpHeader resHeader = null;
String resMime = null;
try {
resource = sb.cacheManager.loadResource(url);
if (resource == null) {
plasmaHTCache.Entry entry = sb.snippetCache.loadResourceFromWeb(url, 5000);
// gettin the url that belongs to the entry
URL url = urlEntry.url();
if (url == null) {
prop.put("error",3);
prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop;
}
if (entry != null) {
resHeader = entry.responseHeader();
}
// loading the resource content as byte array
byte[] resource = null;
IResourceInfo resInfo = null;
String resMime = null;
try {
// trying to load the resource body
resource = sb.cacheManager.loadResourceContent(url);
resource = sb.cacheManager.loadResource(url);
// if the resource body was not cached we try to load it from web
if (resource == null) {
prop.put("error",4);
prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop;
}
}
if (resHeader == null) {
resHeader = sb.cacheManager.getCachedResponse(urlEntry.hash());
if (resHeader == null) {
resHeader = httpc.whead(url,url.getHost(),5000,null,null,sb.remoteProxyConfig);
plasmaHTCache.Entry entry = sb.snippetCache.loadResourceFromWeb(url, 5000);
if (entry != null) {
resInfo = entry.getDocumentInfo();
resource = sb.cacheManager.loadResourceContent(url);
}
if (resource == null) {
prop.put("error",4);
prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop;
}
resMime = resHeader.mime();
}
}
} catch (IOException e) {
if (url == null) {
prop.put("error",4);
prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop;
}
}
if (viewMode.equals("plain")) {
String content = new String(resource);
content = content.replaceAll("<","&lt;")
.replaceAll(">","&gt;")
.replaceAll("\"","&quot;")
.replaceAll("\n","<br>")
.replaceAll("\t","&nbsp;&nbsp;&nbsp;&nbsp;");
prop.put("error",0);
prop.put("viewMode",VIEW_MODE_AS_PLAIN_TEXT);
prop.put("viewMode_plainText",content);
} else if (viewMode.equals("parsed") || viewMode.equals("sentences") || viewMode.equals("iframe")) {
// parsing the resource content
plasmaParserDocument document = sb.snippetCache.parseDocument(url, resource,resHeader);
if (document == null) {
prop.put("error",5);
prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop;
}
resMime = document.getMimeType();
if (viewMode.equals("parsed")) {
String content = new String(document.getText());
content = wikiCode.replaceHTML(content); //added by Marc Nause
content = content.replaceAll("\n","<br>")
.replaceAll("\t","&nbsp;&nbsp;&nbsp;&nbsp;");
prop.put("viewMode",VIEW_MODE_AS_PARSED_TEXT);
prop.put("viewMode_parsedText",content);
} else if (viewMode.equals("iframe")) {
prop.put("viewMode",VIEW_MODE_AS_IFRAME);
prop.put("viewMode_url",url.toString());
} else {
prop.put("viewMode",VIEW_MODE_AS_PARSED_SENTENCES);
String[] sentences = document.getSentences();
boolean dark = true;
for (int i=0; i < sentences.length; i++) {
String currentSentence = wikiCode.replaceHTML(sentences[i]);
// Search word highlighting
String words = post.get("words",null);
if (words != null) {
try {
words = URLDecoder.decode(words,"UTF-8");
} catch (UnsupportedEncodingException e) {}
String[] wordArray = words.substring(1,words.length()-1).split(",");
for (int j=0; j < wordArray.length; j++) {
String currentWord = wordArray[j].trim();
currentSentence = currentSentence.replaceAll(currentWord,
"<b style=\"color: black; background-color: rgb(" + highlightingColors[j%6] + ");\">" + currentWord + "</b>");
// try to load resource metadata
if (resInfo == null) {
// try to load the metadata from cache
try {
resInfo = sb.cacheManager.loadResourceInfo(urlEntry.url());
} catch (Exception e) { /* ignore this */}
// if the metadata where not cached try to load it from web
if (resInfo == null) {
String protocol = url.getProtocol();
if (!((protocol.equals("http") || protocol.equals("https")))) {
prop.put("error",6);
prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop;
}
httpHeader responseHeader = httpc.whead(url,url.getHost(),5000,null,null,sb.remoteProxyConfig);
if (responseHeader == null) {
prop.put("error",4);
prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop;
}
resMime = responseHeader.mime();
}
} else {
resMime = resInfo.getMimeType();
}
} catch (IOException e) {
if (url == null) {
prop.put("error",4);
prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop;
}
}
if (viewMode.equals("plain")) {
String content = new String(resource);
content = content.replaceAll("<","&lt;")
.replaceAll(">","&gt;")
.replaceAll("\"","&quot;")
.replaceAll("\n","<br>")
.replaceAll("\t","&nbsp;&nbsp;&nbsp;&nbsp;");
prop.put("viewMode_sentences_" + i + "_nr",Integer.toString(i+1));
prop.put("viewMode_sentences_" + i + "_text",currentSentence);
prop.put("viewMode_sentences_" + i + "_dark",((dark) ? 1 : 0) ); dark=!dark;
prop.put("error",0);
prop.put("viewMode",VIEW_MODE_AS_PLAIN_TEXT);
prop.put("viewMode_plainText",content);
} else if (viewMode.equals("parsed") || viewMode.equals("sentences") || viewMode.equals("iframe")) {
// parsing the resource content
plasmaParserDocument document = sb.snippetCache.parseDocument(url, resource,resInfo);
if (document == null) {
prop.put("error",5);
prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop;
}
prop.put("viewMode_sentences",sentences.length);
resMime = document.getMimeType();
if (viewMode.equals("parsed")) {
String content = new String(document.getText());
content = wikiCode.replaceHTML(content); //added by Marc Nause
content = content.replaceAll("\n","<br>")
.replaceAll("\t","&nbsp;&nbsp;&nbsp;&nbsp;");
prop.put("viewMode",VIEW_MODE_AS_PARSED_TEXT);
prop.put("viewMode_parsedText",content);
} else if (viewMode.equals("iframe")) {
prop.put("viewMode",VIEW_MODE_AS_IFRAME);
prop.put("viewMode_url",url.toString());
} else {
prop.put("viewMode",VIEW_MODE_AS_PARSED_SENTENCES);
String[] sentences = document.getSentences();
boolean dark = true;
for (int i=0; i < sentences.length; i++) {
String currentSentence = wikiCode.replaceHTML(sentences[i]);
// Search word highlighting
String words = post.get("words",null);
if (words != null) {
try {
words = URLDecoder.decode(words,"UTF-8");
} catch (UnsupportedEncodingException e) {}
String[] wordArray = words.substring(1,words.length()-1).split(",");
for (int j=0; j < wordArray.length; j++) {
String currentWord = wordArray[j].trim();
currentSentence = currentSentence.replaceAll(currentWord,
"<b style=\"color: black; background-color: rgb(" + highlightingColors[j%6] + ");\">" + currentWord + "</b>");
}
}
prop.put("viewMode_sentences_" + i + "_nr",Integer.toString(i+1));
prop.put("viewMode_sentences_" + i + "_text",currentSentence);
prop.put("viewMode_sentences_" + i + "_dark",((dark) ? 1 : 0) ); dark=!dark;
}
prop.put("viewMode_sentences",sentences.length);
}
}
prop.put("error",0);
prop.put("error_url",url.toString());
prop.put("error_hash",urlHash);
prop.put("error_wordCount",Integer.toString(urlEntry.wordCount()));
prop.put("error_desc",urlEntry.descr());
prop.put("error_size",urlEntry.size());
prop.put("error_mimeType",resMime);
}
prop.put("error",0);
prop.put("error_url",url.toString());
prop.put("error_hash",urlHash);
prop.put("error_wordCount",Integer.toString(urlEntry.wordCount()));
prop.put("error_desc",urlEntry.descr());
prop.put("error_size",urlEntry.size());
prop.put("error_mimeType",resMime);
}
return prop;
return prop;
}
}

@ -96,6 +96,8 @@ import de.anomic.index.indexURL;
import de.anomic.plasma.plasmaHTCache;
import de.anomic.plasma.plasmaParser;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.cache.IResourceInfo;
import de.anomic.plasma.cache.http.ResourceInfo;
import de.anomic.plasma.urlPattern.plasmaURLPattern;
import de.anomic.server.serverCore;
import de.anomic.server.serverFileUtils;
@ -413,8 +415,8 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt
// decide whether to use a cache entry or connect to the network
File cacheFile = cacheManager.getCachePath(url);
String urlHash = indexURL.urlHash(url);
httpHeader cachedResponseHeader = cacheManager.getCachedResponse(urlHash);
ResourceInfo cachedResInfo = (ResourceInfo) cacheManager.loadResourceInfo(url);
httpHeader cachedResponseHeader = (cachedResInfo == null)?null:cachedResInfo.getResponseHeader();
boolean cacheExists = ((cacheFile.isFile()) && (cachedResponseHeader != null));
// why are files unzipped upon arrival? why not zip all files in cache?
@ -445,9 +447,10 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt
0, // crawling depth
url, // url
"", // name of the url is unknown
requestHeader, // request headers
//requestHeader, // request headers
"200 OK", // request status
cachedResponseHeader, // response headers
//cachedResponseHeader, // response headers
cachedResInfo,
null, // initiator
switchboard.defaultProxyProfile // profile
);
@ -580,14 +583,16 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt
// reserve cache entry
Date requestDate = new Date(((Long)conProp.get(httpHeader.CONNECTION_PROP_REQUEST_START)).longValue());
IResourceInfo resInfo = new ResourceInfo(url,requestHeader,res.responseHeader);
plasmaHTCache.Entry cacheEntry = cacheManager.newEntry(
requestDate,
0,
url,
"",
requestHeader,
//requestHeader,
res.status,
res.responseHeader,
//res.responseHeader,
resInfo,
null,
switchboard.defaultProxyProfile
);

@ -64,6 +64,8 @@ import de.anomic.http.httpc;
import de.anomic.plasma.plasmaHTCache;
import de.anomic.plasma.plasmaParser;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.cache.IResourceInfo;
import de.anomic.plasma.cache.http.ResourceInfo;
import de.anomic.server.serverCore;
import de.anomic.server.serverFileUtils;
import de.anomic.server.serverHandler;
@ -385,14 +387,14 @@ public class icapd implements serverHandler {
* ========================================================================= */
// generating a htcache entry object
IResourceInfo resInfo = new ResourceInfo(httpRequestURL,httpReqHeader,httpResHeader);
plasmaHTCache.Entry cacheEntry = cacheManager.newEntry(
new Date(),
0,
httpRequestURL,
"",
httpReqHeader,
httpRespStatusLine,
httpResHeader,
resInfo,
null,
switchboard.defaultProxyProfile
);

@ -0,0 +1,136 @@
// IResourceInfo.java
// -------------------------------------
// part of YACY
// (C) by Michael Peter Christen; mc@anomic.de
// first published on http://www.anomic.de
// Frankfurt, Germany, 2006
//
// This file ist contributed by Martin Thelian
//
// $LastChangedDate: 2006-02-20 23:57:42 +0100 (Mo, 20 Feb 2006) $
// $LastChangedRevision: 1715 $
// $LastChangedBy: theli $
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this softare or this documentation. The usage of this software
// is on your own risk. The installation and usage (starting/running) of this
// software may allow other people or application to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
//
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follows this copyright notice here, but changes must not be
// done inside the copyright notive above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.
package de.anomic.plasma.cache;
import java.util.Date;
import java.util.Map;
import de.anomic.net.URL;
/**
 * Describes the metadata that is kept for a cached resource, together with
 * the decisions (index / store / use cache) that are derived from it.
 */
public interface IResourceInfo {

    /**
     * Provides the resource information in map form.
     *
     * @return the resource information as map
     */
    Map getMap();

    /**
     * Provides the URL of the content described by this object.
     *
     * @return the URL of this content
     */
    URL getUrl();

    /**
     * Provides the referer URL that belongs to the URL of this content.
     *
     * @return referer URL
     */
    URL getRefererUrl();

    /**
     * Provides the mimetype of the cached object.
     *
     * @return mimetype
     */
    String getMimeType();

    /**
     * Provides the modification date of the cached object.
     *
     * @return the modification date
     */
    Date getModificationDate();

    /**
     * Provides the url hash of the content URL.
     *
     * @return the url hash
     */
    String getUrlHash();

    /**
     * Provides the "if modified since" date the resource was requested with.
     *
     * @return the if-modified-since date of the request
     *         (NOTE(review): presumably null if the request carried none - confirm
     *         against the implementations)
     */
    Date ifModifiedSince();

    /**
     * Tells whether the resource was requested with client specific
     * information (e.g. cookies for http).
     *
     * @return true if the request carried client specific information
     */
    boolean requestWithCookie();

    /**
     * Tells whether the request prohibits indexing.
     *
     * @return true if the request prohibits indexing
     */
    boolean requestProhibitsIndexing();

    /**
     * Decides if a resource that was downloaded by the crawler
     * is allowed to be indexed.
     *
     * @return an error string describing the reason why the
     *         resource should not be indexed, or null if indexing is allowed
     */
    String shallIndexCacheForCrawler();

    /**
     * Decides if a resource that was downloaded by the proxy
     * is allowed to be indexed.
     *
     * @return an error string describing the reason why the
     *         resource should not be indexed, or null if indexing is allowed
     */
    String shallIndexCacheForProxy();

    /**
     * Decides if a resource that was downloaded by the proxy should be stored
     * in the cache.
     *
     * @return NOTE(review): undocumented in the original; the http implementation
     *         returns a short reason string if the resource should not be stored
     *         and null otherwise - confirm that this is the intended contract
     */
    String shallStoreCacheForProxy();

    /**
     * Decides if the proxy may answer a request from the cache.
     *
     * @return NOTE(review): undocumented in the original; presumably true if the
     *         cached copy may be used - confirm
     */
    boolean shallUseCacheForProxy();

    /**
     * Checks a response status.
     *
     * @param responseStatus the response status to check
     * @return NOTE(review): undocumented in the original; presumably true if the
     *         status is acceptable for this kind of resource - confirm
     */
    boolean validResponseStatus(String responseStatus);
}

@ -0,0 +1,86 @@
// RespourceInfoFactory.java
// -------------------------------------
// part of YACY
// (C) by Michael Peter Christen; mc@anomic.de
// first published on http://www.anomic.de
// Frankfurt, Germany, 2006
//
// This file ist contributed by Martin Thelian
//
// $LastChangedDate: 2006-02-20 23:57:42 +0100 (Mo, 20 Feb 2006) $
// $LastChangedRevision: 1715 $
// $LastChangedBy: theli $
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this softare or this documentation. The usage of this software
// is on your own risk. The installation and usage (starting/running) of this
// software may allow other people or application to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
//
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follows this copyright notice here, but changes must not be
// done inside the copyright notive above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.
package de.anomic.plasma.cache;
import java.lang.reflect.Constructor;
import java.util.Map;
import de.anomic.net.URL;
/**
 * Factory that creates the protocol specific {@link IResourceInfo}
 * implementation for a resource via reflection.
 */
public class ResourceInfoFactory {

    /**
     * Builds the resource info object for the given resource.
     * The implementation class is looked up by name as
     * <code>&lt;package of this factory&gt;.&lt;protocol of the URL&gt;.ResourceInfo</code>
     * and has to offer a constructor taking a {@link URL} and a {@link Map}.
     *
     * @param resourceURL the URL of the resource; its protocol selects the implementation
     * @param resourceMetadata the metadata handed over to the implementation
     * @return the newly created resource info object
     * @throws Exception if the class or its constructor can not be found, or if
     *         the instantiation fails
     */
    public IResourceInfo buildResourceInfoObj(
            URL resourceURL,
            Map resourceMetadata
    ) throws Exception {
        // the protocol of the URL names the sub-package of the implementation
        final String protocolName = resourceURL.getProtocol();
        final String implementationName =
                this.getClass().getPackage().getName() + "." + protocolName + ".ResourceInfo";

        // signature of the constructor we are looking for, and its arguments
        final Class[] constructorSignature = new Class[] { URL.class, Map.class };
        final Object[] constructorArguments = new Object[] { resourceURL, resourceMetadata };

        // load the class by name, pick the constructor and create the object
        final Constructor implementationConstructor =
                Class.forName(implementationName).getConstructor(constructorSignature);
        return (IResourceInfo) implementationConstructor.newInstance(constructorArguments);
    }
}

@ -0,0 +1,467 @@
// ResourceInfo.java
// -------------------------------------
// part of YACY
// (C) by Michael Peter Christen; mc@anomic.de
// first published on http://www.anomic.de
// Frankfurt, Germany, 2006
//
// This file ist contributed by Martin Thelian
//
// $LastChangedDate: 2006-02-20 23:57:42 +0100 (Mo, 20 Feb 2006) $
// $LastChangedRevision: 1715 $
// $LastChangedBy: theli $
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this softare or this documentation. The usage of this software
// is on your own risk. The installation and usage (starting/running) of this
// software may allow other people or application to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
//
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follows this copyright notice here, but changes must not be
// done inside the copyright notive above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.
package de.anomic.plasma.cache.http;
import java.util.Date;
import java.util.Map;
import de.anomic.http.httpHeader;
import de.anomic.index.indexURL;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaHTCache;
import de.anomic.plasma.cache.ResourceInfoFactory;
import de.anomic.plasma.cache.IResourceInfo;
import de.anomic.server.serverDate;
public class ResourceInfo implements IResourceInfo {
// URL of the resource that is described by this object; assigned by both constructors
private URL url;
// hash of the URL, computed in the constructors from the normal form of the URL
private String urlHash;
// header of the response; both constructors leave this non-null
private httpHeader responseHeader;
// header of the request; only assigned by the (URL, httpHeader, httpHeader)
// constructor, which does not check it for null, so it may be null
private httpHeader requestHeader;
/**
 * Creates the info object from a metadata map. This is the constructor
 * signature that the {@link ResourceInfoFactory} looks up via reflection.
 * No request header is available for objects created this way.
 *
 * @param objectURL the URL of the resource, must not be null
 * @param objectInfo the metadata of the resource, must not be null; it is
 *        wrapped into a new response header object
 * @throws NullPointerException if one of the arguments is null
 */
public ResourceInfo(URL objectURL, Map objectInfo) {
    if (objectURL == null) {
        throw new NullPointerException();
    }
    if (objectInfo == null) {
        throw new NullPointerException();
    }

    // remember the url and derive its hash from the normal form
    url = objectURL;
    urlHash = indexURL.urlHash(objectURL.toNormalform());

    // the metadata map becomes the response header
    responseHeader = new httpHeader(null, objectInfo);
}
/**
 * Creates the info object from the headers of a request/response pair.
 *
 * @param objectURL the URL of the resource, must not be null
 * @param requestHeaders the header of the request; stored as is and not checked for null
 * @param responseHeaders the header of the response, must not be null
 * @throws NullPointerException if the URL or the response header is null
 */
public ResourceInfo(URL objectURL, httpHeader requestHeaders, httpHeader responseHeaders) {
    if (objectURL == null) {
        throw new NullPointerException();
    }
    if (responseHeaders == null) {
        throw new NullPointerException();
    }

    // remember the url and derive its hash from the normal form
    url = objectURL;
    urlHash = indexURL.urlHash(objectURL.toNormalform());

    // keep both headers
    requestHeader = requestHeaders;
    responseHeader = responseHeaders;
}
/**
 * Hands out the response header as the metadata map of this resource.
 * The header object itself is returned, not a copy.
 *
 * @see de.anomic.plasma.cache.IResourceInfo#getMap()
 */
public Map getMap() {
    final Map headerAsMap = responseHeader;
    return headerAsMap;
}
/**
 * Returns the mimetype of the cached object as reported by the response
 * header: trimmed, in lower case and without any parameters
 * (e.g. <code>"Text/HTML ; charset=UTF-8"</code> becomes <code>"text/html"</code>).
 *
 * @return the normalized mimetype, or null if no response header is available
 * @see de.anomic.plasma.cache.IResourceInfo#getMimeType()
 */
public String getMimeType() {
    if (this.responseHeader == null) return null;

    // Lower-case with a fixed locale: with the platform default locale a
    // Turkish environment would turn 'I' into a dotless i and thereby break
    // the comparison of types like "APPLICATION/..." or "IMAGE/...".
    final String mimeType = this.responseHeader.mime().trim().toLowerCase(java.util.Locale.ENGLISH);

    // Cut off parameters such as "; charset=..." and also drop the whitespace
    // that may precede the separator, so that no trailing blank is returned.
    final int pos = mimeType.indexOf(';');
    return (pos < 0) ? mimeType : mimeType.substring(0, pos).trim();
}
/**
 * Determines the modification date of the cached object. The first value
 * that is available is used: the last-modified date of the response header,
 * then the date of the response header, then the current corrected UTC time.
 *
 * @return the modification date, never null
 * @see de.anomic.plasma.cache.IResourceInfo#getModificationDate()
 */
public Date getModificationDate() {
    if (this.responseHeader != null) {
        // preferred: the last-modified date given by the server
        final Date lastModified = this.responseHeader.lastModified();
        if (lastModified != null) {
            return lastModified;
        }

        // second choice: the date of the response
        final Date responseDate = this.responseHeader.date();
        if (responseDate != null) {
            return responseDate;
        }
    }

    // nothing usable in the header: fall back to the current time
    return new Date(serverDate.correctedUTCTime());
}
/**
 * Reads the referer from the request header and converts it into a URL.
 *
 * @return the referer URL, or null if there is no request header or if the
 *         referer value can not be turned into a URL
 * @see de.anomic.plasma.cache.IResourceInfo#getRefererUrl()
 */
public URL getRefererUrl() {
    URL referer = null;
    if (this.requestHeader != null) {
        try {
            // a missing referer yields the empty string here
            final String refererValue = (String) this.requestHeader.get(httpHeader.REFERER, "");
            referer = new URL(refererValue);
        } catch (Exception e) {
            // best effort only: an unusable referer is treated like a missing one
            referer = null;
        }
    }
    return referer;
}
/**
 * Hands out the URL that was passed to the constructor.
 *
 * @return the URL of the resource
 * @see de.anomic.plasma.cache.IResourceInfo#getUrl()
 */
public URL getUrl() {
    final URL resourceUrl = url;
    return resourceUrl;
}
/**
 * Hands out the url hash that was computed in the constructor.
 *
 * @return the hash of the resource URL
 * @see de.anomic.plasma.cache.IResourceInfo#getUrlHash()
 */
public String getUrlHash() {
    final String hashOfUrl = urlHash;
    return hashOfUrl;
}
/**
 * Decides by the mimetype only whether a resource loaded by the crawler
 * may be indexed: pictures and everything that is not text are rejected.
 *
 * @return the reason why the resource must not be indexed, or null if
 *         indexing is allowed
 * @see de.anomic.plasma.cache.IResourceInfo#shallIndexCacheForCrawler()
 */
public String shallIndexCacheForCrawler() {
    final String contentType = getMimeType();

    String rejectReason = null;
    if (plasmaHTCache.isPicture(contentType)) {
        rejectReason = "Media_Content_(Picture)";
    } else if (!plasmaHTCache.isText(contentType)) {
        rejectReason = "Media_Content_(not_text)";
    }
    return rejectReason;
}
/**
 * Decides whether a resource that was loaded through the proxy may be indexed.
 * The checks are done in this order: mimetype (pictures and non-text are
 * rejected), if-modified-since of the request against last-modified of the
 * response, pragma, expires and finally cache-control of the response.
 *
 * @return the reason why the resource must not be indexed, or null if
 *         indexing is allowed
 * @see de.anomic.plasma.cache.IResourceInfo#shallIndexCacheForProxy()
 */
public String shallIndexCacheForProxy() {
    // -set-cookie in response
    // the set-cookie from the server does not indicate that the content is special
    // thus we do not care about it here for indexing

    // a picture cannot be indexed
    String mimeType = this.getMimeType();
    if (plasmaHTCache.isPicture(mimeType)) {
        return "Media_Content_(Picture)";
    }
    if (!plasmaHTCache.isText(mimeType)) {
        return "Media_Content_(not_text)";
    }

    // -if-modified-since in request
    // if the page is fresh at the very moment we can index it
    // The date has to come from the request. Using getModificationDate() here
    // compared the last-modified date of the response with itself, so this
    // check could never detect a stale page.
    // NOTE(review): ifModifiedSince() is implemented outside of this block;
    // it is only called if a request header exists - confirm that it returns
    // null when the request carried no such header.
    Date requestedSince = (this.requestHeader == null) ? null : this.ifModifiedSince();
    if ((requestedSince != null) && (this.responseHeader.containsKey(httpHeader.LAST_MODIFIED))) {
        // parse date
        Date d = this.responseHeader.lastModified();
        if (d == null) {
            d = new Date(serverDate.correctedUTCTime());
        }
        // finally, we shall treat the cache as stale if the modification time is after the if-.. time
        if (d.after(requestedSince)) {
            return "Stale_(Last-Modified>Modified-Since)";
        }
    }

    // -pragma in cached response
    if (this.responseHeader.containsKey(httpHeader.PRAGMA) &&
        ((String) this.responseHeader.get(httpHeader.PRAGMA)).toUpperCase().equals("NO-CACHE")) {
        return "Denied_(pragma_no_cache)";
    }

    // see for documentation also:
    // http://www.web-caching.com/cacheability.html

    // look for freshness information

    // -expires in cached response
    // the expires value gives us a very easy hint when the cache is stale
    // sometimes, the expires date is set to the past to prevent that a page is cached
    // we use that information to see if we should index it
    final Date expires = this.responseHeader.expires();
    if (expires != null && expires.before(new Date(serverDate.correctedUTCTime()))) {
        return "Stale_(Expired)";
    }

    // -lastModified in cached response
    // this information is too weak to use it to prevent indexing
    // even if we can apply a TTL heuristic for cache usage

    // -cache-control in cached response
    // the cache-control has many value options.
    String cacheControl = (String) this.responseHeader.get(httpHeader.CACHE_CONTROL);
    if (cacheControl != null) {
        cacheControl = cacheControl.trim().toUpperCase();
        /* we have the following cases for cache-control:
           "public" -- can be indexed (nothing to do)
           "private", "no-cache", "no-store" -- cannot be indexed
           "max-age=<delta-seconds>" -- stale/fresh dependent on date
         */
        if (cacheControl.startsWith("PRIVATE") ||
            cacheControl.startsWith("NO-CACHE") ||
            cacheControl.startsWith("NO-STORE")) {
            // easy case
            return "Stale_(denied_by_cache-control=" + cacheControl + ")";
        } else if (cacheControl.startsWith("MAX-AGE=")) {
            // we need also the load date
            final Date date = this.responseHeader.date();
            if (date == null) {
                return "Stale_(no_date_given_in_response)";
            }
            try {
                final long ttl = 1000 * Long.parseLong(cacheControl.substring(8)); // milliseconds to live
                if (serverDate.correctedUTCTime() - date.getTime() > ttl) {
                    return "Stale_(expired_by_cache-control)";
                }
            } catch (Exception e) {
                // e.g. the max-age value is not a plain number
                return "Error_(" + e.getMessage() + ")";
            }
        }
    }
    return null;
}
/**
 * Decides whether a resource that was loaded through the proxy should be
 * stored in the cache.
 *
 * Deliberately NOT considered are: if-modified-since and cookies of the
 * request, set-cookie and expires of the response.
 *
 * @return a short reason string if the resource should not be stored, or
 *         null if nothing speaks against storing it
 */
public String shallStoreCacheForProxy() {
    if (this.requestHeader != null) {
        // authorization makes pages very individual, so such content
        // can not be taken from the cache later on
        if (this.requestHeader.containsKey(httpHeader.AUTHORIZATION)) {
            return "personalized";
        }
        // a range in the request means partial content, which is not cached
        if (this.requestHeader.containsKey(httpHeader.RANGE)) {
            return "partial";
        }
    }

    // without a response header there is nothing more to check
    if (this.responseHeader == null) {
        return null;
    }

    // a range in the response means partial content, which is not cached
    if (this.responseHeader.containsKey(httpHeader.CONTENT_RANGE)) {
        return "partial";
    }

    // if the server sends a pragma no-cache we respect that
    final String pragma = (String) this.responseHeader.get(httpHeader.PRAGMA);
    if (pragma != null && pragma.trim().toUpperCase().equals("NO-CACHE")) {
        return "controlled_no_cache";
    }

    // of the many cache-control options only max-age is evaluated here
    final String rawCacheControl = (String) this.responseHeader.get(httpHeader.CACHE_CONTROL);
    if (rawCacheControl == null) {
        return null;
    }
    final String directive = rawCacheControl.trim().toUpperCase();
    if (!directive.startsWith("MAX-AGE=")) {
        return null;
    }

    // the age can only be computed relative to the load date
    final Date loadDate = this.responseHeader.date();
    if (loadDate == null) {
        return "stale_no_date_given_in_response";
    }
    try {
        final long millisToLive = 1000 * Long.parseLong(directive.substring(8));
        final long age = serverDate.correctedUTCTime() - loadDate.getTime();
        if (age > millisToLive) {
            return "stale_expired";
        }
    } catch (Exception e) {
        // e.g. the max-age value is not a plain number; the unbalanced ')'
        // is part of the reason string as it was always emitted
        return "stale_error_" + e.getMessage() + ")";
    }
    return null;
}
/**
 * Decides, based on the request header and the cached response header,
 * whether the cached copy of the resource may be delivered instead of
 * reloading it.
 * <p>
 * The request header may be <code>null</code>; in that case all
 * request-specific checks are skipped. If no response header is available
 * there is no freshness information, and the cache entry is treated as stale.
 *
 * @return <code>true</code> if the cached copy may be used,
 *         <code>false</code> if the resource has to be reloaded
 */
public boolean shallUseCacheForProxy() {
    // every freshness check below reads the cached response header;
    // without it the entry cannot be proven fresh
    if (this.responseHeader == null) { return false; }

    String cacheControl;
    if (this.requestHeader != null) {
        // -authorization cases in request
        if (this.requestHeader.containsKey(httpHeader.AUTHORIZATION)) { return false; }

        // -ranges in request
        // we do not cache partial content
        if (this.requestHeader.containsKey(httpHeader.RANGE)) { return false; }

        // if the client requests an un-cached copy of the resource ...
        cacheControl = (String) this.requestHeader.get(httpHeader.PRAGMA);
        if (cacheControl != null
                && cacheControl.trim().toUpperCase(java.util.Locale.ENGLISH).equals("NO-CACHE")) {
            return false;
        }

        cacheControl = (String) this.requestHeader.get(httpHeader.CACHE_CONTROL);
        if (cacheControl != null) {
            cacheControl = cacheControl.trim().toUpperCase(java.util.Locale.ENGLISH);
            if (cacheControl.startsWith("NO-CACHE") || cacheControl.startsWith("MAX-AGE=0")) { return false; }
        }

        // -if-modified-since in request
        // The entity has to be transferred only if it has
        // been modified since the date given by the If-Modified-Since header.
        if (this.requestHeader.containsKey(httpHeader.IF_MODIFIED_SINCE)) {
            // checking this makes only sense if the cached response contains
            // a Last-Modified field. If the field does not exist, we go the safe way
            if (!this.responseHeader.containsKey(httpHeader.LAST_MODIFIED)) { return false; }
            // parse dates; an unparseable date is replaced by the current time
            Date d1, d2;
            d2 = this.responseHeader.lastModified(); if (d2 == null) { d2 = new Date(serverDate.correctedUTCTime()); }
            d1 = this.requestHeader.ifModifiedSince(); if (d1 == null) { d1 = new Date(serverDate.correctedUTCTime()); }
            // finally, we shall treat the cache as stale if the modification time is after the if-.. time
            if (d2.after(d1)) { return false; }
        }
    }

    String mimeType = this.getMimeType();
    if (!plasmaHTCache.isPicture(mimeType)) {
        // -cookies in request
        // unfortunately, we should reload in case of a cookie
        // but we think that pictures can still be considered as fresh
        // -set-cookie in cached response
        // this is a similar case as for COOKIE.
        final boolean requestHasCookie =
                (this.requestHeader != null) && this.requestHeader.containsKey(httpHeader.COOKIE);
        if (requestHasCookie ||
            this.responseHeader.containsKey(httpHeader.SET_COOKIE) ||
            this.responseHeader.containsKey(httpHeader.SET_COOKIE2)) {
            return false; // too strong
        }
    }

    // -pragma in cached response
    // logically, we would not need to care about no-cache pragmas in cached response headers,
    // because they cannot exist since they are not written to the cache.
    // So this IF should always fail..
    cacheControl = (String) this.responseHeader.get(httpHeader.PRAGMA);
    if (cacheControl != null
            && cacheControl.trim().toUpperCase(java.util.Locale.ENGLISH).equals("NO-CACHE")) {
        return false;
    }

    // see for documentation also:
    // http://www.web-caching.com/cacheability.html
    // http://vancouver-webpages.com/CacheNow/

    // look for freshness information
    // if we don't have any freshness indication, we treat the file as stale.

    // -expires in cached response
    // the expires value gives us a very easy hint when the cache is stale
    Date expires = this.responseHeader.expires();
    if (expires != null) {
        if (expires.before(new Date(serverDate.correctedUTCTime()))) { return false; }
    }

    Date lastModified = this.responseHeader.lastModified();
    cacheControl = (String) this.responseHeader.get(httpHeader.CACHE_CONTROL);
    if (cacheControl == null && lastModified == null && expires == null) { return false; }

    // -lastModified in cached response
    // we can apply a TTL (Time To Live) heuristic here. We call the time delta between the last read
    // of the file and the last modified date the age of the file. If we consider the file as
    // middle-aged then, the maximum TTL would be cache-creation plus age.
    // This would be a TTL factor of 100%; we want no more than 10% TTL, so that a 10 month old cache
    // file may only be treated as fresh for one more month, not more.
    Date date = this.responseHeader.date();
    if (lastModified != null) {
        if (date == null) { date = new Date(serverDate.correctedUTCTime()); }
        long age = date.getTime() - lastModified.getTime();
        if (age < 0) { return false; }
        // TTL is age/10; the actual living time is (now - date),
        // therefore the cache is stale if (now - date) > age/10
        if (serverDate.correctedUTCTime() - date.getTime() > age / 10) { return false; }
    }

    // -cache-control in cached response
    // the cache-control has many value options.
    if (cacheControl != null) {
        // a fixed locale keeps "private" matching "PRIVATE" regardless of the platform default locale
        cacheControl = cacheControl.trim().toUpperCase(java.util.Locale.ENGLISH);
        if (cacheControl.startsWith("PRIVATE") ||
            cacheControl.startsWith("NO-CACHE") ||
            cacheControl.startsWith("NO-STORE")) {
            // easy case
            return false;
        } else if (cacheControl.startsWith("MAX-AGE=")) {
            // (a leading PUBLIC needs no handling: it falls through to "fresh")
            // we also need the load date
            if (date == null) { return false; }
            try {
                final long ttl = 1000 * Long.parseLong(cacheControl.substring(8)); // milliseconds to live
                if (serverDate.correctedUTCTime() - date.getTime() > ttl) {
                    return false;
                }
            } catch (Exception e) {
                // a max-age value that cannot be evaluated is treated as stale
                return false;
            }
        }
    }
    return true;
}
/**
 * Checks whether a response status line denotes a cacheable result.
 *
 * @param responseStatus the response status string; must not be <code>null</code>
 * @return <code>true</code> if the status starts with "200" or "203"
 */
public boolean validResponseStatus(String responseStatus) {
    if (responseStatus.startsWith("200")) {
        return true;
    }
    return responseStatus.startsWith("203");
}
/**
 * Returns the If-Modified-Since date of the request.
 *
 * @return the value delivered by the request header, or <code>null</code>
 *         if no request header is available
 */
public Date ifModifiedSince() {
    if (this.requestHeader == null) {
        return null;
    }
    return this.requestHeader.ifModifiedSince();
}
/**
 * Tells whether the request carried a cookie header.
 *
 * @return <code>true</code> if a request header is available and contains
 *         the cookie field, <code>false</code> otherwise
 */
public boolean requestWithCookie() {
    if (this.requestHeader == null) {
        return false;
    }
    return this.requestHeader.containsKey(httpHeader.COOKIE);
}
/**
 * Tells whether the request explicitly forbids indexing of the resource.
 *
 * @return <code>true</code> if a request header is available and its
 *         index-control field has the value "NO-INDEX" (compared
 *         case-insensitively), <code>false</code> otherwise
 */
public boolean requestProhibitsIndexing() {
    if (this.requestHeader == null) {
        return false;
    }
    final String indexControl = (String) this.requestHeader.get(httpHeader.X_YACY_INDEX_CONTROL);
    // equalsIgnoreCase does not depend on the default locale (unlike toUpperCase())
    // and yields false for a missing (null) value
    return "NO-INDEX".equalsIgnoreCase(indexControl);
}
/**
 * @return the request header held by this object; may be <code>null</code>
 */
public httpHeader getRequestHeader() {
    return requestHeader;
}
/**
 * @return the response header held by this object; may be <code>null</code>
 */
public httpHeader getResponseHeader() {
    return responseHeader;
}
}

@ -64,6 +64,8 @@ import de.anomic.plasma.plasmaCrawlLoader;
import de.anomic.plasma.plasmaHTCache;
import de.anomic.plasma.plasmaParser;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.cache.IResourceInfo;
import de.anomic.plasma.cache.http.ResourceInfo;
import de.anomic.plasma.crawler.AbstractCrawlWorker;
import de.anomic.plasma.crawler.plasmaCrawlerPool;
import de.anomic.plasma.urlPattern.plasmaURLPattern;
@ -129,15 +131,15 @@ public final class CrawlWorker extends AbstractCrawlWorker {
return load(DEFAULT_CRAWLING_RETRY_COUNT);
}
protected plasmaHTCache.Entry createCacheEntry(Date requestDate, httpHeader requestHeader, httpc.response response) {
protected plasmaHTCache.Entry createCacheEntry(URL requestUrl, Date requestDate, httpHeader requestHeader, httpc.response response) {
IResourceInfo resourceInfo = new ResourceInfo(requestUrl,requestHeader,response.responseHeader);
return this.cacheManager.newEntry(
requestDate,
this.depth,
this.url,
this.name,
requestHeader,
response.status,
response.responseHeader,
resourceInfo,
this.initiator,
this.profile
);
@ -197,7 +199,7 @@ public final class CrawlWorker extends AbstractCrawlWorker {
// the transfer is ok
// create a new cache entry
htCache = createCacheEntry(requestDate, requestHeader, res);
htCache = createCacheEntry(this.url,requestDate, requestHeader, res);
// aborting download if content is to long ...
if (htCache.cacheFile().getAbsolutePath().length() > serverSystem.maxPathLength) {

@ -295,7 +295,7 @@ public final class plasmaCrawlStacker {
}
// check if ip is local ip address
checkInterruption();
checkInterruption(); // TODO: this is protocol specific
InetAddress hostAddress = httpc.dnsResolve(nexturl.getHost());
if (hostAddress == null) {
// if a http proxy is configured name resolution may not work

@ -54,14 +54,12 @@
package de.anomic.plasma;
import de.anomic.http.httpc;
import de.anomic.http.httpHeader;
import de.anomic.index.indexEntryAttribute;
import de.anomic.index.indexURL;
import de.anomic.kelondro.kelondroDyn;
import de.anomic.kelondro.kelondroMap;
import de.anomic.kelondro.kelondroMScoreCluster;
import de.anomic.server.logging.serverLog;
import de.anomic.server.serverDate;
import de.anomic.server.serverFileUtils;
import de.anomic.server.serverInstantThread;
import de.anomic.server.serverSystem;
@ -73,6 +71,9 @@ import java.io.IOException;
import java.net.InetAddress;
import java.net.MalformedURLException;
import de.anomic.net.URL;
import de.anomic.plasma.cache.IResourceInfo;
import de.anomic.plasma.cache.ResourceInfoFactory;
import java.util.Date;
import java.util.HashSet;
import java.util.Iterator;
@ -87,7 +88,7 @@ public final class plasmaHTCache {
private static final int stackLimit = 150; // if we exceed that limit, we do not check idle
public static final long oneday = 1000 * 60 * 60 * 24; // milliseconds of a day
private kelondroMap responseHeaderDB = null;
kelondroMap responseHeaderDB = null;
private final LinkedList cacheStack;
private final TreeMap cacheAge; // a <date+hash, cache-path> - relation
public long curCacheSize;
@ -96,12 +97,17 @@ public final class plasmaHTCache {
public final serverLog log;
public static final HashSet filesInUse = new HashSet(); // can we delete this file
private ResourceInfoFactory objFactory;
public plasmaHTCache(File htCachePath, long maxCacheSize, int bufferkb, long preloadTime) {
// this.switchboard = switchboard;
this.log = new serverLog("HTCACHE");
this.cachePath = htCachePath;
// create the object factory
this.objFactory = new ResourceInfoFactory();
// reset old HTCache ?
String[] list = this.cachePath.list();
if (list != null) {
@ -229,10 +235,6 @@ public final class plasmaHTCache {
}
}
public void storeHeader(String urlHash, httpHeader responseHeader) throws IOException {
this.responseHeaderDB.set(urlHash, responseHeader);
}
/**
* This method changes the HTCache size.<br>
* @param new cache size in bytes
@ -249,7 +251,7 @@ public final class plasmaHTCache {
return (this.curCacheSize >= this.maxCacheSize) ? 0 : this.maxCacheSize - this.curCacheSize;
}
public boolean writeFile(URL url, byte[] array) {
public boolean writeResourceContent(URL url, byte[] array) {
if (array == null) return false;
File file = getCachePath(url);
try {
@ -445,10 +447,24 @@ public final class plasmaHTCache {
return prefix + s.substring(0, p);
}
public httpHeader getCachedResponse(String urlHash) throws IOException {
/**
* Returns an object containing metadata about a cached resource
* @param url the url of the resource
* @return an {@link IResourceInfo info object}
* @throws Exception if the info object could not be created, e.g. if the protocol is not supported
*/
public IResourceInfo loadResourceInfo(URL url) throws Exception {
// getting the URL hash
String urlHash = indexURL.urlHash(url.toNormalform());
// loading data from database
Map hdb = this.responseHeaderDB.get(urlHash);
if (hdb == null) return null;
return new httpHeader(null, hdb);
// generate the cached object
IResourceInfo cachedObj = this.objFactory.buildResourceInfoObj(url, hdb);
return cachedObj;
}
public boolean full() {
@ -459,18 +475,17 @@ public final class plasmaHTCache {
return (this.cacheStack.size() == 0);
}
public static boolean isPicture(httpHeader response) {
Object ct = response.get(httpHeader.CONTENT_TYPE);
if (ct == null) return false;
return ((String)ct).toUpperCase().startsWith("IMAGE");
public static boolean isPicture(String mimeType) {
if (mimeType == null) return false;
return mimeType.toUpperCase().startsWith("IMAGE");
}
public static boolean isText(httpHeader response) {
public static boolean isText(String mimeType) {
// Object ct = response.get(httpHeader.CONTENT_TYPE);
// if (ct == null) return false;
// String t = ((String)ct).toLowerCase();
// return ((t.startsWith("text")) || (t.equals("application/xhtml+xml")));
return plasmaParser.supportedMimeTypesContains(response.mime());
return plasmaParser.supportedMimeTypesContains(mimeType);
}
public static boolean noIndexingURL(String urlString) {
@ -568,9 +583,8 @@ public final class plasmaHTCache {
}
if (port < 0) {
return new File(this.cachePath, protocol + "/" + host + path);
} else {
return new File(this.cachePath, protocol + "/" + host + "!" + port + path);
}
return new File(this.cachePath, protocol + "/" + host + "!" + port + path);
}
/**
@ -663,7 +677,7 @@ public final class plasmaHTCache {
return null;
}
public byte[] loadResource(URL url) {
public byte[] loadResourceContent(URL url) {
// load the url as resource from the cache
File f = getCachePath(url);
if (f.exists()) try {
@ -690,12 +704,30 @@ public final class plasmaHTCache {
(ls.indexOf("memberlist.php?sid=") >= 0));
}
public Entry newEntry(Date initDate, int depth, URL url, String name,
httpHeader requestHeader,
String responseStatus, httpHeader responseHeader,
String initiator,
plasmaCrawlProfile.entry profile) {
return new Entry(initDate, depth, url, name, requestHeader, responseStatus, responseHeader, initiator, profile);
public Entry newEntry(
Date initDate,
int depth,
URL url,
String name,
//httpHeader requestHeader,
String responseStatus,
//httpHeader responseHeader,
IResourceInfo docInfo,
String initiator,
plasmaCrawlProfile.entry profile
) {
return new Entry(
initDate,
depth,
url,
name,
//requestHeader,
responseStatus,
//responseHeader,
docInfo,
initiator,
profile
);
}
public final class Entry {
@ -703,9 +735,9 @@ public final class plasmaHTCache {
// the class objects
private Date initDate; // the date when the request happened; will be used as a key
private int depth; // the depth of prefetching
private httpHeader requestHeader; // we carry also the header to prevent too many file system access
// private httpHeader requestHeader; // we carry also the header to prevent too many file system access
// private httpHeader responseHeader; // we carry also the header to prevent too many file system access
private String responseStatus;
private httpHeader responseHeader; // we carry also the header to prevent too many file system access
private File cacheFile; // the cache file
private byte[] cacheArray; // or the cache as byte-array
private URL url;
@ -719,15 +751,21 @@ public final class plasmaHTCache {
private plasmaCrawlProfile.entry profile;
private String initiator;
/**
* protocolspecific information about the resource
*/
private IResourceInfo resInfo;
protected Object clone() throws CloneNotSupportedException {
return new Entry(
this.initDate,
this.depth,
this.url,
this.name,
this.requestHeader,
//this.requestHeader,
this.responseStatus,
this.responseHeader,
//this.responseHeader,
this.resInfo,
this.initiator,
this.profile
);
@ -737,15 +775,21 @@ public final class plasmaHTCache {
int depth,
URL url,
String name,
httpHeader requestHeader,
//httpHeader requestHeader,
String responseStatus,
httpHeader responseHeader,
//httpHeader responseHeader,
IResourceInfo resourceInfo,
String initiator,
plasmaCrawlProfile.entry profile
) {
if (resourceInfo == null){
System.out.println("Content information object is null. " + url);
System.exit(0);
}
this.resInfo = resourceInfo;
// normalize url
// serverLog.logFine("PLASMA", "Entry: URL=" + url.toString());
this.nomalizedURLString = url.toNormalform();
try {
@ -761,28 +805,17 @@ public final class plasmaHTCache {
// assigned:
this.initDate = initDate;
this.depth = depth;
this.requestHeader = requestHeader;
//this.requestHeader = requestHeader;
this.responseStatus = responseStatus;
this.responseHeader = responseHeader;
//this.responseHeader = responseHeader;
this.profile = profile;
this.initiator = (initiator == null) ? null : ((initiator.length() == 0) ? null : initiator);
// calculated:
if (responseHeader == null) {
try {
throw new RuntimeException("RESPONSE HEADER = NULL");
} catch (Exception e) {
System.out.println("RESPONSE HEADER = NULL in " + url);
e.printStackTrace();
System.exit(0);
}
this.lastModified = new Date(serverDate.correctedUTCTime());
} else {
this.lastModified = responseHeader.lastModified();
if (this.lastModified == null) this.lastModified = new Date(serverDate.correctedUTCTime()); // does not exist in header
}
this.doctype = indexEntryAttribute.docType(responseHeader.mime());
// getting the last modified date
this.lastModified = resourceInfo.getModificationDate();
// getting the doctype
this.doctype = indexEntryAttribute.docType(resourceInfo.getMimeType());
if (this.doctype == indexEntryAttribute.DT_UNKNOWN) this.doctype = indexEntryAttribute.docType(url);
this.language = indexEntryAttribute.language(url);
@ -822,12 +855,7 @@ public final class plasmaHTCache {
}
public URL referrerURL() {
if (this.requestHeader == null) return null;
try {
return new URL((String) this.requestHeader.get(httpHeader.REFERER, ""));
} catch (Exception e) {
return null;
}
return (this.resInfo==null)?null:this.resInfo.getRefererUrl();
}
public File cacheFile() {
@ -846,27 +874,36 @@ public final class plasmaHTCache {
// return this.requestHeader;
// }
public httpHeader responseHeader() {
return this.responseHeader;
// public httpHeader responseHeader() {
// return this.responseHeader;
// }
public IResourceInfo getDocumentInfo() {
return this.resInfo;
}
public boolean writeResourceInfo() throws IOException {
assert(this.nomalizedURLHash != null) : "URL Hash is null";
if (this.resInfo == null) return false;
plasmaHTCache.this.responseHeaderDB.set(this.nomalizedURLHash, this.resInfo.getMap());
return true;
}
public String getMimeType() {
return (this.responseHeader == null) ? null : this.responseHeader.mime();
return (this.resInfo == null) ? null : this.resInfo.getMimeType();
}
public Date ifModifiedSince() {
return (this.requestHeader == null) ? null : this.requestHeader.ifModifiedSince();
return (this.resInfo == null) ? null : this.resInfo.ifModifiedSince();
}
public boolean requestWithCookie() {
return (this.requestHeader == null) ? false : this.requestHeader.containsKey(httpHeader.COOKIE);
return (this.resInfo == null) ? false : this.resInfo.requestWithCookie();
}
public boolean requestProhibitsIndexing() {
return (this.requestHeader == null)
? false
: this.requestHeader.containsKey(httpHeader.X_YACY_INDEX_CONTROL) &&
((String)this.requestHeader.get(httpHeader.X_YACY_INDEX_CONTROL)).toUpperCase().equals("NO-INDEX");
return (this.resInfo == null) ? false : this.resInfo.requestProhibitsIndexing();
}
/*
@ -878,9 +915,10 @@ public final class plasmaHTCache {
// the following three methods for cache read/write granting shall be as loose as possible
// but also as strict as necessary to enable caching of most items
/**
* @return NULL if the answer is TRUE, in case of FALSE, the reason as String is returned
*/
public String shallStoreCacheForProxy() {
// returns NULL if the answer is TRUE
// in case of FALSE, the reason as String is returned
// check profile (disabled: we will check this in the plasmaSwitchboard)
//if (!this.profile.storeHTCache()) { return "storage_not_wanted"; }
@ -889,8 +927,11 @@ public final class plasmaHTCache {
// if the storage was requested by prefetching, the request map is null
// check status code
if (!(this.responseStatus.startsWith("200") ||
this.responseStatus.startsWith("203"))) { return "bad_status_" + this.responseStatus.substring(0,3); }
if ((this.resInfo != null) && (!this.resInfo.validResponseStatus(this.responseStatus))) {
return "bad_status_" + this.responseStatus.substring(0,3);
}
// if (!(this.responseStatus.startsWith("200") ||
// this.responseStatus.startsWith("203"))) { return "bad_status_" + this.responseStatus.substring(0,3); }
// check storage location
// sometimes a file name is equal to a path name in the same directory;
@ -905,62 +946,10 @@ public final class plasmaHTCache {
if (isPOST(this.nomalizedURLString) && !this.profile.crawlingQ()) { return "dynamic_post"; }
if (isCGI(this.nomalizedURLString)) { return "dynamic_cgi"; }
if (this.requestHeader != null) {
// -authorization cases in request
// authorization makes pages very individual, and therefore we cannot use the
// content in the cache
if (this.requestHeader.containsKey(httpHeader.AUTHORIZATION)) { return "personalized"; }
// -ranges in request and response
// we do not cache partial content
if (this.requestHeader.containsKey(httpHeader.RANGE)) { return "partial"; }
}
// -ranges in request and response
// we do not cache partial content
if (this.responseHeader != null && this.responseHeader.containsKey(httpHeader.CONTENT_RANGE)) { return "partial"; }
// -if-modified-since in request
// we do not care about if-modified-since, because this case only occurres if the
// cache file does not exist, and we need as much info as possible for the indexing
// -cookies in request
// we do not care about cookies, because that would prevent loading more pages
// from one domain once a request resulted in a client-side stored cookie
// -set-cookie in response
// we do not care about cookies in responses, because that info comes along
// any/many pages from a server and does not express the validity of the page
// in modes of life-time/expiration or individuality
// -pragma in response
// if we have a pragma non-cache, we don't cache. usually if this is wanted from
// the server, it makes sense
String cacheControl = (String) this.responseHeader.get(httpHeader.PRAGMA);
if (cacheControl != null && cacheControl.trim().toUpperCase().equals("NO-CACHE")) { return "controlled_no_cache"; }
// -expires in response
// we do not care about expires, because at the time this is called the data is
// obvious valid and that header info is used in the indexing later on
// -cache-control in response
// the cache-control has many value options.
cacheControl = (String) this.responseHeader.get(httpHeader.CACHE_CONTROL);
if (cacheControl != null) {
cacheControl = cacheControl.trim().toUpperCase();
if (cacheControl.startsWith("MAX-AGE=")) {
// we need also the load date
Date date = this.responseHeader.date();
if (date == null) return "stale_no_date_given_in_response";
try {
long ttl = 1000 * Long.parseLong(cacheControl.substring(8)); // milliseconds to live
if (serverDate.correctedUTCTime() - date.getTime() > ttl) {
//System.out.println("***not indexed because cache-control");
return "stale_expired";
}
} catch (Exception e) {
return "stale_error_" + e.getMessage() + ")";
}
}
if (this.resInfo != null) {
return this.resInfo.shallStoreCacheForProxy();
}
return null;
}
@ -971,146 +960,17 @@ public final class plasmaHTCache {
public boolean shallUseCacheForProxy() {
// System.out.println("SHALL READ CACHE: requestHeader = " + requestHeader.toString() + ", responseHeader = " + responseHeader.toString());
String cacheControl;
if (this.requestHeader != null) {
// -authorization cases in request
if (this.requestHeader.containsKey(httpHeader.AUTHORIZATION)) { return false; }
// -ranges in request
// we do not cache partial content
if (this.requestHeader.containsKey(httpHeader.RANGE)) { return false; }
// if the client requests a un-cached copy of the resource ...
cacheControl = (String) this.requestHeader.get(httpHeader.PRAGMA);
if (cacheControl != null && cacheControl.trim().toUpperCase().equals("NO-CACHE")) { return false; }
cacheControl = (String) this.requestHeader.get(httpHeader.CACHE_CONTROL);
if (cacheControl != null) {
cacheControl = cacheControl.trim().toUpperCase();
if (cacheControl.startsWith("NO-CACHE") || cacheControl.startsWith("MAX-AGE=0")) { return false; }
}
}
// -CGI access in request
// CGI access makes the page very individual, and therefore not usable in caches
if (isPOST(this.nomalizedURLString)) { return false; }
if (isCGI(this.nomalizedURLString)) { return false; }
// -if-modified-since in request
// The entity has to be transferred only if it has
// been modified since the date given by the If-Modified-Since header.
if (this.requestHeader.containsKey(httpHeader.IF_MODIFIED_SINCE)) {
// checking this makes only sense if the cached response contains
// a Last-Modified field. If the field does not exist, we go the safe way
if (!this.responseHeader.containsKey(httpHeader.LAST_MODIFIED)) { return false; }
// parse date
Date d1, d2;
d2 = this.responseHeader.lastModified(); if (d2 == null) { d2 = new Date(serverDate.correctedUTCTime()); }
d1 = this.requestHeader.ifModifiedSince(); if (d1 == null) { d1 = new Date(serverDate.correctedUTCTime()); }
// finally, we shall treat the cache as stale if the modification time is after the if-.. time
if (d2.after(d1)) { return false; }
}
if (!isPicture(this.responseHeader)) {
// -cookies in request
// unfortunately, we should reload in case of a cookie
// but we think that pictures can still be considered as fresh
// -set-cookie in cached response
// this is a similar case as for COOKIE.
if (this.requestHeader.containsKey(httpHeader.COOKIE) ||
this.responseHeader.containsKey(httpHeader.SET_COOKIE) ||
this.responseHeader.containsKey(httpHeader.SET_COOKIE2)) {
return false; // too strong
}
}
// -pragma in cached response
// logically, we would not need to care about no-cache pragmas in cached response headers,
// because they cannot exist since they are not written to the cache.
// So this IF should always fail..
cacheControl = (String) this.responseHeader.get(httpHeader.PRAGMA);
if (cacheControl != null && cacheControl.trim().toUpperCase().equals("NO-CACHE")) { return false; }
// see for documentation also:
// http://www.web-caching.com/cacheability.html
// http://vancouver-webpages.com/CacheNow/
// look for freshnes information
// if we don't have any freshnes indication, we treat the file as stale.
// no handle for freshness control:
// -expires in cached response
// the expires value gives us a very easy hint when the cache is stale
Date expires = this.responseHeader.expires();
if (expires != null) {
// System.out.println("EXPIRES-TEST: expires=" + expires + ", NOW=" + serverDate.correctedGMTDate() + ", url=" + url);
if (expires.before(new Date(serverDate.correctedUTCTime()))) { return false; }
}
Date lastModified = this.responseHeader.lastModified();
cacheControl = (String) this.responseHeader.get(httpHeader.CACHE_CONTROL);
if (cacheControl == null && lastModified == null && expires == null) { return false; }
// -lastModified in cached response
// we can apply a TTL (Time To Live) heuristic here. We call the time delta between the last read
// of the file and the last modified date as the age of the file. If we consider the file as
// middel-aged then, the maximum TTL would be cache-creation plus age.
// This would be a TTL factor of 100% we want no more than 10% TTL, so that a 10 month old cache
// file may only be treated as fresh for one more month, not more.
Date date = this.responseHeader.date();
if (lastModified != null) {
if (date == null) { date = new Date(serverDate.correctedUTCTime()); }
long age = date.getTime() - lastModified.getTime();
if (age < 0) { return false; }
// TTL (Time-To-Live) is age/10 = (d2.getTime() - d1.getTime()) / 10
// the actual living-time is serverDate.correctedGMTDate().getTime() - d2.getTime()
// therefore the cache is stale, if serverDate.correctedGMTDate().getTime() - d2.getTime() > age/10
if (serverDate.correctedUTCTime() - date.getTime() > age / 10) { return false; }
if (this.resInfo != null) {
return this.resInfo.shallUseCacheForProxy();
}
// -cache-control in cached response
// the cache-control has many value options.
if (cacheControl != null) {
cacheControl = cacheControl.trim().toUpperCase();
if (cacheControl.startsWith("PRIVATE") ||
cacheControl.startsWith("NO-CACHE") ||
cacheControl.startsWith("NO-STORE")) {
// easy case
return false;
// } else if (cacheControl.startsWith("PUBLIC")) {
// // ok, do nothing
} else if (cacheControl.startsWith("MAX-AGE=")) {
// we need also the load date
if (date == null) { return false; }
try {
final long ttl = 1000 * Long.parseLong(cacheControl.substring(8)); // milliseconds to live
if (serverDate.correctedUTCTime() - date.getTime() > ttl) {
return false;
}
} catch (Exception e) {
return false;
}
}
}
return true;
}
} // class Entry
/*
public static void main(String[] args) {
//String[] s = TimeZone.getAvailableIDs();
//for (int i = 0; i < s.length; i++) System.out.println("ZONE=" + s[i]);
Calendar c = GregorianCalendar.getInstance();
int zoneOffset = c.get(Calendar.ZONE_OFFSET)/(60*60*1000);
int DSTOffset = c.get(Calendar.DST_OFFSET)/(60*60*1000);
System.out.println("This Offset = " + (zoneOffset + DSTOffset));
for (int i = 0; i < 12; i++) {
c = new GregorianCalendar(TimeZone.getTimeZone("Etc/GMT-" + i));
//c.setTimeZone(TimeZone.getTimeZone("Etc/GMT+0"));
System.out.println("Zone offset: "+
c.get(Calendar.ZONE_OFFSET)/(60*60*1000));
System.out.println(c.get(GregorianCalendar.HOUR) + ", " + c.getTime() + ", " + c.getTimeInMillis());
}
}
**/
}

@ -44,6 +44,7 @@ package de.anomic.plasma;
import java.io.IOException;
import de.anomic.net.URL;
import de.anomic.plasma.cache.IResourceInfo;
import de.anomic.plasma.crawler.http.CrawlWorker;
import java.util.Enumeration;
@ -167,15 +168,15 @@ public class plasmaSnippetCache {
// if the snippet is not in the cache, we can try to get it from the htcache
byte[] resource = null;
httpHeader header = null;
IResourceInfo docInfo = null;
try {
resource = cacheManager.loadResource(url);
resource = this.cacheManager.loadResourceContent(url);
if ((fetchOnline) && (resource == null)) {
plasmaHTCache.Entry entry = loadResourceFromWeb(url, 5000);
if (entry != null) {
header = entry.responseHeader();
docInfo = entry.getDocumentInfo();
}
resource = cacheManager.loadResource(url);
resource = this.cacheManager.loadResourceContent(url);
source = SOURCE_WEB;
}
} catch (IOException e) {
@ -185,7 +186,7 @@ public class plasmaSnippetCache {
//System.out.println("cannot load document for URL " + url);
return new result(null, ERROR_RESOURCE_LOADING, "error loading resource from web, cacheManager returned NULL");
}
plasmaParserDocument document = parseDocument(url, resource, header);
plasmaParserDocument document = parseDocument(url, resource, docInfo);
if (document == null) return new result(null, ERROR_PARSER_FAILED, "parser error/failed"); // cannot be parsed
//System.out.println("loaded document for URL " + url);
@ -360,18 +361,18 @@ public class plasmaSnippetCache {
return parseDocument(url, resource, null);
}
public plasmaParserDocument parseDocument(URL url, byte[] resource, httpHeader header) {
public plasmaParserDocument parseDocument(URL url, byte[] resource, IResourceInfo docInfo) {
try {
if (resource == null) return null;
// try to get the header from the htcache directory
if (header == null) {
if (docInfo == null) {
try {
header = this.cacheManager.getCachedResponse(indexURL.urlHash(url));
} catch (IOException e) {}
docInfo = this.cacheManager.loadResourceInfo(url);
} catch (Exception e) {}
}
if (header == null) {
if (docInfo == null) {
String filename = this.cacheManager.getCachePath(url).getName();
int p = filename.lastIndexOf('.');
if ( // if no extension is available
@ -394,8 +395,8 @@ public class plasmaSnippetCache {
}
return null;
}
if (plasmaParser.supportedMimeTypesContains(header.mime())) {
return this.parser.parseSource(url, header.mime(), resource);
if (plasmaParser.supportedMimeTypesContains(docInfo.getMimeType())) {
return this.parser.parseSource(url, docInfo.getMimeType(), resource);
}
return null;
} catch (InterruptedException e) {
@ -407,10 +408,10 @@ public class plasmaSnippetCache {
public byte[] getResource(URL url, boolean fetchOnline, int socketTimeout) {
// load the url as resource from the web
try {
byte[] resource = cacheManager.loadResource(url);
byte[] resource = cacheManager.loadResourceContent(url);
if ((fetchOnline) && (resource == null)) {
loadResourceFromWeb(url, (socketTimeout < 0) ? -1 : socketTimeout);
resource = cacheManager.loadResource(url);
resource = cacheManager.loadResourceContent(url);
}
return resource;
} catch (IOException e) {

@ -829,7 +829,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
/* =========================================================================
* LOCAL IP ADDRESS CHECK
*
* check if ip is local ip address
* check if ip is local ip address // TODO: remove this protocol specific code here
* ========================================================================= */
InetAddress hostAddress = httpc.dnsResolve(entry.url().getHost());
if (hostAddress == null) {
@ -857,8 +857,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
(doIndexing && isSupportedContent)
) {
// store response header
if (entry.responseHeader() != null) {
this.cacheManager.storeHeader(entry.urlHash(), entry.responseHeader());
if (entry.writeResourceInfo()) {
this.log.logInfo("WROTE HEADER for " + entry.cacheFile());
}
@ -868,7 +867,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
} else {
String error = entry.shallStoreCacheForProxy();
if (error == null) {
this.cacheManager.writeFile(entry.url(), entry.cacheArray());
this.cacheManager.writeResourceContent(entry.url(), entry.cacheArray());
this.log.logFine("WROTE FILE (" + entry.cacheArray().length + " bytes) for " + entry.cacheFile());
} else {
this.log.logFine("WRITE OF FILE " + entry.cacheFile() + " FORBIDDEN: " + error);

@ -44,28 +44,27 @@
package de.anomic.plasma;
import de.anomic.http.httpHeader;
import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Date;
import de.anomic.index.indexURL;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroStack;
import de.anomic.kelondro.kelondroRow;
import de.anomic.kelondro.kelondroStack;
import de.anomic.net.URL;
import de.anomic.plasma.cache.IResourceInfo;
import de.anomic.server.logging.serverLog;
import de.anomic.server.serverDate;
import de.anomic.yacy.yacySeedDB;
import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import de.anomic.net.URL;
import java.util.ArrayList;
import java.util.Date;
public class plasmaSwitchboardQueue {
private kelondroStack sbQueueStack;
private plasmaCrawlProfile profiles;
private plasmaHTCache htCache;
plasmaHTCache htCache;
private plasmaCrawlLURL lurls;
private File sbQueueStackPath;
@ -191,7 +190,7 @@ public class plasmaSwitchboardQueue {
// computed values
private plasmaCrawlProfile.entry profileEntry;
private httpHeader responseHeader;
private IResourceInfo contentInfo;
private URL referrerURL;
public Entry(URL url, String referrer, Date ifModifiedSince, boolean requestWithCookie,
@ -206,7 +205,7 @@ public class plasmaSwitchboardQueue {
this.anchorName = (anchorName==null)?"":anchorName.trim();
this.profileEntry = null;
this.responseHeader = null;
this.contentInfo = null;
this.referrerURL = null;
}
@ -227,7 +226,7 @@ public class plasmaSwitchboardQueue {
this.anchorName = row.getColString(7, "UTF-8");
this.profileEntry = null;
this.responseHeader = null;
this.contentInfo = null;
this.referrerURL = null;
}
@ -248,7 +247,7 @@ public class plasmaSwitchboardQueue {
this.anchorName = (row[7] == null) ? null : (new String(row[7], "UTF-8")).trim();
this.profileEntry = null;
this.responseHeader = null;
this.contentInfo = null;
this.referrerURL = null;
}
@ -306,32 +305,24 @@ public class plasmaSwitchboardQueue {
return profileEntry;
}
// Lazily obtains the cached metadata for this entry's URL from the HTCache and
// keeps it in a field so later calls can reuse it.
// This span interleaves the earlier header-based accessor (responseHeader) with
// its IResourceInfo-based successor (getCachedObjectInfo).
// Returns null when loading throws; the failure is logged at severe level and
// nothing is stored, so the next call attempts the load again.
// NOTE(review): the log text still says "responseHeader: failed to get header"
// although the accessor now loads an IResourceInfo -- confirm whether the
// message should be updated.
private httpHeader responseHeader() {
if (responseHeader == null) try {
responseHeader = htCache.getCachedResponse(indexURL.urlHash(url));
} catch (IOException e) {
private IResourceInfo getCachedObjectInfo() {
if (this.contentInfo == null) try {
this.contentInfo = plasmaSwitchboardQueue.this.htCache.loadResourceInfo(this.url);
} catch (Exception e) {
serverLog.logSevere("PLASMA", "responseHeader: failed to get header", e);
return null;
}
return responseHeader;
return this.contentInfo;
}
// Returns the MIME type reported by the cached metadata of this entry,
// or null when no cached metadata could be obtained.
public String getMimeType() {
httpHeader headers = this.responseHeader();
return (headers == null) ? null : headers.mime();
IResourceInfo info = this.getCachedObjectInfo();
return (info == null) ? null : info.getMimeType();
}
// Returns the modification date taken from the cached metadata of this entry.
// When no cached metadata is available, the current time is returned instead.
// NOTE(review): the header-based lines fall back from lastModified() to date()
// and finally to new Date(); the IResourceInfo-based return has no such fallback
// here, so a null from info.getModificationDate() would be handed to the caller
// -- confirm that IResourceInfo never returns null.
public Date getModificationDate() {
Date docDate = null;
httpHeader headers = this.responseHeader();
if (headers != null) {
docDate = headers.lastModified();
if (docDate == null) docDate = headers.date();
}
if (docDate == null) docDate = new Date();
return docDate;
IResourceInfo info = this.getCachedObjectInfo();
return (info == null) ? new Date() : info.getModificationDate();
}
public URL referrerURL() {
@ -360,6 +351,8 @@ public class plasmaSwitchboardQueue {
* this method returns null if the answer is 'YES'!
* if the answer is 'NO' (do not index), it returns a string with the reason
* to reject the crawling demand in clear text
*
* This function is used by plasmaSwitchboard#processResourceStack
*/
public final String shallIndexCacheForProxy() {
if (profile() == null) {
@ -402,91 +395,8 @@ public class plasmaSwitchboardQueue {
return "Dynamic_(Requested_With_Cookie)";
}
// -set-cookie in response
// the set-cookie from the server does not indicate that the content is special
// thus we do not care about it here for indexing
if (responseHeader() != null) {
// a picture cannot be indexed
if (plasmaHTCache.isPicture(responseHeader())) {
return "Media_Content_(Picture)";
}
if (!plasmaHTCache.isText(responseHeader())) {
return "Media_Content_(not_text)";
}
// -if-modified-since in request
// if the page is fresh at the very moment we can index it
if ((ifModifiedSince != null) && (responseHeader().containsKey(httpHeader.LAST_MODIFIED))) {
// parse date
Date d = responseHeader().lastModified();
if (d == null) {
d = new Date(serverDate.correctedUTCTime());
}
// finally, we shall treat the cache as stale if the modification time is after the if-.. time
if (d.after(ifModifiedSince)) {
//System.out.println("***not indexed because if-modified-since");
return "Stale_(Last-Modified>Modified-Since)";
}
}
// -pragma in cached response
if (responseHeader().containsKey(httpHeader.PRAGMA) &&
((String) responseHeader().get(httpHeader.PRAGMA)).toUpperCase().equals("NO-CACHE")) {
return "Denied_(pragma_no_cache)";
}
// see for documentation also:
// http://www.web-caching.com/cacheability.html
// look for freshness information
// -expires in cached response
// the expires value gives us a very easy hint when the cache is stale
// sometimes, the expires date is set to the past to prevent that a page is cached
// we use that information to see if we should index it
final Date expires = responseHeader().expires();
if (expires != null && expires.before(new Date(serverDate.correctedUTCTime()))) {
return "Stale_(Expired)";
}
// -lastModified in cached response
// this information is too weak to use it to prevent indexing
// even if we can apply a TTL heuristic for cache usage
// -cache-control in cached response
// the cache-control has many value options.
String cacheControl = (String) responseHeader.get(httpHeader.CACHE_CONTROL);
if (cacheControl != null) {
cacheControl = cacheControl.trim().toUpperCase();
/* we have the following cases for cache-control:
"public" -- can be indexed
"private", "no-cache", "no-store" -- cannot be indexed
"max-age=<delta-seconds>" -- stale/fresh dependent on date
*/
if (cacheControl.startsWith("PRIVATE") ||
cacheControl.startsWith("NO-CACHE") ||
cacheControl.startsWith("NO-STORE")) {
// easy case
return "Stale_(denied_by_cache-control=" + cacheControl + ")";
// } else if (cacheControl.startsWith("PUBLIC")) {
// // ok, do nothing
} else if (cacheControl.startsWith("MAX-AGE=")) {
// we need also the load date
final Date date = responseHeader().date();
if (date == null) {
return "Stale_(no_date_given_in_response)";
}
try {
final long ttl = 1000 * Long.parseLong(cacheControl.substring(8)); // milliseconds to live
if (serverDate.correctedUTCTime() - date.getTime() > ttl) {
//System.out.println("***not indexed because cache-control");
return "Stale_(expired_by_cache-control)";
}
} catch (Exception e) {
return "Error_(" + e.getMessage() + ")";
}
}
}
if (getCachedObjectInfo() != null) {
return this.getCachedObjectInfo().shallIndexCacheForProxy();
}
return null;
}
@ -496,6 +406,8 @@ public class plasmaSwitchboardQueue {
* this method returns null if the answer is 'YES'!
* if the answer is 'NO' (do not index), it returns a string with the reason
* to reject the crawling demand in clear text
*
* This function is used by plasmaSwitchboard#processResourceStack
*/
public final String shallIndexCacheForCrawler() {
if (profile() == null) {
@ -520,9 +432,9 @@ public class plasmaSwitchboardQueue {
// we checked that in shallStoreCache
// a picture cannot be indexed
if (responseHeader() != null) {
if (plasmaHTCache.isPicture(responseHeader())) { return "Media_Content_(Picture)"; }
if (!plasmaHTCache.isText(responseHeader())) { return "Media_Content_(not_text)"; }
if (getCachedObjectInfo() != null) {
String status = this.getCachedObjectInfo().shallIndexCacheForProxy();
if (status != null) return status;
}
if (plasmaHTCache.noIndexingURL(nURL)) { return "Media_Content_(forbidden)"; }

Loading…
Cancel
Save