*) Adding yacy specific X-YACY-Index-Control header which can be used by clients

to prevent yacy from indexing the response that belongs to a request where
   X-YACY-Index-Control is set to "no-index"

*) Bugfix for Seed-List download via Remote Proxy.
   Now the pragma and cache-control http headers of the request are properly set to "no-cache" 
   See: http://www.yacy-forum.de/viewtopic.php?p=11639#11639

*) Bugfix for http-Proxy
   yacy ignored the "no-cache" pragma and cache-control http headers that were sent in requests.
   Now, these request headers are evaluated properly

TODO: Missing evaluation of "no-store" request headers

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@971 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
theli 20 years ago
parent 539f9e57aa
commit 9a5ab62928

@ -131,6 +131,8 @@ public final class httpHeader extends TreeMap implements Map {
public static final String X_YACY_ORIGINAL_REQUEST_LINE = "X-Original-Request-Line";
public static final String X_YACY_PREVIOUS_REQUEST_LINE = "X-Previous-Request-Line";
public static final String X_YACY_INDEX_CONTROL = "X-YACY-Index-Control";
/* =============================================================
* Constants defining http methods
* ============================================================= */

@ -1072,13 +1072,36 @@ do upload
String password,
httpRemoteProxyConfig theRemoteProxyConfig
) throws IOException {
return wget(url,timeout,user,password,theRemoteProxyConfig,null);
}
public static ArrayList wget(
URL url,
int timeout,
String user,
String password,
httpRemoteProxyConfig theRemoteProxyConfig,
httpHeader requestHeader
) throws IOException {
int port = url.getPort();
boolean ssl = url.getProtocol().equals("https");
if (port < 0) port = (ssl) ? 443: 80;
String path = url.getPath();
String query = url.getQuery();
if ((query != null) && (query.length() > 0)) path = path + "?" + query;
// splitting of the byte array into lines
byte[] a = singleGET(
url,
url.getHost(),
port,
path,
timeout,
user,
password,
theRemoteProxyConfig
ssl,
theRemoteProxyConfig,
requestHeader
);
if (a == null) return null;
@ -1099,9 +1122,20 @@ do upload
String user,
String password,
httpRemoteProxyConfig theRemoteProxyConfig
) throws IOException {
return whead(url,timeout,user,password,theRemoteProxyConfig,null);
}
public static httpHeader whead(
URL url,
int timeout,
String user,
String password,
httpRemoteProxyConfig theRemoteProxyConfig,
httpHeader requestHeader
) throws IOException {
// generate request header
httpHeader requestHeader = new httpHeader();
if (requestHeader == null) requestHeader = new httpHeader();
if ((user != null) && (password != null) && (user.length() != 0)) {
requestHeader.put(httpHeader.AUTHORIZATION, serverCodings.standardCoder.encodeBase64String(user + ":" + password));
}

@ -699,8 +699,25 @@ public final class plasmaHTCache {
return null;
}
public boolean shallUseCacheForProxy() {
// decide upon header information if a specific file should be taken from the cache or not
/**
* decide upon header information if a specific file should be taken from the cache or not
* @return
*/
public boolean shallUseCacheForProxy() {
// if the client requests a un-cached copy of the resource ...
if (
(this.requestHeader.containsKey(httpHeader.PRAGMA)) &&
(((String) this.requestHeader.get(httpHeader.PRAGMA)).toUpperCase().equals("NO-CACHE"))
) return false;
if (
(this.requestHeader.containsKey(httpHeader.CACHE_CONTROL)) &&
(
(((String) this.requestHeader.get(httpHeader.CACHE_CONTROL)).toUpperCase().startsWith("NO-CACHE")) ||
(((String) this.requestHeader.get(httpHeader.CACHE_CONTROL)).toUpperCase().startsWith("MAX-AGE=0"))
)
) return false;
//System.out.println("SHALL READ CACHE: requestHeader = " + requestHeader.toString() + ", responseHeader = " + responseHeader.toString());

@ -631,36 +631,51 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
synchronized public boolean htEntryStoreProcess(plasmaHTCache.Entry entry) throws IOException {
if (entry == null) return false;
if (entry == null) return false;
// store response header
if (entry.responseHeader != null) {
cacheManager.storeHeader(entry.nomalizedURLHash, entry.responseHeader);
log.logInfo("WROTE HEADER for " + entry.cacheFile);
this.cacheManager.storeHeader(entry.nomalizedURLHash, entry.responseHeader);
this.log.logInfo("WROTE HEADER for " + entry.cacheFile);
}
/*
* Evaluating request header:
* With the X-YACY-Index-Control header set to "no-index" a client could disallow
* yacy to index the response returned as answer to a request
*/
boolean doIndexing = true;
if (entry.requestHeader != null) {
if (
(entry.requestHeader.containsKey(httpHeader.X_YACY_INDEX_CONTROL)) &&
(((String) entry.requestHeader.get(httpHeader.X_YACY_INDEX_CONTROL)).toUpperCase().equals("NO-INDEX"))
) {
doIndexing = false;
}
}
// work off unwritten files
if (entry.cacheArray == null) {
log.logInfo("EXISTING FILE (" + entry.cacheFile.length() + " bytes) for " + entry.cacheFile);
this.log.logInfo("EXISTING FILE (" + entry.cacheFile.length() + " bytes) for " + entry.cacheFile);
} else {
String error = entry.shallStoreCacheForProxy();
if (error == null) {
cacheManager.writeFile(entry.url, entry.cacheArray);
log.logInfo("WROTE FILE (" + entry.cacheArray.length + " bytes) for " + entry.cacheFile);
this.cacheManager.writeFile(entry.url, entry.cacheArray);
this.log.logInfo("WROTE FILE (" + entry.cacheArray.length + " bytes) for " + entry.cacheFile);
} else {
log.logInfo("WRITE OF FILE " + entry.cacheFile + " FORBIDDEN: " + error);
this.log.logInfo("WRITE OF FILE " + entry.cacheFile + " FORBIDDEN: " + error);
}
}
if (plasmaParser.supportedContent(entry.url,entry.responseHeader.mime())){
if ((doIndexing) && plasmaParser.supportedContent(entry.url,entry.responseHeader.mime())){
// registering the cachefile as in use
if (entry.cacheFile.exists()) {
cacheManager.filesInUse.add(entry.cacheFile);
plasmaHTCache.filesInUse.add(entry.cacheFile);
}
// enqueue for further crawling
enQueue(sbQueue.newEntry(entry.url, plasmaURL.urlHash(entry.referrerURL()),
enQueue(this.sbQueue.newEntry(entry.url, plasmaURL.urlHash(entry.referrerURL()),
entry.requestHeader.ifModifiedSince(), entry.requestHeader.containsKey(httpHeader.COOKIE),
entry.initiator(), entry.depth, entry.profile.handle(),
entry.name()

@ -136,45 +136,49 @@ public class yacyPeerActions {
}
public void loadSeedLists() {
// uses the superseed to initialize the database with known seeds
// uses the superseed to initialize the database with known seeds
yacySeed ys;
String seedListFileURL;
yacySeed ys;
String seedListFileURL;
URL url;
ArrayList seedList;
Iterator enu;
ArrayList seedList;
Iterator enu;
int lc;
int sc = seedDB.sizeConnected();
httpHeader header;
yacyCore.log.logInfo("BOOTSTRAP: " + sc + " seeds known from previous run");
// - load the superseed: a list of URL's
disorderSet superseed = loadSuperseed(superseedFile, superseedURL);
disorderSet superseed = loadSuperseed(superseedFile, superseedURL);
// - use the superseed to further fill up the seedDB
int ssc = 0;
for (int i = 0; i < superseed.size(); i++) {
if (Thread.currentThread().isInterrupted()) break;
seedListFileURL = (String) superseed.any();
if (seedListFileURL.startsWith("http://")) {
// load the seed list
try {
for (int i = 0; i < superseed.size(); i++) {
if (Thread.currentThread().isInterrupted()) break;
seedListFileURL = (String) superseed.any();
if (seedListFileURL.startsWith("http://")) {
// load the seed list
try {
httpHeader reqHeader = new httpHeader();
reqHeader.put(httpHeader.PRAGMA,"no-cache");
reqHeader.put(httpHeader.CACHE_CONTROL,"no-cache");
url = new URL(seedListFileURL);
header = httpc.whead(url, 5000, null, null, this.sb.remoteProxyConfig);
header = httpc.whead(url, 5000, null, null, this.sb.remoteProxyConfig,reqHeader);
if ((header == null) || (header.lastModified() == null)) {
yacyCore.log.logInfo("BOOTSTRAP: seed-list URL " + seedListFileURL + " not available");
} else if ((header.age() > 86400000) && (ssc > 0)) {
yacyCore.log.logInfo("BOOTSTRAP: seed-list URL " + seedListFileURL + " too old (" + (header.age() / 86400000) + " days)");
} else {
ssc++;
seedList = httpc.wget(url, 5000, null, null, this.sb.remoteProxyConfig);
seedList = httpc.wget(url, 5000, null, null, this.sb.remoteProxyConfig,reqHeader);
enu = seedList.iterator();
lc = 0;
while (enu.hasNext()) {
ys = yacySeed.genRemoteSeed((String) enu.next(), null);
if ((ys != null) && (ys.isProper() == null) &&
((seedDB.mySeed == null) || (seedDB.mySeed.hash != ys.hash))) {
((seedDB.mySeed == null) || (seedDB.mySeed.hash != ys.hash))) {
if (connectPeer(ys, false)) lc++;
//seedDB.writeMap(ys.hash, ys.getMap(), "init");
//System.out.println("BOOTSTRAP: received peer " + ys.get(yacySeed.NAME, "anonymous") + "/" + ys.getAddress());
@ -184,13 +188,13 @@ public class yacyPeerActions {
yacyCore.log.logInfo("BOOTSTRAP: " + lc + " seeds from seed-list URL " + seedListFileURL + ", AGE=" + (header.age() / 3600000) + "h");
}
} catch (Exception e) {
// this is when wget fails; may be because of missing internet connection
// we do nothing here and go silently over it
} catch (Exception e) {
// this is when wget fails; may be because of missing internet connection
// we do nothing here and go silently over it
yacyCore.log.logSevere("BOOTSTRAP: failed to load seeds from seed-list URL " + seedListFileURL);
}
}
}
}
}
}
yacyCore.log.logInfo("BOOTSTRAP: " + (seedDB.sizeConnected() - sc) + " new seeds while bootstraping.");
}

Loading…
Cancel
Save