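Fix NullPointerException when profile() returns null (add null checks in shallIndexCacheForProxy() and shallIndexCacheForCrawler()); other minor changes.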

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2009 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
borg-0300 19 years ago
parent 7f167be945
commit 92110aea32

@ -44,21 +44,22 @@
package de.anomic.plasma;
import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Date;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.http.httpHeader;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroStack;
import de.anomic.server.serverDate;
import de.anomic.server.logging.serverLog;
import de.anomic.server.serverDate;
import de.anomic.yacy.yacySeedDB;
import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Date;
public class plasmaSwitchboardQueue {
private kelondroStack sbQueueStack;
@ -327,16 +328,26 @@ public class plasmaSwitchboardQueue {
* if the answer is 'NO' (do not index), it returns a string with the reason
* to reject the crawling demand in clear text
*/
public String shallIndexCacheForProxy() {
public final String shallIndexCacheForProxy() {
if (profile() == null) {
return "shallIndexCacheForProxy: profile() is null !";
}
// check profile
if (!profile().localIndexing()) { return "Indexing_Not_Allowed"; }
if (!profile().localIndexing()) {
return "Indexing_Not_Allowed";
}
String nURL = normalizedURLString();
// -CGI access in request
// CGI access makes the page very individual, and therefore not usable in caches
if (!profile().crawlingQ()) {
if (plasmaHTCache.isPOST(nURL)) { return "Dynamic_(POST)"; }
if (plasmaHTCache.isCGI(nURL)) { return "Dynamic_(CGI)"; }
if (plasmaHTCache.isPOST(nURL)) {
return "Dynamic_(POST)";
}
if (plasmaHTCache.isCGI(nURL)) {
return "Dynamic_(CGI)";
}
}
// -authorization cases in request
@ -346,7 +357,9 @@ public class plasmaSwitchboardQueue {
// we checked that in shallStoreCache
// a picture cannot be indexed
if (plasmaHTCache.noIndexingURL(nURL)) { return "Media_Content_(forbidden)"; }
if (plasmaHTCache.noIndexingURL(nURL)) {
return "Media_Content_(forbidden)";
}
// -cookies in request
// unfortunately, we cannot index pages which have been requested with a cookie
@ -361,15 +374,21 @@ public class plasmaSwitchboardQueue {
// thus we do not care about it here for indexing
if (responseHeader() != null) {
// a picture cannot be indexed
if (plasmaHTCache.isPicture(responseHeader())) return "Media_Content_(Picture)";
if (!plasmaHTCache.isText(responseHeader())) return "Media_Content_(not_text)";
if (plasmaHTCache.isPicture(responseHeader())) {
return "Media_Content_(Picture)";
}
if (!plasmaHTCache.isText(responseHeader())) {
return "Media_Content_(not_text)";
}
// -if-modified-since in request
// if the page is fresh at the very moment we can index it
if ((ifModifiedSince != null) && (responseHeader().containsKey(httpHeader.LAST_MODIFIED))) {
// parse date
Date d = responseHeader().lastModified();
if (d == null) d = new Date(serverDate.correctedUTCTime());
if (d == null) {
d = new Date(serverDate.correctedUTCTime());
}
// finally, we shall treat the cache as stale if the modification time is after the if-.. time
if (d.after(ifModifiedSince)) {
//System.out.println("***not indexed because if-modified-since");
@ -392,9 +411,9 @@ public class plasmaSwitchboardQueue {
// the expires value gives us a very easy hint when the cache is stale
// sometimes, the expires date is set to the past to prevent that a page is cached
// we use that information to see if we should index it
Date expires = responseHeader().expires();
if (expires != null) {
if (expires.before(new Date(serverDate.correctedUTCTime()))) return "Stale_(Expired)";
final Date expires = responseHeader().expires();
if (expires != null && expires.before(new Date(serverDate.correctedUTCTime()))) {
return "Stale_(Expired)";
}
// -lastModified in cached response
@ -415,15 +434,17 @@ public class plasmaSwitchboardQueue {
cacheControl.startsWith("NO-CACHE") ||
cacheControl.startsWith("NO-STORE")) {
// easy case
return "Stale_(denied_by_cache-control=" + cacheControl+ ")";
return "Stale_(denied_by_cache-control=" + cacheControl + ")";
// } else if (cacheControl.startsWith("PUBLIC")) {
// // ok, do nothing
} else if (cacheControl.startsWith("MAX-AGE=")) {
// we need also the load date
Date date = responseHeader().date();
if (date == null) return "Stale_(no_date_given_in_response)";
final Date date = responseHeader().date();
if (date == null) {
return "Stale_(no_date_given_in_response)";
}
try {
long ttl = 1000 * Long.parseLong(cacheControl.substring(8)); // milliseconds to live
final long ttl = 1000 * Long.parseLong(cacheControl.substring(8)); // milliseconds to live
if (serverDate.correctedUTCTime() - date.getTime() > ttl) {
//System.out.println("***not indexed because cache-control");
return "Stale_(expired_by_cache-control)";
@ -443,11 +464,15 @@ public class plasmaSwitchboardQueue {
* if the answer is 'NO' (do not index), it returns a string with the reason
* to reject the crawling demand in clear text
*/
public String shallIndexCacheForCrawler() {
public final String shallIndexCacheForCrawler() {
if (profile() == null) {
return "shallIndexCacheForCrawler: profile() is null !";
}
// check profile
if (!profile().localIndexing()) { return "Indexing_Not_Allowed"; }
String nURL = normalizedURLString();
final String nURL = normalizedURLString();
// -CGI access in request
// CGI access makes the page very individual, and therefore not usable in caches
if (!profile().crawlingQ()) {

Loading…
Cancel
Save