From 40777556c546176b6f993316583f39b0a076c6b2 Mon Sep 17 00:00:00 2001 From: theli Date: Tue, 18 Oct 2005 07:45:27 +0000 Subject: [PATCH] *) Connection Tracking - adding automatic refresh - accepts new parameter nameLookup which can be used to deactivate yacy-peer name lookup (because we have problems with this on large seed-dbs) *) ViewFile New page that can be used to view - original content - plain text content - parsed content - parsed sentences of a webpage specified by their url hash Mainly for debugging purposes at the moment *) Robots.txt Bugfix for if-modified-since usage TODO: synchronization of downloads to avoid loading the same robots-file multiple times in parallel by different threads *) Shutdown Better abortion of transferRWI and transferURL sessions on server shutdown *) Status Page Adding icon to start/stop crawling via status page git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@950 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/Connections_p.html | 1 + htroot/Connections_p.java | 26 ++- htroot/IndexCreateIndexingQueue_p.java | 2 +- htroot/Status.html | 1 + htroot/Status.java | 12 ++ htroot/Status_p.inc | 2 +- htroot/ViewFile.html | 85 ++++++++ htroot/ViewFile.java | 173 +++++++++++++++ htroot/env/grafics/start.gif | Bin 0 -> 88 bytes htroot/env/grafics/stop.gif | Bin 0 -> 90 bytes htroot/index.html | 2 +- htroot/yacy/hello.java | 15 +- htroot/yacy/transferRWI.java | 20 +- htroot/yacy/transferURL.java | 9 +- source/de/anomic/data/robotsParser.java | 11 +- source/de/anomic/data/wikiCode.java | 2 +- source/de/anomic/http/httpHeader.java | 7 + source/de/anomic/http/httpd.java | 47 +---- source/de/anomic/http/httpdFileHandler.java | 4 + .../anomic/plasma/plasmaCrawlRobotsTxt.java | 8 +- source/de/anomic/plasma/plasmaHTCache.java | 199 +++++++++--------- .../de/anomic/plasma/plasmaSnippetCache.java | 2 +- .../de/anomic/plasma/plasmaSwitchboard.java | 4 +- source/de/anomic/server/serverCore.java | 15 +- source/de/anomic/yacy/yacyClient.java 
| 10 +- 25 files changed, 473 insertions(+), 184 deletions(-) create mode 100644 htroot/ViewFile.html create mode 100644 htroot/ViewFile.java create mode 100644 htroot/env/grafics/start.gif create mode 100644 htroot/env/grafics/stop.gif diff --git a/htroot/Connections_p.html b/htroot/Connections_p.html index c3941206f..f13af8523 100644 --- a/htroot/Connections_p.html +++ b/htroot/Connections_p.html @@ -3,6 +3,7 @@ YaCy '#[clientname]#': Connection Tracking #[metas]# + #[header]# diff --git a/htroot/Connections_p.java b/htroot/Connections_p.java index 40abf88b2..e611411f7 100644 --- a/htroot/Connections_p.java +++ b/htroot/Connections_p.java @@ -64,18 +64,29 @@ import de.anomic.server.serverCore.Session; import de.anomic.yacy.yacyCore; import de.anomic.yacy.yacySeed; -public class Connections_p { +public final class Connections_p { public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch sb) { // return variable that accumulates replacements plasmaSwitchboard switchboard = (plasmaSwitchboard) sb; serverObjects prop = new serverObjects(); + // determines if name lookup should be done or not + boolean doNameLookup = true; + if (post.containsKey("nameLookup") && post.get("nameLookup","true").equals("false")) { + doNameLookup = false; + } + + // getting the virtualHost string String virtualHost = switchboard.getConfig("fileHost","localhost"); + // getting the serverCore thread serverThread httpd = switchboard.getThread("10_httpd"); + + // getting the session threadgroup ThreadGroup httpSessions = ((serverCore)httpd).getSessionThreadGroup(); + // getting the server core pool configuration GenericObjectPool.Config httpdPoolConfig = ((serverCore)httpd).getPoolConfig(); /* waiting for all threads to finish */ @@ -122,11 +133,14 @@ public class Connections_p { // determining if the source is a yacy host - yacySeed seed = yacyCore.seedDB.lookupByIP(userAddress,true,false,false); - if (seed != null) { - if ((seed.hash == 
yacyCore.seedDB.mySeed.hash) && - (!seed.get(yacySeed.PORT,"").equals(Integer.toString(userPort)))) { - seed = null; + yacySeed seed = null; + if (doNameLookup) { + seed = yacyCore.seedDB.lookupByIP(userAddress,true,false,false); + if (seed != null) { + if ((seed.hash.equals(yacyCore.seedDB.mySeed.hash)) && + (!seed.get(yacySeed.PORT,"").equals(Integer.toString(userPort)))) { + seed = null; + } } } diff --git a/htroot/IndexCreateIndexingQueue_p.java b/htroot/IndexCreateIndexingQueue_p.java index 82290feb3..691e0061d 100644 --- a/htroot/IndexCreateIndexingQueue_p.java +++ b/htroot/IndexCreateIndexingQueue_p.java @@ -166,7 +166,7 @@ public class IndexCreateIndexingQueue_p { } catch (IOException e) {} prop.put("indexing-queue_num", entryCount);//num entries in queue - prop.put("indexing-queue_totalSize", Status.bytesToString(totalSize));//num entries in queue + prop.put("indexing-queue_totalSize", bytesToString(totalSize));//num entries in queue prop.put("indexing-queue_list", entryCount); } diff --git a/htroot/Status.html b/htroot/Status.html index dd6efdbe5..474fa400f 100644 --- a/htroot/Status.html +++ b/htroot/Status.html @@ -96,6 +96,7 @@ You are in permanent mode. Attention: If you don't have a flatrate or are

#%[privateStatusTable]%# +

Last Refresh: #[date]#

#[footer]# diff --git a/htroot/Status.java b/htroot/Status.java index 5e7fb3f77..0a73738d9 100644 --- a/htroot/Status.java +++ b/htroot/Status.java @@ -48,6 +48,7 @@ import java.lang.Math; import java.text.DecimalFormat; +import java.util.Date; import java.io.File; import de.anomic.http.httpHeader; @@ -71,6 +72,16 @@ public class Status { // return variable that accumulates replacements final serverObjects prop = new serverObjects(); + if (post != null) { + if (post.containsKey("pausecrawlqueue")) { + ((plasmaSwitchboard)env).pauseCrawling(); + } else if (post.containsKey("continuecrawlqueue")) { + ((plasmaSwitchboard)env).continueCrawling(); + } + prop.put("LOCATION",""); + return prop; + } + /* versionProbe=http://www.anomic.de/AnomicHTTPProxy/release.txt superseedFile=superseed.txt @@ -252,6 +263,7 @@ public class Status { // return rewrite properties + prop.put("date",(new Date()).toString()); return prop; } diff --git a/htroot/Status_p.inc b/htroot/Status_p.inc index 3268dbb28..d516f709e 100644 --- a/htroot/Status_p.inc +++ b/htroot/Status_p.inc @@ -70,7 +70,7 @@ Loader Queue - #[loaderQueueSize]# | #[loaderQueueMax]# #(loaderPaused)#::(paused)#(/loaderPaused)# + #[loaderQueueSize]# | #[loaderQueueMax]# #(loaderPaused)#::(paused)#(/loaderPaused)#  [Details] diff --git a/htroot/ViewFile.html b/htroot/ViewFile.html new file mode 100644 index 000000000..07be4b480 --- /dev/null +++ b/htroot/ViewFile.html @@ -0,0 +1,85 @@ + + + +YaCy '#[clientname]#': View URL Content +#[metas]# + + +#[header]# +

+

View URL Content

+ +

+#(error)# + + + + + + + + + + + + + + + + + + + + + + + + + +
URL#[url]#
Hash#[hash]#
Word Count#[wordCount]#
Description#[desc]#
Size#[size]#
View as: + Original | + Plain Text | + Parsed Text | + Parsed Sentences +
+:: +No URL hash submitted. +:: +Unable to find URL Entry in DB +:: +Invalid URL +:: +Unable to download resource content. +:: +Unable to parse resource content. +#(/error)# +
+

+

+#(viewMode)# +:: +

Plain Resource Content


+ #[plainText]# +:: +

Parsed Resource Content


+ #[parsedText]# +:: +

Parsed Resource Sentences


+ + #{sentences}# + + + + + #{/sentences}# +
#[nr]##[text]#
+:: +

Original Resource Content


+ +#(/viewMode)# +

+ +#[footer]# + + \ No newline at end of file diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java new file mode 100644 index 000000000..ce94aa971 --- /dev/null +++ b/htroot/ViewFile.java @@ -0,0 +1,173 @@ +//ViewFile.java +//----------------------- +//part of YaCy +//(C) by Michael Peter Christen; mc@anomic.de +//first published on http://www.anomic.de +//Frankfurt, Germany, 2004 +// +//last major change: 12.07.2004 +// +//This program is free software; you can redistribute it and/or modify +//it under the terms of the GNU General Public License as published by +//the Free Software Foundation; either version 2 of the License, or +//(at your option) any later version. +// +//This program is distributed in the hope that it will be useful, +//but WITHOUT ANY WARRANTY; without even the implied warranty of +//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +//GNU General Public License for more details. +// +//You should have received a copy of the GNU General Public License +//along with this program; if not, write to the Free Software +//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// +//Using this software in any meaning (reading, learning, copying, compiling, +//running) means that you agree that the Author(s) is (are) not responsible +//for cost, loss of data or any harm that may be caused directly or indirectly +//by usage of this softare or this documentation. The usage of this software +//is on your own risk. The installation and usage (starting/running) of this +//software may allow other people or application to access your computer and +//any attached devices and is highly dependent on the configuration of the +//software which must be done by the user of the software; the author(s) is +//(are) also not responsible for proper configuration and usage of the +//software, even if provoked by documentation provided together with +//the software. 
+// +//Any changes to this file according to the GPL as documented in the file +//gpl.txt aside this file in the shipment you received can be done to the +//lines that follows this copyright notice here, but changes must not be +//done inside the copyright notive above. A re-distribution must contain +//the intact and unchanged copyright notice. +//Contributions and changes to the program code must be marked as such. + +//you must compile this file with +//javac -classpath .:../Classes Status.java +//if the shell's current path is HTROOT + +import java.io.IOException; +import java.net.URL; + +import de.anomic.http.httpHeader; +import de.anomic.plasma.plasmaParserDocument; +import de.anomic.plasma.plasmaSwitchboard; +import de.anomic.plasma.plasmaCrawlLURL.Entry; +import de.anomic.server.serverObjects; +import de.anomic.server.serverSwitch; + +public class ViewFile { + + public static final int VIEW_MODE_NO_TEXT = 0; + public static final int VIEW_MODE_AS_PLAIN_TEXT = 1; + public static final int VIEW_MODE_AS_PARSED_TEXT = 2; + public static final int VIEW_MODE_AS_PARSED_SENTENCES = 3; + public static final int VIEW_MODE_AS_IFRAME = 4; + + public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) { + + serverObjects prop = new serverObjects(); + plasmaSwitchboard sb = (plasmaSwitchboard)env; + + if (post != null) { + // getting the url hash from which the content should be loaded + String urlHash = post.get("urlHash",""); + if (urlHash.equals("")) { + prop.put("error",1); + prop.put("viewMode",VIEW_MODE_NO_TEXT); + return prop; + } + + String viewMode = post.get("viewMode","plain"); + + // getting the urlEntry that belongs to the url hash + Entry urlEntry = sb.urlPool.loadedURL.getEntry(urlHash); + if (urlEntry == null) { + prop.put("error",2); + prop.put("viewMode",VIEW_MODE_NO_TEXT); + return prop; + } + + // gettin the url that belongs to the entry + URL url = urlEntry.url(); + if (url == null) { + prop.put("error",3); + 
prop.put("viewMode",VIEW_MODE_NO_TEXT); + return prop; + } + + // loading the resource content as byte array + byte[] resource = null; + try { + resource = sb.cacheManager.loadResource(url); + if (resource == null) { + sb.snippetCache.loadResourceFromWeb(url, 5000); + + resource = sb.cacheManager.loadResource(url); + if (resource == null) { + prop.put("error",4); + prop.put("viewMode",VIEW_MODE_NO_TEXT); + return prop; + } + } + } catch (IOException e) { + if (url == null) { + prop.put("error",4); + prop.put("viewMode",VIEW_MODE_NO_TEXT); + return prop; + } + } + if (viewMode.equals("plain")) { + String content = new String(resource); + content = content.replaceAll("<","<") + .replaceAll(">",">") + .replaceAll("\"",""") + .replaceAll("\n","
") + .replaceAll("\t","    "); + + prop.put("error",0); + prop.put("viewMode",VIEW_MODE_AS_PLAIN_TEXT); + prop.put("viewMode_plainText",content); + } else if (viewMode.equals("parsed") || viewMode.equals("sentences") || viewMode.equals("iframe")) { + // parsing the resource content + plasmaParserDocument document = sb.snippetCache.parseDocument(url, resource); + if (document == null) { + prop.put("error",5); + prop.put("viewMode",VIEW_MODE_NO_TEXT); + return prop; + } + + if (viewMode.equals("parsed")) { + String content = new String(document.getText()); + content = content.replaceAll("\n","
") + .replaceAll("\t","    "); + + prop.put("viewMode",VIEW_MODE_AS_PARSED_TEXT); + prop.put("viewMode_parsedText",content); + } else if (viewMode.equals("iframe")) { + prop.put("viewMode",VIEW_MODE_AS_IFRAME); + prop.put("viewMode_url",url.toString()); + } else { + prop.put("viewMode",VIEW_MODE_AS_PARSED_SENTENCES); + String[] sentences = document.getSentences(); + + boolean dark = true; + for (int i=0; i < sentences.length; i++) { + prop.put("viewMode_sentences_" + i + "_nr",Integer.toString(i+1)); + prop.put("viewMode_sentences_" + i + "_text",sentences[i]); + prop.put("viewMode_sentences_" + i + "_dark",((dark) ? 1 : 0) ); dark=!dark; + } + prop.put("viewMode_sentences",sentences.length); + + } + } + prop.put("error",0); + prop.put("error_url",url.toString()); + prop.put("error_hash",urlHash); + prop.put("error_wordCount",Integer.toString(urlEntry.wordCount())); + prop.put("error_desc",urlEntry.descr()); + prop.put("error_size",urlEntry.size()); + } + + return prop; + } + +} diff --git a/htroot/env/grafics/start.gif b/htroot/env/grafics/start.gif new file mode 100644 index 0000000000000000000000000000000000000000..619bc2c5c8f0b6da59aa7dffe3bc7b503980ec35 GIT binary patch literal 88 zcmZ?wbhEHbOV literal 0 HcmV?d00001 diff --git a/htroot/env/grafics/stop.gif b/htroot/env/grafics/stop.gif new file mode 100644 index 0000000000000000000000000000000000000000..5a5836b927724dddfc48f0d0087013fde88e6491 GIT binary patch literal 90 zcmZ?wbhEHb&J0BG|gt^fc4 literal 0 HcmV?d00001 diff --git a/htroot/index.html b/htroot/index.html index b14449595..5d7e8efd6 100644 --- a/htroot/index.html +++ b/htroot/index.html @@ -101,7 +101,7 @@ from 'late' peers.

#[description]#
#(snippet)#::#[text]#
#(/snippet)# #[urlname]#
-#[date]#

+#[date]# | Info

#{/results}# diff --git a/htroot/yacy/hello.java b/htroot/yacy/hello.java index a9cbe6fd6..6cb611a17 100644 --- a/htroot/yacy/hello.java +++ b/htroot/yacy/hello.java @@ -61,7 +61,7 @@ import de.anomic.yacy.yacyVersion; public final class hello { - public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch ss) { + public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch ss) throws InterruptedException { if (post == null || ss == null || yacyCore.seedDB == null || yacyCore.seedDB.mySeed == null) { return null; } // return variable that accumulates replacements @@ -71,9 +71,9 @@ public final class hello { // final String iam = (String) post.get("iam", ""); // complete seed of the requesting peer // final String pattern = (String) post.get("pattern", ""); // // final String mytime = (String) post.get(MYTIME, ""); // - final String key = (String) post.get("key", ""); // transmission key for response - final String seed = (String) post.get(yacySeed.SEED, ""); - final String countStr = (String) post.get("count", "0"); + final String key = post.get("key", ""); // transmission key for response + final String seed = post.get(yacySeed.SEED, ""); + final String countStr = post.get("count", "0"); int i; int count = 0; try {count = (countStr == null) ? 
0 : Integer.parseInt(countStr);} catch (NumberFormatException e) {count = 0;} @@ -93,7 +93,9 @@ public final class hello { // if the remote client has reported its own IP address and the client supports // the port forwarding feature (if client version >= 0.383) then we try to // connect to the reported IP address first - if (reportedip.length() > 0 && !clientip.equals(reportedip) && clientversion >= yacyVersion.YACY_SUPPORTS_PORT_FORWARDING) { + if (reportedip.length() > 0 && !clientip.equals(reportedip) && clientversion >= yacyVersion.YACY_SUPPORTS_PORT_FORWARDING) { + serverCore.checkInterruption(); + // try first the reportedip, since this may be a connect from a port-forwarding host prop.put(yacySeed.YOURIP, reportedip); remoteSeed.put(yacySeed.IP, reportedip); @@ -123,6 +125,8 @@ public final class hello { // we are only allowed to connect to the client IP address if it's not our own address if (!isLocalIP) { + serverCore.checkInterruption(); + prop.put(yacySeed.YOURIP, clientip); remoteSeed.put(yacySeed.IP, clientip); urls = yacyClient.queryUrlCount(remoteSeed); @@ -162,6 +166,7 @@ public final class hello { "' to '" + prop.get(yacySeed.YOURTYPE) + "'."); } + serverCore.checkInterruption(); final StringBuffer seeds = new StringBuffer(768); // attach some more seeds, as requested if ((yacyCore.seedDB != null) && (yacyCore.seedDB.sizeConnected() > 0)) { diff --git a/htroot/yacy/transferRWI.java b/htroot/yacy/transferRWI.java index 782b50518..323d74b6b 100644 --- a/htroot/yacy/transferRWI.java +++ b/htroot/yacy/transferRWI.java @@ -46,13 +46,15 @@ // javac -classpath .:../classes transferRWI.java -import java.util.ArrayList; import java.util.HashSet; import java.util.Iterator; +import java.util.LinkedList; + import de.anomic.http.httpHeader; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaWordIndexEntry; import de.anomic.plasma.plasmaWordIndexEntryContainer; +import de.anomic.server.serverCore; import de.anomic.server.serverObjects; 
import de.anomic.server.serverSwitch; import de.anomic.yacy.yacyCore; @@ -61,7 +63,7 @@ import de.anomic.yacy.yacyDHTAction; public final class transferRWI { - public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch ss) { + public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch ss) throws InterruptedException { if (post == null || ss == null) { return null; } long start = System.currentTimeMillis(); @@ -77,7 +79,7 @@ public final class transferRWI { // final String key = (String) post.get("key", ""); // transmission key final int wordc = Integer.parseInt((String) post.get("wordc", "")); // number of different words final int entryc = Integer.parseInt((String) post.get("entryc", "")); // number of entries in indexes - final byte[] indexes = ((String) post.get("indexes", "")).getBytes(); // the indexes, as list of word entries + byte[] indexes = ((String) post.get("indexes", "")).getBytes(); // the indexes, as list of word entries final boolean granted = sb.getConfig("allowReceiveIndex", "false").equals("true"); // response values @@ -93,7 +95,7 @@ public final class transferRWI { final long startProcess = System.currentTimeMillis(); // decode request - ArrayList v = new ArrayList(); + final LinkedList v = new LinkedList(); int s = 0; int e; while (s < indexes.length) { @@ -101,6 +103,9 @@ public final class transferRWI { if ((e - s) > 0) v.add(new String(indexes, s, e - s)); s = e; while (s < indexes.length) if (indexes[s++] >= 32) {s--; break;} } + // free memory + indexes = null; + // the value-vector should now have the same length as entryc if (v.size() != entryc) sb.getLog().logSevere("ERROR WITH ENTRY COUNTER: v=" + v.size() + ", entryc=" + entryc); @@ -114,13 +119,17 @@ public final class transferRWI { String[] wordhashes = new String[v.size()]; int received = 0; for (int i = 0; i < v.size(); i++) { - estring = (String) v.get(i); + serverCore.checkInterruption(); + + estring = (String) 
v.removeFirst(); p = estring.indexOf("{"); if (p > 0) { wordHash = estring.substring(0, p); wordhashes[i] = wordHash; entry = new plasmaWordIndexEntry(estring.substring(p)); sb.wordIndex.addEntries(plasmaWordIndexEntryContainer.instantContainer(wordHash, System.currentTimeMillis(), entry), true); + serverCore.checkInterruption(); + urlHash = entry.getUrlHash(); if ((!(unknownURL.contains(urlHash))) && (!(sb.urlPool.loadedURL.exists(urlHash)))) { @@ -155,5 +164,4 @@ public final class transferRWI { // return rewrite properties return prop; } - } \ No newline at end of file diff --git a/htroot/yacy/transferURL.java b/htroot/yacy/transferURL.java index c344e99cc..f99a2ef54 100644 --- a/htroot/yacy/transferURL.java +++ b/htroot/yacy/transferURL.java @@ -48,6 +48,7 @@ import de.anomic.http.httpHeader; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaCrawlLURL; +import de.anomic.server.serverCore; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; import de.anomic.yacy.yacyCore; @@ -55,7 +56,7 @@ import de.anomic.yacy.yacySeed; public final class transferURL { - public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch ss) { + public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch ss) throws InterruptedException { if (post == null || ss == null) { return null; } long start = System.currentTimeMillis(); @@ -69,7 +70,7 @@ public final class transferURL { final String iam = (String) post.get("iam", ""); // seed hash of requester // final String youare = (String) post.get("youare", ""); // seed hash of the target peer, needed for network stability // final String key = (String) post.get("key", ""); // transmission key - final int urlc = Integer.parseInt((String) post.get("urlc", "")); // number of transported urls + final int urlc = Integer.parseInt(post.get("urlc", "")); // number of transported urls final boolean granted = sb.getConfig("allowReceiveIndex", 
"false").equals("true"); final boolean blockBlacklist = sb.getConfig("indexReceiveBlockBlacklist", "false").equals("true"); @@ -87,13 +88,14 @@ public final class transferURL { String urls; plasmaCrawlLURL.Entry lEntry; for (int i = 0; i < urlc; i++) { + serverCore.checkInterruption(); urls = (String) post.get("url" + i); if (urls == null) { yacyCore.log.logFine("transferURL: got null URL-string from peer " + otherPeerName); } else { lEntry = sb.urlPool.loadedURL.newEntry(urls, true); if (lEntry != null && blockBlacklist && - sb.urlBlacklist.isListed(lEntry.url().getHost().toLowerCase(), lEntry.url().getPath())) { + plasmaSwitchboard.urlBlacklist.isListed(lEntry.url().getHost().toLowerCase(), lEntry.url().getPath())) { yacyCore.log.logFine("transferURL: blocked blacklisted URL '" + lEntry.url() + "' from peer " + otherPeerName); lEntry = null; } @@ -122,5 +124,4 @@ public final class transferURL { prop.put("result", result); return prop; } - } \ No newline at end of file diff --git a/source/de/anomic/data/robotsParser.java b/source/de/anomic/data/robotsParser.java index 3d6b11802..22a19e4a8 100644 --- a/source/de/anomic/data/robotsParser.java +++ b/source/de/anomic/data/robotsParser.java @@ -197,7 +197,10 @@ public final class robotsParser{ robotsTxt = (byte[])result[1]; eTag = (String) result[2]; modDate = (Date) result[3]; - } + } else if (robotsTxt4Host != null) { + robotsTxt4Host.setLoadedDate(new Date()); + plasmaSwitchboard.robots.addEntry(robotsTxt4Host); + } } catch (Exception e) { serverLog.logSevere("ROBOTS","Unable to download the robots.txt file from URL '" + robotsURL + "'. 
" + e.getMessage()); } @@ -218,7 +221,7 @@ public final class robotsParser{ // storing the data into the robots DB robotsTxt4Host = plasmaSwitchboard.robots.addEntry(urlHostPort,denyPath,new Date(),modDate,eTag); - } + } } if (robotsTxt4Host.isDisallowed(nexturl.getPath())) { @@ -229,7 +232,7 @@ public final class robotsParser{ private static Object[] downloadRobotsTxt(URL robotsURL, int redirectionCount, plasmaCrawlRobotsTxt.Entry entry) throws Exception { - if (redirectionCount < 0) return new Object[]{Boolean.FALSE,null}; + if (redirectionCount < 0) return new Object[]{Boolean.FALSE,null,null}; redirectionCount--; boolean accessCompletelyRestricted = false; @@ -253,7 +256,7 @@ public final class robotsParser{ oldEtag = entry.getETag(); reqHeaders = new httpHeader(); Date modDate = entry.getModDate(); - if (modDate != null) reqHeaders.put(httpHeader.IF_MODIFIED_SINCE,entry.getModDate()); + if (modDate != null) reqHeaders.put(httpHeader.IF_MODIFIED_SINCE,httpc.dateString(entry.getModDate())); } httpc.response res = con.GET(robotsURL.getPath(), reqHeaders); diff --git a/source/de/anomic/data/wikiCode.java b/source/de/anomic/data/wikiCode.java index 899ff8fdf..7ead416a8 100644 --- a/source/de/anomic/data/wikiCode.java +++ b/source/de/anomic/data/wikiCode.java @@ -100,7 +100,7 @@ public class wikiCode { } } - public String replaceHTML(String result) { + public static String replaceHTML(String result) { int p0; // avoide html inside diff --git a/source/de/anomic/http/httpHeader.java b/source/de/anomic/http/httpHeader.java index e45d0c533..19f5e5236 100644 --- a/source/de/anomic/http/httpHeader.java +++ b/source/de/anomic/http/httpHeader.java @@ -80,6 +80,13 @@ import de.anomic.yacy.yacyCore; public final class httpHeader extends TreeMap implements Map { + /* ============================================================= + * Constants defining http versions + * ============================================================= */ + public static final String HTTP_VERSION_0_9 
= "HTTP/0.9"; + public static final String HTTP_VERSION_1_0 = "HTTP/1.0"; + public static final String HTTP_VERSION_1_1 = "HTTP/1.1"; + /* ============================================================= * Constants defining http header names * ============================================================= */ diff --git a/source/de/anomic/http/httpd.java b/source/de/anomic/http/httpd.java index eb1f3acbb..d9a325ec6 100644 --- a/source/de/anomic/http/httpd.java +++ b/source/de/anomic/http/httpd.java @@ -1222,12 +1222,13 @@ public final class httpd implements serverHandler { if (respond == null) throw new NullPointerException("The outputstream must not be null."); if (conProp == null) throw new NullPointerException("The connection property structure must not be null."); if (httpVersion == null) httpVersion = conProp.getProperty(httpHeader.CONNECTION_PROP_HTTP_VER,"HTTP/1.1"); + if (header == null) header = new httpHeader(); try { if ((httpStatusText == null)||(httpStatusText.length()==0)) { - if (httpVersion.equals("HTTP/1.0") && httpHeader.http1_0.containsKey(Integer.toString(httpStatusCode))) + if (httpVersion.equals(httpHeader.HTTP_VERSION_1_0) && httpHeader.http1_0.containsKey(Integer.toString(httpStatusCode))) httpStatusText = (String) httpHeader.http1_0.get(Integer.toString(httpStatusCode)); - else if (httpVersion.equals("HTTP/1.1") && httpHeader.http1_1.containsKey(Integer.toString(httpStatusCode))) + else if (httpVersion.equals(httpHeader.HTTP_VERSION_1_1) && httpHeader.http1_1.containsKey(Integer.toString(httpStatusCode))) httpStatusText = (String) httpHeader.http1_1.get(Integer.toString(httpStatusCode)); else httpStatusText = "Unknown"; } @@ -1389,45 +1390,5 @@ public final class httpd implements serverHandler { } } catch (Exception e) {} return false; - } - - -// public static boolean isTextMime(String mime, Set whitelist) { -// if (whitelist.contains(mime)) return true; -// // some mime-types are given as "text/html; charset=...", so look for ";" -// if 
(mime.length() == 0) return false; -// int pos = mime.indexOf(';'); -// if (pos < 0) return false; -// return whitelist.contains(mime.substring(0, pos)); -// } + } } - -/* - ### - ### Messages of the Server - ### - - # success Messages - HTTPStatus200 = OK; The URL was found. It contents follows. - HTTPStatus201 = Created; A URL was created in response to a POST. - HTTPStatus202 = Accepted; The request was accepted for processing later. - HTTPStatus203 = Non-Authoritative; The information here is unofficial. - HTTPStatus204 = No Response; The request is successful, but there is no data to send. - - # redirection - HTTPStatus300 = Moved; The URL has permanently moved to a new location. - HTTPStatus301 = Found; The URL can be temporarily found at a new location. - - # client errors - HTTPStatus400 = Bad Request; Syntax error in the request. - HTTPStatus401 = Unauthorized; The client is not authorized to access this web page. - HTTPStatus402 = Payment Required; A payment is required to access this web page. - HTTPStatus403 = Forbidden; This URL is forbidden. No authorization is required, it won't help. - HTTPStatus404 = Not Found; This page is not on the server. - - # server errors - HTTPStatus500 = Internal Error; The server encountered an unexpected error. - HTTPStatus501 = Not Implemented; The client requested an unimplemented feature. - HTTPStatus502 = Service Overloaded; The server reached the maximum number of connections. - HTTPStatus503 = Gateway timeout; Fetching data from remote service failed. - */ diff --git a/source/de/anomic/http/httpdFileHandler.java b/source/de/anomic/http/httpdFileHandler.java index 1e59ef301..39a15b792 100644 --- a/source/de/anomic/http/httpdFileHandler.java +++ b/source/de/anomic/http/httpdFileHandler.java @@ -534,6 +534,10 @@ public final class httpdFileHandler extends httpdAbstractHandler implements http tp.put("clientname", switchboard.getConfig("peerName", "anomic")); //System.out.println("respond props: " + ((tp == null) ? 
"null" : tp.toString())); // debug } catch (InvocationTargetException e) { + if (e.getCause() instanceof InterruptedException) { + throw new InterruptedException(e.getCause().getMessage()); + } + this.theLogger.logSevere("INTERNAL ERROR: " + e.toString() + ":" + e.getMessage() + " target exception at " + targetClass + ": " + diff --git a/source/de/anomic/plasma/plasmaCrawlRobotsTxt.java b/source/de/anomic/plasma/plasmaCrawlRobotsTxt.java index 86ab710a7..33fe17c39 100644 --- a/source/de/anomic/plasma/plasmaCrawlRobotsTxt.java +++ b/source/de/anomic/plasma/plasmaCrawlRobotsTxt.java @@ -60,7 +60,7 @@ import de.anomic.kelondro.kelondroException; import de.anomic.server.logging.serverLog; public class plasmaCrawlRobotsTxt { - private kelondroMap robotsTable; + kelondroMap robotsTable; private final File robotsTableFile; private int bufferkb; @@ -221,6 +221,12 @@ public class plasmaCrawlRobotsTxt { return null; } + public void setLoadedDate(Date newLoadedDate) { + if (newLoadedDate != null) { + this.mem.put(LOADED_DATE,Long.toString(newLoadedDate.getTime())); + } + } + public Date getModDate() { if (this.mem.containsKey(MOD_DATE)) { return new Date(Long.valueOf((String) this.mem.get(MOD_DATE)).longValue()); diff --git a/source/de/anomic/plasma/plasmaHTCache.java b/source/de/anomic/plasma/plasmaHTCache.java index 2b8bc4d5d..09bfc984b 100644 --- a/source/de/anomic/plasma/plasmaHTCache.java +++ b/source/de/anomic/plasma/plasmaHTCache.java @@ -84,7 +84,7 @@ public final class plasmaHTCache { public long currCacheSize; public long maxCacheSize; public final File cachePath; - public static serverLog log; + public final serverLog log; public static final HashSet filesInUse = new HashSet(); // can we delete this file public plasmaHTCache(File htCachePath, long maxCacheSize, int bufferkb) { @@ -100,33 +100,33 @@ public final class plasmaHTCache { } if (!(htCachePath.isDirectory())) { // if the cache does not exists or is a file and not a directory, panic - log.logSevere("the 
cache path " + htCachePath.toString() + " is not a directory or does not exists and cannot be created"); + this.log.logSevere("the cache path " + htCachePath.toString() + " is not a directory or does not exists and cannot be created"); System.exit(0); } // open the response header database - File dbfile = new File(cachePath, "responseHeader.db"); + File dbfile = new File(this.cachePath, "responseHeader.db"); try { if (dbfile.exists()) - responseHeaderDB = new kelondroMap(new kelondroDyn(dbfile, bufferkb * 0x400)); + this.responseHeaderDB = new kelondroMap(new kelondroDyn(dbfile, bufferkb * 0x400)); else - responseHeaderDB = new kelondroMap(new kelondroDyn(dbfile, bufferkb * 0x400, plasmaCrawlLURL.urlHashLength, 150)); + this.responseHeaderDB = new kelondroMap(new kelondroDyn(dbfile, bufferkb * 0x400, plasmaURL.urlHashLength, 150)); } catch (IOException e) { - log.logSevere("the request header database could not be opened: " + e.getMessage()); + this.log.logSevere("the request header database could not be opened: " + e.getMessage()); System.exit(0); } // init stack - cacheStack = new LinkedList(); + this.cacheStack = new LinkedList(); // init cache age and size management - cacheAge = new TreeMap(); - currCacheSize = 0; + this.cacheAge = new TreeMap(); + this.currCacheSize = 0; this.maxCacheSize = maxCacheSize; // start the cache startup thread // this will collect information about the current cache size and elements - serverInstantThread.oneTimeJob(this, "cacheScan", log, 5000); + serverInstantThread.oneTimeJob(this, "cacheScan", this.log, 5000); } public int size() { @@ -136,15 +136,15 @@ public final class plasmaHTCache { } public int dbSize() { - return responseHeaderDB.size(); + return this.responseHeaderDB.size(); } public int[] dbCacheChunkSize() { - return responseHeaderDB.cacheChunkSize(); + return this.responseHeaderDB.cacheChunkSize(); } public int[] dbCacheFillStatus() { - return responseHeaderDB.cacheFillStatus(); + return 
this.responseHeaderDB.cacheFillStatus(); } public void push(Entry entry) { @@ -157,17 +157,16 @@ public final class plasmaHTCache { synchronized (this.cacheStack) { if (this.cacheStack.size() > 0) return (Entry) this.cacheStack.removeFirst(); - else - return null; + return null; } } public void storeHeader(String urlHash, httpHeader responseHeader) throws IOException { - responseHeaderDB.set(urlHash, responseHeader); + this.responseHeaderDB.set(urlHash, responseHeader); } public long getFreeSize() { - return (currCacheSize > maxCacheSize) ? 0 : maxCacheSize - currCacheSize; + return (this.currCacheSize > this.maxCacheSize) ? 0 : this.maxCacheSize - this.currCacheSize; } public boolean writeFile(URL url, byte[] array) { @@ -181,10 +180,10 @@ public final class plasmaHTCache { // this is the case of a "(Not a directory)" error, which should be prohibited // by the shallStoreCache() property. However, sometimes the error still occurs // In this case do nothing. - log.logSevere("File storage failed (not a directory): " + e.getMessage()); + this.log.logSevere("File storage failed (not a directory): " + e.getMessage()); return false; } catch (IOException e) { - log.logSevere("File storage failed (IO error): " + e.getMessage()); + this.log.logSevere("File storage failed (IO error): " + e.getMessage()); return false; } writeFileAnnouncement(file); @@ -192,10 +191,10 @@ public final class plasmaHTCache { } public void writeFileAnnouncement(File file) { - synchronized (cacheAge) { + synchronized (this.cacheAge) { if (file.exists()) { - currCacheSize += file.length(); - cacheAge.put(ageString(file.lastModified(), file), file); + this.currCacheSize += file.length(); + this.cacheAge.put(ageString(file.lastModified(), file), file); cleanup(); } } @@ -209,22 +208,21 @@ public final class plasmaHTCache { if (deleteFileandDirs(getCachePath(url), msg)) { try { // As the file is gone, the entry in responseHeader.db is not needed anymore - log.logFinest("Trying to remove 
responseHeader from URL: " + url.toString()); - responseHeaderDB.remove(plasmaURL.urlHash(url)); + this.log.logFinest("Trying to remove responseHeader from URL: " + url.toString()); + this.responseHeaderDB.remove(plasmaURL.urlHash(url)); } catch (IOException e) { - log.logInfo("IOExeption removing response header from DB: " + e.getMessage(), e); + this.log.logInfo("IOExeption removing response header from DB: " + e.getMessage(), e); } return true; - } else { - return false; } + return false; } private boolean deleteFile(File obj) { if (obj.exists() && !filesInUse.contains(obj)) { long size = obj.length(); if (obj.delete()) { - currCacheSize -= size; + this.currCacheSize -= size; return true; } } @@ -233,39 +231,38 @@ public final class plasmaHTCache { private boolean deleteFileandDirs (File obj, String msg) { if (deleteFile(obj)) { - log.logInfo("DELETED " + msg + " CACHE : " + obj.toString()); + this.log.logInfo("DELETED " + msg + " CACHE : " + obj.toString()); obj = obj.getParentFile(); // If the has been emptied, remove it // Loop as long as we produce empty driectoriers, but stop at HTCACHE - while ((!(obj.equals(cachePath))) && (obj.isDirectory()) && (obj.list().length == 0)) { - if (obj.delete()) log.logInfo("DELETED EMPTY DIRECTORY : " + obj.toString()); + while ((!(obj.equals(this.cachePath))) && (obj.isDirectory()) && (obj.list().length == 0)) { + if (obj.delete()) this.log.logInfo("DELETED EMPTY DIRECTORY : " + obj.toString()); obj = obj.getParentFile(); } return true; - } else { - return false; } + return false; } private void cleanupDoIt(long newCacheSize) { - if (cacheAge.size() == 0) return; + if (this.cacheAge.size() == 0) return; File obj; - Iterator iter = cacheAge.keySet().iterator(); - while (iter.hasNext() && (currCacheSize >= newCacheSize)) { + Iterator iter = this.cacheAge.keySet().iterator(); + while (iter.hasNext() && (this.currCacheSize >= newCacheSize)) { Object key = iter.next(); - obj = (File) cacheAge.get(key); + obj = (File) 
this.cacheAge.get(key); if (obj != null) { if (filesInUse.contains(obj)) continue; - log.logFinest("Trying to delete old file: " + obj.toString()); + this.log.logFinest("Trying to delete old file: " + obj.toString()); if (deleteFileandDirs (obj, "OLD")) { try { // As the file is gone, the entry in responseHeader.db is not needed anymore - log.logFinest("Trying to remove responseHeader for URL: " + - getURL(cachePath ,obj).toString()); - responseHeaderDB.remove(plasmaURL.urlHash(getURL(cachePath ,obj))); + this.log.logFinest("Trying to remove responseHeader for URL: " + + getURL(this.cachePath ,obj).toString()); + this.responseHeaderDB.remove(plasmaURL.urlHash(getURL(this.cachePath ,obj))); } catch (IOException e) { - log.logInfo("IOExeption removing response header from DB: " + + this.log.logInfo("IOExeption removing response header from DB: " + e.getMessage(), e); } } @@ -275,13 +272,13 @@ public final class plasmaHTCache { private void cleanup() { // clean up cache to have 4% (enough) space for next entries - if ((currCacheSize >= maxCacheSize) && (cacheAge.size() > 0)) { - if (maxCacheSize > 0) cleanupDoIt(maxCacheSize - ((maxCacheSize / 100) * 4)); + if ((this.currCacheSize >= this.maxCacheSize) && (this.cacheAge.size() > 0)) { + if (this.maxCacheSize > 0) cleanupDoIt(this.maxCacheSize - ((this.maxCacheSize / 100) * 4)); } } public void close() throws IOException { - responseHeaderDB.close(); + this.responseHeaderDB.close(); } private String ageString(long date, File f) { @@ -299,7 +296,7 @@ public final class plasmaHTCache { //log.logSystem("STARTING CACHE SCANNING"); kelondroMScoreCluster doms = new kelondroMScoreCluster(); int c = 0; - enumerateFiles ef = new enumerateFiles(cachePath, true, false, true, true); + enumerateFiles ef = new enumerateFiles(this.cachePath, true, false, true, true); File f; while (ef.hasMoreElements()) { c++; @@ -307,19 +304,19 @@ public final class plasmaHTCache { long d = f.lastModified(); //System.out.println("Cache: " + dom(f)); 
doms.incScore(dom(f)); - currCacheSize += f.length(); - cacheAge.put(ageString(d, f), f); + this.currCacheSize += f.length(); + this.cacheAge.put(ageString(d, f), f); } //System.out.println("%" + (String) cacheAge.firstKey() + "=" + cacheAge.get(cacheAge.firstKey())); long ageHours = 0; try { ageHours = (System.currentTimeMillis() - - Long.parseLong(((String) cacheAge.firstKey()).substring(0, 16), 16)) / 3600000; + Long.parseLong(((String) this.cacheAge.firstKey()).substring(0, 16), 16)) / 3600000; } catch (NumberFormatException e) { //e.printStackTrace(); } - log.logConfig("CACHE SCANNED, CONTAINS " + c + - " FILES = " + currCacheSize/1048576 + "MB, OLDEST IS " + + this.log.logConfig("CACHE SCANNED, CONTAINS " + c + + " FILES = " + this.currCacheSize/1048576 + "MB, OLDEST IS " + ((ageHours < 24) ? (ageHours + " HOURS") : ((ageHours / 24) + " DAYS")) + " OLD"); cleanup(); @@ -333,18 +330,18 @@ public final class plasmaHTCache { ip = httpc.dnsResolve(dom); if (ip == null) continue; result += ", " + dom + "=" + ip; - log.logConfig("PRE-FILLED " + dom + "=" + ip); + this.log.logConfig("PRE-FILLED " + dom + "=" + ip); c++; doms.deleteScore(dom); // wait a short while to prevent that this looks like a DoS - try {Thread.currentThread().sleep(100);} catch (InterruptedException e) {} + try {Thread.sleep(100);} catch (InterruptedException e) {} } - if (result.length() > 2) log.logConfig("PRE-FILLED DNS CACHE, FETCHED " + c + + if (result.length() > 2) this.log.logConfig("PRE-FILLED DNS CACHE, FETCHED " + c + " ADDRESSES: " + result.substring(2)); } private String dom(File f) { - String s = f.toString().substring(cachePath.toString().length() + 1); + String s = f.toString().substring(this.cachePath.toString().length() + 1); int p = s.indexOf("/"); if (p < 0) p = s.indexOf("\\"); if (p < 0) return null; @@ -352,17 +349,17 @@ public final class plasmaHTCache { } public httpHeader getCachedResponse(String urlHash) throws IOException { - Map hdb = responseHeaderDB.get(urlHash); 
+ Map hdb = this.responseHeaderDB.get(urlHash); if (hdb == null) return null; return new httpHeader(null, hdb); } public boolean full() { - return (cacheStack.size() > stackLimit); + return (this.cacheStack.size() > stackLimit); } public boolean empty() { - return (cacheStack.size() == 0); + return (this.cacheStack.size() == 0); } public static boolean isPicture(httpHeader response) { @@ -476,9 +473,8 @@ public final class plasmaHTCache { return serverFileUtils.read(f); } catch (IOException e) { return null; - } else { - return null; } + return null; } public static boolean isPOST(String urlString) { @@ -534,14 +530,14 @@ public final class plasmaHTCache { serverLog.logFine("PLASMA", "Entry: URL=" + url.toString()); this.nomalizedURLString = htmlFilterContentScraper.urlNormalform(url); try { - this.url = new URL(nomalizedURLString); + this.url = new URL(this.nomalizedURLString); } catch (MalformedURLException e) { System.out.println("internal error at httpdProxyCache.Entry: " + e); System.exit(-1); } this.name = name; this.cacheFile = getCachePath(this.url); - this.nomalizedURLHash = plasmaCrawlLURL.urlHash(nomalizedURLString); + this.nomalizedURLHash = plasmaURL.urlHash(this.nomalizedURLString); // assigned: this.initDate = initDate; @@ -562,10 +558,10 @@ public final class plasmaHTCache { System.exit(0); } - lastModified = new Date(serverDate.correctedUTCTime()); + this.lastModified = new Date(serverDate.correctedUTCTime()); } else { - lastModified = responseHeader.lastModified(); - if (lastModified == null) lastModified = new Date(serverDate.correctedUTCTime()); // does not exist in header + this.lastModified = responseHeader.lastModified(); + if (this.lastModified == null) this.lastModified = new Date(serverDate.correctedUTCTime()); // does not exist in header } this.doctype = plasmaWordIndexEntry.docType(responseHeader.mime()); if (this.doctype == plasmaWordIndexEntry.DT_UNKNOWN) this.doctype = plasmaWordIndexEntry.docType(url); @@ -576,22 +572,23 @@ public 
final class plasmaHTCache { } public String name() { - return name; + return this.name; } public String initiator() { - return initiator; + return this.initiator; } public boolean proxy() { return initiator() == null; } public long size() { - if (cacheArray == null) return 0; else return cacheArray.length; + if (this.cacheArray == null) return 0; + return this.cacheArray.length; } public URL referrerURL() { - if (requestHeader == null) return null; + if (this.requestHeader == null) return null; try { - return new URL((String) requestHeader.get(httpHeader.REFERER, "")); + return new URL((String) this.requestHeader.get(httpHeader.REFERER, "")); } catch (Exception e) { return null; } @@ -611,35 +608,35 @@ public final class plasmaHTCache { // in case of FALSE, the reason as String is returned // check profile - if (!(profile.storeHTCache())) return "storage_not_wanted"; + if (!(this.profile.storeHTCache())) return "storage_not_wanted"; // decide upon header information if a specific file should be stored to the cache or not // if the storage was requested by prefetching, the request map is null // check status code - if (!((responseStatus.startsWith("200")) || (responseStatus.startsWith("203")))) return "bad_status_" + responseStatus.substring(0,3); + if (!((this.responseStatus.startsWith("200")) || (this.responseStatus.startsWith("203")))) return "bad_status_" + this.responseStatus.substring(0,3); // check storage location // sometimes a file name is equal to a path name in the same directory; // or sometimes a file name is equal a directory name created earlier; // we cannot match that here in the cache file path and therefore omit writing into the cache - if ((cacheFile.getParentFile().isFile()) || (cacheFile.isDirectory())) return "path_ambiguous"; - if (cacheFile.toString().indexOf("..") >= 0) return "path_dangerous"; + if ((this.cacheFile.getParentFile().isFile()) || (this.cacheFile.isDirectory())) return "path_ambiguous"; + if 
(this.cacheFile.toString().indexOf("..") >= 0) return "path_dangerous"; // -CGI access in request // CGI access makes the page very individual, and therefore not usable in caches - if ((isPOST(nomalizedURLString)) && (!(profile.crawlingQ()))) return "dynamic_post"; - if (isCGI(nomalizedURLString)) return "dynamic_cgi"; + if ((isPOST(this.nomalizedURLString)) && (!(this.profile.crawlingQ()))) return "dynamic_post"; + if (isCGI(this.nomalizedURLString)) return "dynamic_cgi"; // -authorization cases in request // authorization makes pages very individual, and therefore we cannot use the // content in the cache - if ((requestHeader != null) && (requestHeader.containsKey(httpHeader.AUTHORIZATION))) return "personalized"; + if ((this.requestHeader != null) && (this.requestHeader.containsKey(httpHeader.AUTHORIZATION))) return "personalized"; // -ranges in request and response // we do not cache partial content - if ((requestHeader != null) && (requestHeader.containsKey(httpHeader.RANGE))) return "partial"; - if ((responseHeader != null) && (responseHeader.containsKey(httpHeader.CONTENT_RANGE))) return "partial"; + if ((this.requestHeader != null) && (this.requestHeader.containsKey(httpHeader.RANGE))) return "partial"; + if ((this.responseHeader != null) && (this.responseHeader.containsKey(httpHeader.CONTENT_RANGE))) return "partial"; // -if-modified-since in request // we do not care about if-modified-since, because this case only occurres if the @@ -657,8 +654,8 @@ public final class plasmaHTCache { // -pragma in response // if we have a pragma non-cache, we don't cache. 
usually if this is wanted from // the server, it makes sense - if ((responseHeader.containsKey(httpHeader.PRAGMA)) && - (((String) responseHeader.get(httpHeader.PRAGMA)).toUpperCase().equals("NO-CACHE"))) return "controlled_no_cache"; + if ((this.responseHeader.containsKey(httpHeader.PRAGMA)) && + (((String) this.responseHeader.get(httpHeader.PRAGMA)).toUpperCase().equals("NO-CACHE"))) return "controlled_no_cache"; // -expires in response // we do not care about expires, because at the time this is called the data is @@ -666,12 +663,12 @@ public final class plasmaHTCache { // -cache-control in response // the cache-control has many value options. - String cacheControl = (String) responseHeader.get(httpHeader.CACHE_CONTROL); + String cacheControl = (String) this.responseHeader.get(httpHeader.CACHE_CONTROL); if (cacheControl != null) { cacheControl = cacheControl.trim().toUpperCase(); if (cacheControl.startsWith("MAX-AGE=")) { // we need also the load date - Date date = responseHeader.date(); + Date date = this.responseHeader.date(); if (date == null) return "stale_no_date_given_in_response"; try { long ttl = 1000 * Long.parseLong(cacheControl.substring(8)); // milliseconds to live @@ -696,57 +693,57 @@ public final class plasmaHTCache { // -CGI access in request // CGI access makes the page very individual, and therefore not usable in caches - if (isPOST(nomalizedURLString)) return false; - if (isCGI(nomalizedURLString)) return false; + if (isPOST(this.nomalizedURLString)) return false; + if (isCGI(this.nomalizedURLString)) return false; // -authorization cases in request - if (requestHeader.containsKey(httpHeader.AUTHORIZATION)) return false; + if (this.requestHeader.containsKey(httpHeader.AUTHORIZATION)) return false; // -ranges in request // we do not cache partial content - if ((requestHeader != null) && (requestHeader.containsKey(httpHeader.RANGE))) return false; + if ((this.requestHeader != null) && (this.requestHeader.containsKey(httpHeader.RANGE))) return 
false; //Date d1, d2; // -if-modified-since in request // The entity has to be transferred only if it has // been modified since the date given by the If-Modified-Since header. - if (requestHeader.containsKey(httpHeader.IF_MODIFIED_SINCE)) { + if (this.requestHeader.containsKey(httpHeader.IF_MODIFIED_SINCE)) { // checking this makes only sense if the cached response contains // a Last-Modified field. If the field does not exist, we go the safe way - if (!(responseHeader.containsKey(httpHeader.LAST_MODIFIED))) return false; + if (!(this.responseHeader.containsKey(httpHeader.LAST_MODIFIED))) return false; // parse date Date d1, d2; - d2 = responseHeader.lastModified(); if (d2 == null) d2 = new Date(serverDate.correctedUTCTime()); - d1 = requestHeader.ifModifiedSince(); if (d1 == null) d1 = new Date(serverDate.correctedUTCTime()); + d2 = this.responseHeader.lastModified(); if (d2 == null) d2 = new Date(serverDate.correctedUTCTime()); + d1 = this.requestHeader.ifModifiedSince(); if (d1 == null) d1 = new Date(serverDate.correctedUTCTime()); // finally, we shall treat the cache as stale if the modification time is after the if-.. time if (d2.after(d1)) return false; } - boolean isNotPicture = !isPicture(responseHeader); + boolean isNotPicture = !isPicture(this.responseHeader); // -cookies in request // unfortunately, we should reload in case of a cookie // but we think that pictures can still be considered as fresh - if ((requestHeader.containsKey(httpHeader.COOKIE)) && (isNotPicture)) return false; + if ((this.requestHeader.containsKey(httpHeader.COOKIE)) && (isNotPicture)) return false; // -set-cookie in cached response // this is a similar case as for COOKIE. 
- if ((responseHeader.containsKey(httpHeader.SET_COOKIE)) && (isNotPicture)) return false; // too strong - if ((responseHeader.containsKey(httpHeader.SET_COOKIE2)) && (isNotPicture)) return false; // too strong + if ((this.responseHeader.containsKey(httpHeader.SET_COOKIE)) && (isNotPicture)) return false; // too strong + if ((this.responseHeader.containsKey(httpHeader.SET_COOKIE2)) && (isNotPicture)) return false; // too strong // -pragma in cached response // logically, we would not need to care about no-cache pragmas in cached response headers, // because they cannot exist since they are not written to the cache. // So this IF should always fail.. - if ((responseHeader.containsKey(httpHeader.PRAGMA)) && - (((String) responseHeader.get(httpHeader.PRAGMA)).toUpperCase().equals("NO-CACHE"))) return false; + if ((this.responseHeader.containsKey(httpHeader.PRAGMA)) && + (((String) this.responseHeader.get(httpHeader.PRAGMA)).toUpperCase().equals("NO-CACHE"))) return false; // calculate often needed values for freshness attributes - Date date = responseHeader.date(); - Date expires = responseHeader.expires(); - Date lastModified = responseHeader.lastModified(); - String cacheControl = (String) responseHeader.get(httpHeader.CACHE_CONTROL); + Date date = this.responseHeader.date(); + Date expires = this.responseHeader.expires(); + Date lastModified = this.responseHeader.lastModified(); + String cacheControl = (String) this.responseHeader.get(httpHeader.CACHE_CONTROL); // see for documentation also: diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java index 0c4736152..4f2e98159 100644 --- a/source/de/anomic/plasma/plasmaSnippetCache.java +++ b/source/de/anomic/plasma/plasmaSnippetCache.java @@ -358,7 +358,7 @@ public class plasmaSnippetCache { } } - private void loadResourceFromWeb(URL url, int socketTimeout) throws IOException { + public void loadResourceFromWeb(URL url, int socketTimeout) throws IOException { 
plasmaCrawlWorker.load( url, "", diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 8d4b811e2..34af738ff 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -1476,7 +1476,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser int p; URL url; plasmaCrawlLURL.Entry urlentry; - String urlstring, urlname, filename; + String urlstring, urlname, filename, urlhash; String host, hash, address, descr = ""; yacySeed seed; plasmaSnippetCache.result snippet; @@ -1484,6 +1484,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser while ((acc.hasMoreElements()) && (i < query.wantedResults)) { urlentry = acc.nextElement(); url = urlentry.url(); + urlhash = urlentry.hash(); host = url.getHost(); if (host.endsWith(".yacyh")) { // translate host into current IP @@ -1525,6 +1526,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } else { prop.put("results_" + i + "_description", descr); prop.put("results_" + i + "_url", urlstring); + prop.put("results_" + i + "_urlhash", urlhash); prop.put("results_" + i + "_urlname", urlname); prop.put("results_" + i + "_date", dateString(urlentry.moddate())); prop.put("results_" + i + "_size", Long.toString(urlentry.size())); diff --git a/source/de/anomic/server/serverCore.java b/source/de/anomic/server/serverCore.java index 5d522ca2f..39a3ea817 100644 --- a/source/de/anomic/server/serverCore.java +++ b/source/de/anomic/server/serverCore.java @@ -578,8 +578,7 @@ public final class serverCore extends serverAbstractThread implements serverThre } super.close(); - } - + } } public final class SessionFactory implements org.apache.commons.pool.PoolableObjectFactory { @@ -707,6 +706,10 @@ public final class serverCore extends serverAbstractThread implements serverThre this.stopped = stopped; } + public boolean isStopped() { + return 
this.stopped; + } + public void close() { if (this.isAlive()) { try { @@ -817,7 +820,7 @@ public final class serverCore extends serverAbstractThread implements serverThre this.running = true; // The thread keeps running. - while (!this.stopped && !Thread.interrupted()) { + while (!this.stopped && !this.isInterrupted()) { if (this.done) { // We are waiting for a task now. synchronized (this) { @@ -1114,5 +1117,11 @@ public final class serverCore extends serverAbstractThread implements serverThre if (!this.theSessionPool.isClosed) this.theSessionPool.close(); super.finalize(); } + + public static final void checkInterruption() throws InterruptedException { + Thread currentThread = Thread.currentThread(); + if (currentThread.isInterrupted()) throw new InterruptedException(); + if ((currentThread instanceof serverCore.Session) && ((serverCore.Session)currentThread).isStopped()) throw new InterruptedException(); + } } diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java index 864f134f0..bf7e03e4c 100644 --- a/source/de/anomic/yacy/yacyClient.java +++ b/source/de/anomic/yacy/yacyClient.java @@ -696,19 +696,19 @@ public final class yacyClient { try { final plasmaSwitchboard sb = new plasmaSwitchboard(args[0], "httpProxy.init", "DATA/SETTINGS/httpProxy.conf"); final yacyCore core = new yacyCore(sb); - core.peerActions.loadSeedLists(); - final yacySeed target = core.seedDB.getConnected(args[1]); + yacyCore.peerActions.loadSeedLists(); + final yacySeed target = yacyCore.seedDB.getConnected(args[1]); final String wordhashe = plasmaWordIndexEntry.word2hash("test"); //System.out.println("permission=" + permissionMessage(args[1])); final HashMap result = nxTools.table(httpc.wget( new URL("http://" + target.getAddress() + - "/yacy/search.html?myseed=" + core.seedDB.mySeed.genSeedStr(null) + + "/yacy/search.html?myseed=" + yacyCore.seedDB.mySeed.genSeedStr(null) + "&youare=" + target.hash + "&key=" + - "&myseed=" + 
core.seedDB.mySeed.genSeedStr(null) + + "&myseed=" + yacyCore.seedDB.mySeed.genSeedStr(null) + "&count=10&resource=global" + "&query=" + wordhashe), - 5000, null, null, core.seedDB.sb.remoteProxyHost, core.seedDB.sb.remoteProxyPort)); + 5000, null, null, yacyCore.seedDB.sb.remoteProxyHost, yacyCore.seedDB.sb.remoteProxyPort)); System.out.println("Result=" + result.toString()); } catch (Exception e) { e.printStackTrace();