From 77f795756c0af54e6746670f48ba73cc3a61b20a Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Mon, 25 Jun 2012 18:17:31 +0200 Subject: [PATCH] fixing redirects and status codes: storing of status code in ResponseHeader to make it available for late evaluations, like storage in solr. --- htroot/CacheResource_p.java | 17 +++--- htroot/CookieTest_p.java | 13 ++--- htroot/Crawler_p.java | 2 +- htroot/User.java | 2 +- htroot/suggest.java | 2 +- htroot/yacysearch.java | 2 +- source/de/anomic/crawler/RobotsTxt.java | 2 +- .../anomic/crawler/retrieval/FTPLoader.java | 7 +-- .../anomic/crawler/retrieval/FileLoader.java | 7 +-- .../anomic/crawler/retrieval/HTTPLoader.java | 20 +++---- .../de/anomic/crawler/retrieval/Response.java | 14 ++--- .../anomic/crawler/retrieval/SMBLoader.java | 7 +-- .../anomic/http/server/HTTPDFileHandler.java | 28 +++++----- .../anomic/http/server/HTTPDProxyHandler.java | 27 ++++----- source/de/anomic/http/server/HTTPDemon.java | 10 +--- .../de/anomic/server/servletProperties.java | 56 ++++++++++--------- .../yacy/cora/protocol/HeaderFramework.java | 6 +- .../yacy/cora/protocol/ResponseHeader.java | 19 ++++++- .../yacy/cora/protocol/http/HTTPClient.java | 16 +++--- .../federated/solr/AbstractSolrConnector.java | 1 + .../document/parser/html/ContentScraper.java | 11 ++++ .../yacy/document/parser/sitemapParser.java | 3 +- .../net/yacy/peers/operation/yacyRelease.java | 3 +- .../net/yacy/repository/LoaderDispatcher.java | 1 - source/net/yacy/search/Switchboard.java | 5 +- .../yacy/search/index/SolrConfiguration.java | 14 ++++- 26 files changed, 162 insertions(+), 133 deletions(-) diff --git a/htroot/CacheResource_p.java b/htroot/CacheResource_p.java index 091480c64..490bd2d4a 100644 --- a/htroot/CacheResource_p.java +++ b/htroot/CacheResource_p.java @@ -1,4 +1,4 @@ -// CacheResource_p.java +// CacheResource_p.java // ----------------------- // (C) by Michael Peter Christen; mc@yacy.net // first published on http://www.anomic.de @@ -30,7 +30,6 @@ import net.yacy.cora.protocol.ResponseHeader; import net.yacy.document.ImageParser; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.logging.Log; - import de.anomic.crawler.Cache; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; @@ -41,9 +40,9 @@ public class CacheResource_p { public static Object respond(final RequestHeader header, final serverObjects post, final serverSwitch env) { final servletProperties prop = new servletProperties(); prop.put("resource", new byte[0]); - + if (post == null) return prop; - + final String u = post.get("url", ""); DigestURI url; try { @@ -52,10 +51,10 @@ public class CacheResource_p { Log.logException(e); return prop; } - + byte[] resource = Cache.getContent(url.hash()); if (resource == null) return prop; - + // check request type if (header.get("EXT", "html").equals("png")) { // a png was requested @@ -65,11 +64,11 @@ public class CacheResource_p { ResponseHeader responseHeader = Cache.getResponseHeader(url.hash()); String resMime = responseHeader == null ? 
null : responseHeader.mime(); if (resMime != null) { - final ResponseHeader outgoingHeader = new ResponseHeader(); + final ResponseHeader outgoingHeader = new ResponseHeader(200); outgoingHeader.put(HeaderFramework.CONTENT_TYPE, resMime); prop.setOutgoingHeader(outgoingHeader); - } - + } + // add resource prop.put("resource", resource); return prop; diff --git a/htroot/CookieTest_p.java b/htroot/CookieTest_p.java index db478fdf0..84278482a 100644 --- a/htroot/CookieTest_p.java +++ b/htroot/CookieTest_p.java @@ -31,7 +31,6 @@ import java.util.Iterator; import java.util.Map; import net.yacy.cora.protocol.ResponseHeader; - import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; import de.anomic.server.servletProperties; @@ -39,7 +38,7 @@ import de.anomic.server.servletProperties; public class CookieTest_p { public static serverObjects respond(final ResponseHeader header, final serverObjects post, final serverSwitch env) { - + // case if no values are requested if (post == null || env == null) { @@ -48,10 +47,10 @@ public class CookieTest_p { final serverObjects prop = new serverObjects(); return prop; } - + final servletProperties prop = new servletProperties(); if (post.containsKey("act") && "clear_cookie".equals(post.get("act"))) { - final ResponseHeader outgoingHeader = new ResponseHeader(); + final ResponseHeader outgoingHeader = new ResponseHeader(200); final Iterator> it = header.entrySet().iterator(); Map.Entry e; while (it.hasNext()) { @@ -65,15 +64,15 @@ public class CookieTest_p { } } } - + prop.setOutgoingHeader(outgoingHeader); prop.put("coockiesout", "0"); //header. - + } else if (post.containsKey("act") && "set_cookie".equals(post.get("act"))) { final String cookieName = post.get("cookie_name").trim(); final String cookieValue = post.get("cookie_value").trim(); - final ResponseHeader outgoingHeader = new ResponseHeader(); + final ResponseHeader outgoingHeader = new ResponseHeader(200); outgoingHeader.setCookie(cookieName,cookieValue); prop.setOutgoingHeader(outgoingHeader); diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java index b0a0c983a..74c89f991 100644 --- a/htroot/Crawler_p.java +++ b/htroot/Crawler_p.java @@ -455,7 +455,7 @@ public class Crawler_p { prop.put("info", "6"); // Error with url prop.putHTML("info_crawlingStart", crawlingStart); prop.putHTML("info_error", e.getMessage()); - Log.logException(e); + Log.logInfo("Crawler_p", "start url rejected: " + e.getMessage()); } } else if ("file".equals(crawlingMode)) { diff --git a/htroot/User.java b/htroot/User.java index 69e478a7a..0de54d8ee 100644 --- a/htroot/User.java +++ b/htroot/User.java @@ -112,7 +112,7 @@ public class User{ cookie=sb.userDB.getAdminCookie(); if(entry != null || staticAdmin){ - final ResponseHeader outgoingHeader=new ResponseHeader(); + final ResponseHeader outgoingHeader=new ResponseHeader(200); outgoingHeader.setCookie("login", cookie); prop.setOutgoingHeader(outgoingHeader); diff --git a/htroot/suggest.java b/htroot/suggest.java index 7ac179525..cb685796e 100644 --- a/htroot/suggest.java +++ b/htroot/suggest.java @@ -120,7 +120,7 @@ public class suggest { // Adding CORS Access header for xml output if (xml) { - final ResponseHeader outgoingHeader = new ResponseHeader(); + final ResponseHeader outgoingHeader = new ResponseHeader(200); outgoingHeader.put(HeaderFramework.CORS_ALLOW_ORIGIN, "*"); prop.setOutgoingHeader(outgoingHeader); } diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index 65734d6c8..f65cebe17 100644 --- a/htroot/yacysearch.java +++ 
b/htroot/yacysearch.java @@ -205,7 +205,7 @@ public class yacysearch { // Adding CORS Access header for yacysearch.rss output if ( rss ) { - final ResponseHeader outgoingHeader = new ResponseHeader(); + final ResponseHeader outgoingHeader = new ResponseHeader(200); outgoingHeader.put(HeaderFramework.CORS_ALLOW_ORIGIN, "*"); prop.setOutgoingHeader(outgoingHeader); } diff --git a/source/de/anomic/crawler/RobotsTxt.java b/source/de/anomic/crawler/RobotsTxt.java index 64c0f4535..3d4c55beb 100644 --- a/source/de/anomic/crawler/RobotsTxt.java +++ b/source/de/anomic/crawler/RobotsTxt.java @@ -333,7 +333,7 @@ public class RobotsTxt { ByteCount.addAccountCount(ByteCount.CRAWLER, robotsTxt.length); } final int code = client.getHttpResponse().getStatusLine().getStatusCode(); - final ResponseHeader header = new ResponseHeader(client.getHttpResponse().getAllHeaders()); + final ResponseHeader header = new ResponseHeader(code, client.getHttpResponse().getAllHeaders()); // check the response status if (code > 199 && code < 300) { diff --git a/source/de/anomic/crawler/retrieval/FTPLoader.java b/source/de/anomic/crawler/retrieval/FTPLoader.java index 202c87034..aca20dfad 100644 --- a/source/de/anomic/crawler/retrieval/FTPLoader.java +++ b/source/de/anomic/crawler/retrieval/FTPLoader.java @@ -124,7 +124,7 @@ public class FTPLoader { if (dirList == null) { response = null; } else { - final ResponseHeader responseHeader = new ResponseHeader(); + final ResponseHeader responseHeader = new ResponseHeader(200); responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date())); responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html"); final CrawlProfile profile = this.sb.crawler.getActive(request.profileHandle().getBytes()); @@ -132,7 +132,6 @@ public class FTPLoader { request, requestHeader, responseHeader, - "200", profile, false, dirList.toString().getBytes()); @@ -226,7 +225,7 @@ public class FTPLoader { final DigestURI refurl = this.sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash()); if (refurl != null) requestHeader.put(RequestHeader.REFERER, refurl.toNormalform(true, false)); } - final ResponseHeader responseHeader = new ResponseHeader(); + final ResponseHeader responseHeader = new ResponseHeader(200); responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(fileDate)); responseHeader.put(HeaderFramework.CONTENT_TYPE, mime); @@ -252,7 +251,6 @@ public class FTPLoader { request, requestHeader, responseHeader, - "200", profile, false, null); @@ -268,7 +266,6 @@ public class FTPLoader { request, requestHeader, responseHeader, - "200", profile, false, b); diff --git a/source/de/anomic/crawler/retrieval/FileLoader.java b/source/de/anomic/crawler/retrieval/FileLoader.java index 87451c169..a989d11a0 100644 --- a/source/de/anomic/crawler/retrieval/FileLoader.java +++ b/source/de/anomic/crawler/retrieval/FileLoader.java @@ -85,7 +85,7 @@ public class FileLoader { StringBuilder content = FTPClient.dirhtml(u, null, null, null, list, true); - ResponseHeader responseHeader = new ResponseHeader(); + ResponseHeader responseHeader = new ResponseHeader(200); responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date())); responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html"); final CrawlProfile profile = this.sb.crawler.getActive(request.profileHandle().getBytes()); @@ -93,7 +93,6 @@ public class FileLoader { request, requestHeader, responseHeader, - "200", profile, false, content.toString().getBytes()); @@ -103,7 
+102,7 @@ public class FileLoader { // create response header String mime = Classification.ext2mime(url.getFileExtension()); - ResponseHeader responseHeader = new ResponseHeader(); + ResponseHeader responseHeader = new ResponseHeader(200); responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date(url.lastModified()))); responseHeader.put(HeaderFramework.CONTENT_TYPE, mime); @@ -134,7 +133,6 @@ public class FileLoader { request, requestHeader, responseHeader, - "200", profile, false, url.toTokens().getBytes()); @@ -152,7 +150,6 @@ public class FileLoader { request, requestHeader, responseHeader, - "200", profile, false, b); diff --git a/source/de/anomic/crawler/retrieval/HTTPLoader.java b/source/de/anomic/crawler/retrieval/HTTPLoader.java index 6d8726fbe..57b71434a 100644 --- a/source/de/anomic/crawler/retrieval/HTTPLoader.java +++ b/source/de/anomic/crawler/retrieval/HTTPLoader.java @@ -80,8 +80,8 @@ public final class HTTPLoader { private Response load(final Request request, final int retryCount, final int maxFileSize, final boolean checkBlacklist) throws IOException { if (retryCount < 0) { - this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "redirection counter exceeded", -1); - throw new IOException("Redirection counter exceeded for URL " + request.url().toString() + ". Processing aborted."); + this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "retry counter exceeded", -1); + throw new IOException("retry counter exceeded for URL " + request.url().toString() + ". Processing aborted."); } DigestURI url = request.url(); @@ -131,15 +131,15 @@ public final class HTTPLoader { // send request final byte[] responseBody = client.GETbytes(url, maxFileSize); - final ResponseHeader header = new ResponseHeader(client.getHttpResponse().getAllHeaders()); - final int code = client.getHttpResponse().getStatusLine().getStatusCode(); + final int code = client.getHttpResponse().getStatusLine().getStatusCode(); + final ResponseHeader responseHeader = new ResponseHeader(code, client.getHttpResponse().getAllHeaders()); if (code > 299 && code < 310) { // redirection (content may be empty) if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_FOLLOW_REDIRECTS, true)) { - if (header.containsKey(HeaderFramework.LOCATION)) { + if (responseHeader.containsKey(HeaderFramework.LOCATION)) { // getting redirection URL - String redirectionUrlString = header.get(HeaderFramework.LOCATION); + String redirectionUrlString = responseHeader.get(HeaderFramework.LOCATION); redirectionUrlString = redirectionUrlString.trim(); if (redirectionUrlString.length() == 0) { @@ -202,8 +202,7 @@ public final class HTTPLoader { response = new Response( request, requestHeader, - header, - Integer.toString(code), + responseHeader, profile, false, responseBody @@ -254,8 +253,8 @@ public final class HTTPLoader { client.setTimout(20000); client.setHeader(requestHeader.entrySet()); final byte[] responseBody = client.GETbytes(request.url()); - final ResponseHeader header = new ResponseHeader(client.getHttpResponse().getAllHeaders()); - final int code = client.getHttpResponse().getStatusLine().getStatusCode(); + final int code = client.getHttpResponse().getStatusLine().getStatusCode(); + final ResponseHeader header = new ResponseHeader(code, client.getHttpResponse().getAllHeaders()); // FIXME: 30*-handling (bottom) is never reached // we always 
get the final content because httpClient.followRedirects = true @@ -272,7 +271,6 @@ public final class HTTPLoader { request, requestHeader, header, - Integer.toString(code), null, false, responseBody diff --git a/source/de/anomic/crawler/retrieval/Response.java b/source/de/anomic/crawler/retrieval/Response.java index 34715a520..dec791021 100644 --- a/source/de/anomic/crawler/retrieval/Response.java +++ b/source/de/anomic/crawler/retrieval/Response.java @@ -63,7 +63,6 @@ public class Response { private final Request request; private final RequestHeader requestHeader; private final ResponseHeader responseHeader; - private final String responseStatus; private final CrawlProfile profile; private byte[] content; private int status; // tracker indexing status, see status defs below @@ -151,7 +150,6 @@ public class Response { final Request request, final RequestHeader requestHeader, final ResponseHeader responseHeader, - final String responseStatus, final CrawlProfile profile, final boolean fromCache, final byte[] content) { @@ -159,7 +157,6 @@ public class Response { // request and response headers may be zero in case that we process surrogates this.requestHeader = requestHeader; this.responseHeader = responseHeader; - this.responseStatus = responseStatus; this.profile = profile; this.status = QUEUE_STATE_FRESH; this.content = content; @@ -176,10 +173,9 @@ public class Response { this.request = request; // request and response headers may be zero in case that we process surrogates this.requestHeader = new RequestHeader(); - this.responseHeader = new ResponseHeader(); + this.responseHeader = new ResponseHeader(200); this.responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain"); // tell parser how to handle the content if (request.size() > 0) this.responseHeader.put(HeaderFramework.CONTENT_LENGTH, Long.toString(request.size())); - this.responseStatus = "200"; this.profile = profile; this.status = QUEUE_STATE_FRESH; this.content = request.name().length() > 0 ? request.name().getBytes() : request.url().toTokens().getBytes(); @@ -190,10 +186,9 @@ public class Response { final Request request, final RequestHeader requestHeader, final ResponseHeader responseHeader, - final String responseStatus, final CrawlProfile profile, final boolean fromCache) { - this(request, requestHeader, responseHeader, responseStatus, profile, fromCache, null); + this(request, requestHeader, responseHeader, profile, fromCache, null); } public void updateStatus(final int newStatus) { @@ -371,7 +366,7 @@ public class Response { // check status code if (!validResponseStatus()) { - return "bad_status_" + this.responseStatus; + return "bad_status_" + this.responseHeader.getStatusCode(); } if (this.requestHeader != null) { @@ -796,7 +791,8 @@ public class Response { } public boolean validResponseStatus() { - return (this.responseStatus == null) ? 
false : this.responseStatus.startsWith("200") || this.responseStatus.startsWith("203"); + int status = this.responseHeader.getStatusCode(); + return status == 200 || status == 203; } public Date ifModifiedSince() { diff --git a/source/de/anomic/crawler/retrieval/SMBLoader.java b/source/de/anomic/crawler/retrieval/SMBLoader.java index e968263be..0726aabfe 100644 --- a/source/de/anomic/crawler/retrieval/SMBLoader.java +++ b/source/de/anomic/crawler/retrieval/SMBLoader.java @@ -104,7 +104,7 @@ public class SMBLoader { StringBuilder content = FTPClient.dirhtml(u, null, null, null, list, true); - ResponseHeader responseHeader = new ResponseHeader(); + ResponseHeader responseHeader = new ResponseHeader(200); responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date())); responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html"); final CrawlProfile profile = this.sb.crawler.getActive(request.profileHandle().getBytes()); @@ -112,7 +112,6 @@ public class SMBLoader { request, requestHeader, responseHeader, - "200", profile, false, content.toString().getBytes()); @@ -122,7 +121,7 @@ public class SMBLoader { // create response header String mime = Classification.ext2mime(url.getFileExtension()); - ResponseHeader responseHeader = new ResponseHeader(); + ResponseHeader responseHeader = new ResponseHeader(200); responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date(url.lastModified()))); responseHeader.put(HeaderFramework.CONTENT_TYPE, mime); @@ -153,7 +152,6 @@ public class SMBLoader { request, requestHeader, responseHeader, - "200", profile, false, url.toTokens().getBytes()); @@ -171,7 +169,6 @@ public class SMBLoader { request, requestHeader, responseHeader, - "200", profile, false, b); diff --git a/source/de/anomic/http/server/HTTPDFileHandler.java b/source/de/anomic/http/server/HTTPDFileHandler.java index f0961effb..52496a822 100644 --- a/source/de/anomic/http/server/HTTPDFileHandler.java +++ b/source/de/anomic/http/server/HTTPDFileHandler.java @@ -227,7 +227,7 @@ public final class HTTPDFileHandler { } private static final ResponseHeader getDefaultHeaders(final String path) { - final ResponseHeader headers = new ResponseHeader(); + final ResponseHeader headers = new ResponseHeader(200); String ext; int pos; if ((pos = path.lastIndexOf('.')) < 0) { @@ -526,7 +526,7 @@ public final class HTTPDFileHandler { aBuffer.append(" \n\n\n"); // write the list to the client - HTTPDemon.sendRespondHeader(conProp, out, httpVersion, 200, null, "text/html; charset=UTF-8", aBuffer.length(), new Date(targetFile.lastModified()), null, new ResponseHeader(), null, null, true); + HTTPDemon.sendRespondHeader(conProp, out, httpVersion, 200, null, "text/html; charset=UTF-8", aBuffer.length(), new Date(targetFile.lastModified()), null, new ResponseHeader(200), null, null, true); if (!method.equals(HeaderFramework.METHOD_HEAD)) { out.write(UTF8.getBytes(aBuffer.toString())); } @@ -1075,9 +1075,10 @@ public final class HTTPDFileHandler { // apply templates TemplateEngine.writeTemplate(fis, o, templatePatterns, UNRESOLVED_PATTERN); fis.close(); + ResponseHeader rh = (templatePatterns == null) ? new ResponseHeader(200) : templatePatterns.getOutgoingHeader(); HTTPDemon.sendRespondHeader(conProp, out, - httpVersion, 200, null, mimeType, -1, - targetDate, expireDate, (templatePatterns == null) ? 
new ResponseHeader() : templatePatterns.getOutgoingHeader(), + httpVersion, rh.getStatusCode(), null, mimeType, -1, + targetDate, expireDate, rh, null, "chunked", nocache); // send the content in chunked parts, see RFC 2616 section 3.6.1 final ChunkedOutputStream chos = new ChunkedOutputStream(out); @@ -1107,16 +1108,17 @@ public final class HTTPDFileHandler { ServerSideIncludes.writeSSI(o1, o, realmProp, clientIP, requestHeader); //httpTemplate.writeTemplate(fis, o, tp, "-UNRESOLVED_PATTERN-".getBytes("UTF-8")); } + ResponseHeader rh = (templatePatterns == null) ? new ResponseHeader(200) : templatePatterns.getOutgoingHeader(); if (method.equals(HeaderFramework.METHOD_HEAD)) { HTTPDemon.sendRespondHeader(conProp, out, - httpVersion, 200, null, mimeType, o.length(), - targetDate, expireDate, (templatePatterns == null) ? new ResponseHeader() : templatePatterns.getOutgoingHeader(), + httpVersion, rh.getStatusCode(), null, mimeType, o.length(), + targetDate, expireDate, rh, contentEncoding, null, nocache); } else { final byte[] result = o.getBytes(); // this interrupts streaming (bad idea!) HTTPDemon.sendRespondHeader(conProp, out, - httpVersion, 200, null, mimeType, result.length, - targetDate, expireDate, (templatePatterns == null) ? new ResponseHeader() : templatePatterns.getOutgoingHeader(), + httpVersion, rh.getStatusCode(), null, mimeType, result.length, + targetDate, expireDate, rh, contentEncoding, null, nocache); FileUtils.copy(result, out); } @@ -1125,7 +1127,7 @@ public final class HTTPDFileHandler { int statusCode = 200; int rangeStartOffset = 0; - final ResponseHeader header = new ResponseHeader(); + final ResponseHeader header = new ResponseHeader(statusCode); // adding the accept ranges header header.put(HeaderFramework.ACCEPT_RANGES, "bytes"); @@ -1429,8 +1431,8 @@ public final class HTTPDFileHandler { String strARGS = (String) conProp.get("ARGS"); if(strARGS.startsWith("action=")) { int detectnextargument = strARGS.indexOf("&"); - action = strARGS.substring (7, detectnextargument); - strARGS = strARGS.substring(detectnextargument+1); + action = strARGS.substring (7, detectnextargument); + strARGS = strARGS.substring(detectnextargument+1); } if(strARGS.startsWith("url=")) { final String strUrl = strARGS.substring(4); // strip url= @@ -1467,7 +1469,7 @@ public final class HTTPDFileHandler { requestHeader.remove("Authorization"); requestHeader.remove("Connection"); requestHeader.put(HeaderFramework.HOST, proxyurl.getHost()); - + // temporarily add argument to header to pass it on to augmented browsing requestHeader.put("YACYACTION", action); @@ -1475,7 +1477,7 @@ public final class HTTPDFileHandler { HTTPDProxyHandler.doGet(prop, requestHeader, o); // reparse header to extract content-length and mimetype - final ResponseHeader outgoingHeader = new ResponseHeader(); + final ResponseHeader outgoingHeader = new ResponseHeader(200); final InputStream in = new ByteArrayInputStream(o.toByteArray()); String line = readLine(in); while(line != null && !line.equals("")) { diff --git a/source/de/anomic/http/server/HTTPDProxyHandler.java b/source/de/anomic/http/server/HTTPDProxyHandler.java index 99c04062f..137fab708 100644 --- a/source/de/anomic/http/server/HTTPDProxyHandler.java +++ b/source/de/anomic/http/server/HTTPDProxyHandler.java @@ -403,7 +403,6 @@ public final class HTTPDProxyHandler { request, requestHeader, cachedResponseHeader, - "200 OK", sb.crawler.defaultProxyProfile, true ); @@ -495,20 +494,20 @@ public final class HTTPDProxyHandler { if (log.isFinest()) log.logFinest(reqID 
+" response status: "+ client.getHttpResponse().getStatusLine()); conProp.put(HeaderFramework.CONNECTION_PROP_CLIENT_REQUEST_HEADER, requestHeader); - final ResponseHeader responseHeader = new ResponseHeader(client.getHttpResponse().getAllHeaders()); + int statusCode = client.getHttpResponse().getStatusLine().getStatusCode(); + final ResponseHeader responseHeader = new ResponseHeader(statusCode, client.getHttpResponse().getAllHeaders()); // determine if it's an internal error of the httpc if (responseHeader.isEmpty()) { throw new Exception(client.getHttpResponse().getStatusLine().toString()); } - if(AugmentedHtmlStream.supportsMime(responseHeader.mime())) { + if (AugmentedHtmlStream.supportsMime(responseHeader.mime())) { // enable chunk encoding, because we don't know the length after annotating responseHeader.remove(HeaderFramework.CONTENT_LENGTH); responseHeader.put(HeaderFramework.TRANSFER_ENCODING, "chunked"); - } - ChunkedOutputStream chunkedOut = setTransferEncoding(conProp, responseHeader, client.getHttpResponse().getStatusLine().getStatusCode(), respond); + ChunkedOutputStream chunkedOut = setTransferEncoding(conProp, responseHeader, statusCode, respond); // the cache does either not exist or is (supposed to be) stale long sizeBeforeDelete = -1; @@ -558,7 +557,7 @@ public final class HTTPDProxyHandler { conProp, respond, httpVer, - client.getHttpResponse().getStatusLine().getStatusCode(), + statusCode, client.getHttpResponse().getStatusLine().toString(), // status text responseHeader); @@ -569,7 +568,6 @@ public final class HTTPDProxyHandler { request, requestHeader, responseHeader, - Integer.toString(client.getHttpResponse().getStatusLine().getStatusCode()), sb.crawler.defaultProxyProfile, true ); @@ -845,7 +843,8 @@ public final class HTTPDProxyHandler { // if (responseHeader.isEmpty()) { // throw new Exception(res.getStatusLine()); // } - final ResponseHeader responseHeader = new ResponseHeader(client.getHttpResponse().getAllHeaders()); + int statusCode = client.getHttpResponse().getStatusLine().getStatusCode(); + final ResponseHeader responseHeader = new ResponseHeader(statusCode, client.getHttpResponse().getAllHeaders()); if (responseHeader.isEmpty()) { throw new Exception(client.getHttpResponse().getStatusLine().toString()); } @@ -860,7 +859,7 @@ public final class HTTPDProxyHandler { conProp, respond, httpVer, - client.getHttpResponse().getStatusLine().getStatusCode(), + statusCode, client.getHttpResponse().getStatusLine().toString(), responseHeader); respond.flush(); @@ -951,7 +950,8 @@ public final class HTTPDProxyHandler { client.POST(getUrl, body, contentLength); if (log.isFinest()) log.logFinest(reqID +" response status: "+ client.getHttpResponse().getStatusLine()); - final ResponseHeader responseHeader = new ResponseHeader(client.getHttpResponse().getAllHeaders()); + int statusCode = client.getHttpResponse().getStatusLine().getStatusCode(); + final ResponseHeader responseHeader = new ResponseHeader(statusCode, client.getHttpResponse().getAllHeaders()); // determine if it's an internal error of the httpc if (responseHeader.isEmpty()) { throw new Exception(client.getHttpResponse().getStatusLine().toString()); @@ -971,7 +971,7 @@ public final class HTTPDProxyHandler { HTTPDemon.sendRespondHeader(conProp, countedRespond, httpVer, - client.getHttpResponse().getStatusLine().getStatusCode(), + statusCode, client.getHttpResponse().getStatusLine().toString(), // status text responseHeader); @@ -1249,11 +1249,12 @@ public final class HTTPDProxyHandler { try { 
remoteProxy.HEADResponse("http://" + host + ":" + port); - final ResponseHeader header = new ResponseHeader(remoteProxy.getHttpResponse().getAllHeaders()); + int statusCode = remoteProxy.getHttpResponse().getStatusLine().getStatusCode(); + final ResponseHeader header = new ResponseHeader(statusCode, remoteProxy.getHttpResponse().getAllHeaders()); // outputs a logline to the serverlog with the current status log.logInfo("CONNECT-RESPONSE: status=" + remoteProxy.getHttpResponse().getStatusLine() + ", header=" + header.toString()); - final boolean success = remoteProxy.getHttpResponse().getStatusLine().getStatusCode() >= 200 && remoteProxy.getHttpResponse().getStatusLine().getStatusCode() <= 399; + final boolean success = statusCode >= 200 && statusCode <= 399; if (success) { // replace connection details host = ProxySettings.host; diff --git a/source/de/anomic/http/server/HTTPDemon.java b/source/de/anomic/http/server/HTTPDemon.java index b3d0a598d..5deeec498 100644 --- a/source/de/anomic/http/server/HTTPDemon.java +++ b/source/de/anomic/http/server/HTTPDemon.java @@ -1136,9 +1136,7 @@ public final class HTTPDemon implements serverHandler, Cloneable { final byte[] result = o.toByteArray(); o.close(); o = null; - if(header == null) { - header = new ResponseHeader(); - } + if (header == null) header = new ResponseHeader(httpStatusCode); header.put(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_STATUS, Integer.toString(httpStatusCode)); header.put(HeaderFramework.DATE, systemDate); header.put(HeaderFramework.CONTENT_TYPE, "text/html"); @@ -1189,9 +1187,7 @@ public final class HTTPDemon implements serverHandler, Cloneable { } } - if (headers == null) { - headers = new ResponseHeader(); - } + if (headers == null) headers = new ResponseHeader(httpStatusCode); final Date now = new Date(System.currentTimeMillis()); headers.put(HeaderFramework.SERVER, "AnomicHTTPD (www.anomic.de)"); @@ -1240,7 +1236,7 @@ public final class HTTPDemon implements serverHandler, Cloneable { if (respond == null) throw new NullPointerException("The outputstream must not be null."); if (conProp == null) throw new NullPointerException("The connection property structure must not be null."); if (httpVersion == null) httpVersion = (String) conProp.get(HeaderFramework.CONNECTION_PROP_HTTP_VER); if (httpVersion == null) httpVersion = HeaderFramework.HTTP_VERSION_1_1; - if (responseHeader == null) responseHeader = new ResponseHeader(); + if (responseHeader == null) responseHeader = new ResponseHeader(httpStatusCode); try { if ((httpStatusText == null)||(httpStatusText.length()==0)) { diff --git a/source/de/anomic/server/servletProperties.java b/source/de/anomic/server/servletProperties.java index 7bf73a697..86fa7df31 100644 --- a/source/de/anomic/server/servletProperties.java +++ b/source/de/anomic/server/servletProperties.java @@ -26,7 +26,7 @@ import net.yacy.cora.protocol.ResponseHeader; public class servletProperties extends serverObjects { private static final long serialVersionUID = 1L; - + public static final String PEER_STAT_VERSION = "version"; public static final String PEER_STAT_UPTIME = "uptime"; public static final String PEER_STAT_MYTIME = "mytime"; @@ -34,56 +34,62 @@ public class servletProperties extends serverObjects { public static final String PEER_STAT_CLIENTID = "clientid"; private String prefix=""; - + private ResponseHeader outgoingHeader; - + public servletProperties(){ super(); } - + public servletProperties(final serverObjects so) { super(so); } - + public void setOutgoingHeader(final ResponseHeader 
outgoingHeader) { this.outgoingHeader = outgoingHeader; } - + public ResponseHeader getOutgoingHeader() { - if(outgoingHeader == null) - return new ResponseHeader(); - return outgoingHeader; + if (this.outgoingHeader == null) return new ResponseHeader(200); + return this.outgoingHeader; } - + public void setPrefix(final String myprefix) { - prefix=myprefix; + this.prefix=myprefix; } - + + @Override public String put(final String key, final byte[] value) { - return super.put(prefix + key, value); + return super.put(this.prefix + key, value); } - + + @Override public long put(final String key, final long value) { - return super.put(prefix + key, value); + return super.put(this.prefix + key, value); } - + + @Override public long inc(final String key) { - return super.inc(prefix+key); + return super.inc(this.prefix+key); } - + + @Override public Object get(final String key, final Object dflt) { - return super.get(prefix+key, dflt); + return super.get(this.prefix+key, dflt); } - + + @Override public String get(final String key, final String dflt) { - return super.get(prefix+key, dflt); + return super.get(this.prefix+key, dflt); } - + + @Override public int getInt(final String key, final int dflt) { - return super.getInt(prefix+key, dflt); + return super.getInt(this.prefix+key, dflt); } - + + @Override public long getLong(final String key, final long dflt) { - return super.getLong(prefix+key, dflt); + return super.getLong(this.prefix+key, dflt); } } diff --git a/source/net/yacy/cora/protocol/HeaderFramework.java b/source/net/yacy/cora/protocol/HeaderFramework.java index e7215f1e5..c8a593ecb 100644 --- a/source/net/yacy/cora/protocol/HeaderFramework.java +++ b/source/net/yacy/cora/protocol/HeaderFramework.java @@ -82,7 +82,6 @@ public class HeaderFramework extends TreeMap implements Map implements Map reverseMappingCache) { + public ResponseHeader(final int statusCode, final HashMap reverseMappingCache) { super(reverseMappingCache); + this.put(HeaderFramework.STATUS_CODE, Integer.toString(statusCode)); } public ResponseHeader(final HashMap reverseMappingCache, final Map othermap) { super(reverseMappingCache, othermap); } + public int getStatusCode() { + String statuscode = this.get(HeaderFramework.STATUS_CODE); + if (statuscode == null) return 200; + try { + return Integer.parseInt(statuscode); + } catch (NumberFormatException e) { + return 200; + } + } + public Date date() { final Date d = headerDate(HeaderFramework.DATE); if (d == null) return new Date(); else return d; diff --git a/source/net/yacy/cora/protocol/http/HTTPClient.java b/source/net/yacy/cora/protocol/http/HTTPClient.java index d5bfd252a..fc3c5e396 100644 --- a/source/net/yacy/cora/protocol/http/HTTPClient.java +++ b/source/net/yacy/cora/protocol/http/HTTPClient.java @@ -47,6 +47,7 @@ import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.UTF8; import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.ConnectionInfo; +import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.HeaderFramework; import org.apache.http.Header; @@ -156,7 +157,7 @@ public class HTTPClient { // connections per host (2 default) clientConnectionManager.setDefaultMaxPerRoute(2); // Increase max connections for localhost - final HttpHost localhost = new HttpHost("localhost"); + final HttpHost localhost = new HttpHost("127.0.0.1"); clientConnectionManager.setMaxPerRoute(new HttpRoute(localhost), maxcon); /** * HTTP protocol settings @@ -339,7 +340,7 @@ public class HTTPClient { * @throws IOException */ 
public byte[] GETbytes(final MultiProtocolURI url, final int maxBytes) throws IOException { - final boolean localhost = url.getHost().equals("localhost"); + final boolean localhost = Domains.isLocalhost(url.getHost()); final String urix = url.toNormalform(true, false); final HttpGet httpGet = new HttpGet(urix); if (!localhost) setHost(url.getHost()); // overwrite resolved IP, needed for shared web hosting DO NOT REMOVE, see http://en.wikipedia.org/wiki/Shared_web_hosting_service @@ -708,17 +709,18 @@ public class HTTPClient { final SSLSocketFactory sslSF = new SSLSocketFactory(sslContext, SSLSocketFactory.ALLOW_ALL_HOSTNAME_VERIFIER); return sslSF; } - + /** * If the Keep-Alive header is not present in the response, * HttpClient assumes the connection can be kept alive indefinitely. - * Here we limit this to 5 seconds. - * + * Here we limit this to 5 seconds. + * * @param defaultHttpClient */ private static void addCustomKeepAliveStrategy(final DefaultHttpClient defaultHttpClient) { defaultHttpClient.setKeepAliveStrategy(new ConnectionKeepAliveStrategy() { - public long getKeepAliveDuration(HttpResponse response, HttpContext context) { + @Override + public long getKeepAliveDuration(HttpResponse response, HttpContext context) { // Honor 'keep-alive' header String param, value; HeaderElement element; @@ -726,7 +728,7 @@ public class HTTPClient { response.headerIterator(HTTP.CONN_KEEP_ALIVE)); while (it.hasNext()) { element = it.nextElement(); - param = element.getName(); + param = element.getName(); value = element.getValue(); if (value != null && param.equalsIgnoreCase("timeout")) { try { diff --git a/source/net/yacy/cora/services/federated/solr/AbstractSolrConnector.java b/source/net/yacy/cora/services/federated/solr/AbstractSolrConnector.java index 2c9e86317..981eda488 100644 --- a/source/net/yacy/cora/services/federated/solr/AbstractSolrConnector.java +++ b/source/net/yacy/cora/services/federated/solr/AbstractSolrConnector.java @@ -78,6 +78,7 @@ public class AbstractSolrConnector implements SolrConnector { public synchronized void close() { try { this.server.commit(); + this.server = null; } catch (SolrServerException e) { Log.logException(e); } catch (IOException e) { diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java index 386722b59..18f7cbe54 100644 --- a/source/net/yacy/document/parser/html/ContentScraper.java +++ b/source/net/yacy/document/parser/html/ContentScraper.java @@ -625,6 +625,17 @@ public class ContentScraper extends AbstractScraper implements Scraper { return this.li.toArray(new String[this.li.size()]); } + public MultiProtocolURI[] getFlash() { + String ext; + ArrayList f = new ArrayList(); + for (final MultiProtocolURI url: this.anchors.keySet()) { + ext = url.getFileExtension(); + if (ext == null) continue; + if (ext.equals("swf")) f.add(url); + } + return f.toArray(new MultiProtocolURI[f.size()]); + } + public boolean containsFlash() { String ext; for (final MultiProtocolURI url: this.anchors.keySet()) { diff --git a/source/net/yacy/document/parser/sitemapParser.java b/source/net/yacy/document/parser/sitemapParser.java index c7a5ab11b..d92703800 100644 --- a/source/net/yacy/document/parser/sitemapParser.java +++ b/source/net/yacy/document/parser/sitemapParser.java @@ -128,7 +128,8 @@ public class sitemapParser extends AbstractParser implements Parser { } // get some metadata - final ResponseHeader header = new ResponseHeader(client.getHttpResponse().getAllHeaders()); + int statusCode = 
client.getHttpResponse().getStatusLine().getStatusCode(); + final ResponseHeader header = new ResponseHeader(statusCode, client.getHttpResponse().getAllHeaders()); final String contentMimeType = header.mime(); InputStream contentStream = client.getContentstream(); diff --git a/source/net/yacy/peers/operation/yacyRelease.java b/source/net/yacy/peers/operation/yacyRelease.java index ccb75449a..796686dee 100644 --- a/source/net/yacy/peers/operation/yacyRelease.java +++ b/source/net/yacy/peers/operation/yacyRelease.java @@ -309,7 +309,8 @@ public final class yacyRelease extends yacyVersion { } client.setTimout(120000); client.GET(getUrl().toString()); - final ResponseHeader header = new ResponseHeader(client.getHttpResponse().getAllHeaders()); + int statusCode = client.getHttpResponse().getStatusLine().getStatusCode(); + final ResponseHeader header = new ResponseHeader(statusCode, client.getHttpResponse().getAllHeaders()); final boolean unzipped = header.gzip() && (header.mime().toLowerCase().equals("application/x-tar")); // if true, then the httpc has unzipped the file if (unzipped && name.endsWith(".tar.gz")) { diff --git a/source/net/yacy/repository/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java index 0b579ba9e..e782c61e7 100644 --- a/source/net/yacy/repository/LoaderDispatcher.java +++ b/source/net/yacy/repository/LoaderDispatcher.java @@ -217,7 +217,6 @@ public final class LoaderDispatcher { request, requestHeader, cachedResponse, - "200", crawlProfile, true, content); diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index ae26b5132..e4a207073 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -1807,7 +1807,7 @@ public final class Switchboard extends serverSwitch 0, 0, 0); - response = new Response(request, null, null, "200", this.crawler.defaultSurrogateProfile, false); + response = new Response(request, null, null, this.crawler.defaultSurrogateProfile, false); final indexingQueueEntry queueEntry = new indexingQueueEntry(Segments.Process.SURROGATES, response, new Document[] { document @@ -3357,7 +3357,8 @@ public final class Switchboard extends serverSwitch url = new DigestURI(seedListFileURL); //final long start = System.currentTimeMillis(); client.HEADResponse(url.toString()); - header = new ResponseHeader(client.getHttpResponse().getAllHeaders()); + int statusCode = client.getHttpResponse().getStatusLine().getStatusCode(); + header = new ResponseHeader(statusCode, client.getHttpResponse().getAllHeaders()); //final long loadtime = System.currentTimeMillis() - start; /*if (header == null) { if (loadtime > getConfigLong("bootstrapLoadTimeout", 6000)) { diff --git a/source/net/yacy/search/index/SolrConfiguration.java b/source/net/yacy/search/index/SolrConfiguration.java index f898ed248..d8a0b2adb 100644 --- a/source/net/yacy/search/index/SolrConfiguration.java +++ b/source/net/yacy/search/index/SolrConfiguration.java @@ -86,7 +86,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable it.remove(); } } - } + } protected void addSolr(final SolrDoc solrdoc, final SolrField key, final String value) { if (isEmpty() || contains(key.name())) solrdoc.addSolr(key, value); @@ -344,7 +344,15 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable } // flash embedded - addSolr(solrdoc, SolrField.flash_b, html.containsFlash()); + if (isEmpty() || contains(SolrField.flash_b.name())) { + MultiProtocolURI[] flashURLs = 
html.getFlash(); + for (MultiProtocolURI u: flashURLs) { + // remove all flash links from ibound/outbound links + inboundLinks.remove(u); + ouboundLinks.remove(u); + } + addSolr(solrdoc, SolrField.flash_b, flashURLs.length > 0); + } // generic evaluation pattern for (final String model: html.getEvaluationModelNames()) { @@ -446,7 +454,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable addSolr(solrdoc, SolrField.lon_coordinate, yacydoc.lon()); addSolr(solrdoc, SolrField.lat_coordinate, yacydoc.lat()); } - addSolr(solrdoc, SolrField.httpstatus_i, 200); + addSolr(solrdoc, SolrField.httpstatus_i, header.getStatusCode()); return solrdoc; }