From b7e8358645c06fa5ed65b2b96bd2fcc193ca6a46 Mon Sep 17 00:00:00 2001 From: reger Date: Sun, 20 Dec 2015 15:49:24 +0100 Subject: [PATCH] make use of header.getContentType where possible (mime is normalized afterwards) otherwise use header.mime() differentiated in prev. commit. --- .../yacy/cora/protocol/HeaderFramework.java | 5 +- .../net/yacy/crawler/retrieval/Response.java | 51 ++++++++----------- .../yacy/http/servlets/YaCyProxyServlet.java | 2 +- .../net/yacy/repository/LoaderDispatcher.java | 4 +- .../yacy/server/http/HTTPDProxyHandler.java | 3 -- 5 files changed, 27 insertions(+), 38 deletions(-) diff --git a/source/net/yacy/cora/protocol/HeaderFramework.java b/source/net/yacy/cora/protocol/HeaderFramework.java index 5a7af2529..2aef01f6e 100644 --- a/source/net/yacy/cora/protocol/HeaderFramework.java +++ b/source/net/yacy/cora/protocol/HeaderFramework.java @@ -425,8 +425,9 @@ public class HeaderFramework extends TreeMap implements Map 0) { - return tmpstr.substring(0,tmpstr.indexOf(';')).trim(); + final int pos = tmpstr.indexOf(';'); + if (pos > 0) { + return tmpstr.substring(0, pos).trim(); } else { return tmpstr; } diff --git a/source/net/yacy/crawler/retrieval/Response.java b/source/net/yacy/crawler/retrieval/Response.java index 642994a4a..306341cc9 100644 --- a/source/net/yacy/crawler/retrieval/Response.java +++ b/source/net/yacy/crawler/retrieval/Response.java @@ -254,12 +254,16 @@ public class Response { return doctype; } + /** + * Get response header last modified date + * if missing the first seen date or current date + * @return valid date always != null + */ public Date lastModified() { Date docDate = null; if (this.responseHeader != null) { - docDate = this.responseHeader.lastModified(); - if (docDate == null) docDate = this.responseHeader.date(); + docDate = this.responseHeader.lastModified(); // is always != null } if (docDate == null && this.request != null) docDate = this.request.appdate(); if (docDate == null) docDate = new Date(); @@ -594,13 
+598,6 @@ public class Response { // -ranges in request // we checked that in shallStoreCache - // a picture cannot be indexed - /* - if (Classification.isMediaExtension(url().getFileExtension())) { - return "Media_Content_(forbidden)"; - } - */ - // -cookies in request // unfortunately, we cannot index pages which have been requested with a cookie // because the returned content may be special for the client @@ -614,14 +611,7 @@ public class Response { // the set-cookie from the server does not indicate that the content is special // thus we do not care about it here for indexing - // a picture cannot be indexed - final String mimeType = this.responseHeader.mime(); - /* - if (Classification.isPictureMime(mimeType)) { - return "Media_Content_(Picture)"; - } - */ - final String parserError = TextParser.supportsMime(mimeType); + final String parserError = TextParser.supportsMime(this.responseHeader.getContentType()); if (parserError != null) { return "Media_Content, no parser: " + parserError; } @@ -736,16 +726,10 @@ public class Response { // check if document can be indexed if (this.responseHeader != null) { - final String mimeType = this.responseHeader.mime(); + final String mimeType = this.responseHeader.getContentType(); final String parserError = TextParser.supportsMime(mimeType); if (parserError != null && TextParser.supportsExtension(url()) != null) return "no parser available: " + parserError; } - /* - if (Classification.isMediaExtension(url().getFileExtension()) && - !Classification.isImageExtension((url().getFileExtension()))) { - return "Media_Content_(forbidden)"; - } - */ // -if-modified-since in request // if the page is fresh at the very moment we can index it @@ -783,14 +767,21 @@ public class Response { return null; } + /** + * Get Mime type from http header or null if unknown (not included in response header) + * @return mime (trimmed and lowercase) or null + */ public String getMimeType() { if (this.responseHeader == null) return null; - String 
mimeType = this.responseHeader.mime(); - mimeType = mimeType.trim().toLowerCase(); + String mimeType = this.responseHeader.getContentType(); + if (mimeType != null) { + mimeType = mimeType.trim().toLowerCase(); - final int pos = mimeType.indexOf(';'); - return ((pos < 0) ? mimeType : mimeType.substring(0, pos)); + final int pos = mimeType.indexOf(';'); + return ((pos < 0) ? mimeType : mimeType.substring(0, pos)); + } + return null; } public String getCharacterEncoding() { @@ -864,10 +855,10 @@ public class Response { } public Document[] parse() throws Parser.Failure { - final String supportError = TextParser.supports(url(), this.responseHeader == null ? null : this.responseHeader.mime()); + final String supportError = TextParser.supports(url(), this.responseHeader == null ? null : this.responseHeader.getContentType()); if (supportError != null) throw new Parser.Failure("no parser support:" + supportError, url()); try { - return TextParser.parseSource(new AnchorURL(url()), this.responseHeader == null ? null : this.responseHeader.mime(), this.responseHeader == null ? "UTF-8" : this.responseHeader.getCharacterEncoding(), new VocabularyScraper(), this.request.timezoneOffset(), this.request.depth(), this.content); + return TextParser.parseSource(new AnchorURL(url()), this.responseHeader == null ? null : this.responseHeader.getContentType(), this.responseHeader == null ? 
"UTF-8" : this.responseHeader.getCharacterEncoding(), new VocabularyScraper(), this.request.timezoneOffset(), this.request.depth(), this.content); } catch (final Exception e) { return null; } diff --git a/source/net/yacy/http/servlets/YaCyProxyServlet.java b/source/net/yacy/http/servlets/YaCyProxyServlet.java index fed2e3da5..690debcfe 100644 --- a/source/net/yacy/http/servlets/YaCyProxyServlet.java +++ b/source/net/yacy/http/servlets/YaCyProxyServlet.java @@ -172,7 +172,7 @@ public class YaCyProxyServlet extends ProxyServlet implements Servlet { response.setContentType(mimeType); response.setStatus(httpStatus); - if ((mimeType != null) && (mimeType.startsWith("text/html") || mimeType.startsWith("text"))) { + if ((mimeType != null) && (mimeType.startsWith("text"))) { final StringWriter buffer = new StringWriter(); if (proxyResponseHeader.containsKey(HeaderFramework.TRANSFER_ENCODING) && proxyResponseHeader.get(HeaderFramework.TRANSFER_ENCODING).contains("chunked")) { diff --git a/source/net/yacy/repository/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java index da52e15ab..f4b534baa 100644 --- a/source/net/yacy/repository/LoaderDispatcher.java +++ b/source/net/yacy/repository/LoaderDispatcher.java @@ -568,10 +568,10 @@ public final class LoaderDispatcher { if (responseHeader == null) throw new IOException("responseHeader == null"); Document[] documents = null; - final String supportError = TextParser.supports(url, responseHeader.mime()); + final String supportError = TextParser.supports(url, responseHeader.getContentType()); if (supportError != null) throw new IOException("no parser support: " + supportError); try { - documents = TextParser.parseSource(url, responseHeader.mime(), responseHeader.getCharacterEncoding(), response.profile().scraper(), timezoneOffset, response.depth(), response.getContent()); + documents = TextParser.parseSource(url, responseHeader.getContentType(), responseHeader.getCharacterEncoding(), 
response.profile().scraper(), timezoneOffset, response.depth(), response.getContent()); if (documents == null) throw new IOException("document == null"); } catch (final Exception e) { throw new IOException("parser error: " + e.getMessage()); diff --git a/source/net/yacy/server/http/HTTPDProxyHandler.java b/source/net/yacy/server/http/HTTPDProxyHandler.java index fe46a700e..b97b6be2f 100644 --- a/source/net/yacy/server/http/HTTPDProxyHandler.java +++ b/source/net/yacy/server/http/HTTPDProxyHandler.java @@ -1187,9 +1187,6 @@ public final class HTTPDProxyHandler { if (conProp.containsKey(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_HEADER)) { final HeaderFramework proxyRespondHeader = (HeaderFramework) conProp.get(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_HEADER); mime = proxyRespondHeader.mime(); - if (mime.indexOf(';') != -1) { - mime = mime.substring(0,mime.indexOf(';')); - } } logMessage.append(mime);