From 6e1dc444c3cd404c693a6ff639309f3c1299c879 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Thu, 24 Jul 2014 14:59:37 +0200 Subject: [PATCH] added a snippet test function in ViewFile: you can now search for a specific word on the document; the servlet returns the snippet in the same way as it would be shown in a search result. --- htroot/ViewFile.html | 19 ++++++- htroot/ViewFile.java | 57 +++++++++++++++---- source/net/yacy/crawler/robots/RobotsTxt.java | 8 ++- .../http/servlets/YaCyDefaultServlet.java | 1 - source/net/yacy/search/query/QueryGoal.java | 2 +- .../search/schema/WebgraphConfiguration.java | 2 - 6 files changed, 72 insertions(+), 17 deletions(-) diff --git a/htroot/ViewFile.html b/htroot/ViewFile.html index f4597af96..5ec8ae81f 100644 --- a/htroot/ViewFile.html +++ b/htroot/ViewFile.html @@ -86,9 +86,16 @@ function updatepage(str) {
- + #(moar)#::#(/moar)#
+ #(moar)#:: +
Search in Document:
+
+ + +
+ #(/moar)# @@ -142,6 +149,16 @@ function updatepage(str) { Unsupported protocol. #(/error)# +#(showSnippet)#:: +
+
Snippet +
+
Headline
#[headline]#
+
Teaser Text
#[teasertext]#
+
+
+
+#(/showSnippet)# #(viewMode)# ::
Original Content from Web diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java index ecfe14f8b..eea8c3275 100644 --- a/htroot/ViewFile.java +++ b/htroot/ViewFile.java @@ -33,6 +33,7 @@ import java.util.Arrays; import java.util.Collection; import java.util.Iterator; import java.util.Map; + import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.document.id.AnchorURL; @@ -54,6 +55,8 @@ import net.yacy.document.parser.html.ImageEntry; import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.search.Switchboard; import net.yacy.search.index.Segment; +import net.yacy.search.query.QueryGoal; +import net.yacy.search.snippet.TextSnippet; import net.yacy.server.serverObjects; import net.yacy.server.serverSwitch; @@ -77,15 +80,15 @@ public class ViewFile { final serverObjects prop = new serverObjects(); final Switchboard sb = (Switchboard)env; prop.put("topmenu", sb.getConfigBool("publicTopmenu", true) ? 1 : 0); - - if (post == null) { - prop.putHTML("error_words", ""); - prop.put("error_vMode-sentences", "1"); - prop.put("error", "1"); - prop.put("url", ""); - prop.put("viewMode", VIEW_MODE_NO_TEXT); - return prop; - } + prop.put("moar", 0); + prop.put("viewMode", VIEW_MODE_NO_TEXT); + prop.putHTML("error_words", ""); + prop.put("error_vMode-sentences", "1"); + prop.put("error", "1"); + prop.put("url", ""); + prop.put("showSnippet", 0); + + if (post == null) return prop; // get segment Segment indexSegment = sb.index; @@ -97,9 +100,18 @@ public class ViewFile { prop.putHTML("error_words", ""); } - final String viewMode = post.get("viewMode","parsed"); + prop.put("error_vMode-iframeWeb", "0"); + prop.put("error_vMode-iframeCache", "0"); + prop.put("error_vMode-plain", "0"); + prop.put("error_vMode-parsed", "0"); + prop.put("error_vMode-sentences", "0"); + prop.put("error_vMode-words", "0"); + prop.put("error_vMode-links", "0"); + prop.put("error_vMode-iframeCitations", "0"); + final boolean showSnippet = 
post.get("show", "").equals("Show Snippet"); + final String viewMode = showSnippet ? "sentences" : post.get("viewMode", "sentences"); prop.put("error_vMode-" + viewMode, "1"); - + DigestURL url = null; String descr = ""; final int wordCount = 0; @@ -155,6 +167,8 @@ public class ViewFile { return prop; } prop.put("url", url.toNormalform(true)); + prop.put("moar", 1); + prop.put("moar_search", post.get("search","")); // loading the resource content as byte array prop.put("error_incache", Cache.has(url.hash()) ? 1 : 0); @@ -337,6 +351,27 @@ public class ViewFile { prop.put("viewMode_links", i); } + // optional: generate snippet + if (showSnippet) { + QueryGoal goal = new QueryGoal(post.get("search", "")); + TextSnippet snippet = new TextSnippet( + null, + urlEntry, + goal.getIncludeHashes(), + CacheStrategy.CACHEONLY, + false, + 180, + false); + String titlestr = urlEntry.dc_title(); + // if title is empty use filename as title + if (titlestr.isEmpty()) { // if url has no filename, title is still empty (e.g. "www.host.com/" ) + titlestr = urlEntry.url() != null ? urlEntry.url().getFileName() : ""; + } + final String desc = (snippet == null) ? "" : snippet.isMarked() ? snippet.getLineRaw() : snippet.getLineMarked(goal); + prop.put("showSnippet_headline", titlestr); + prop.put("showSnippet_teasertext", desc); + prop.put("showSnippet", 1); + } if (document != null) document.close(); } prop.put("error", "0"); diff --git a/source/net/yacy/crawler/robots/RobotsTxt.java b/source/net/yacy/crawler/robots/RobotsTxt.java index a570f4099..06d3a536b 100644 --- a/source/net/yacy/crawler/robots/RobotsTxt.java +++ b/source/net/yacy/crawler/robots/RobotsTxt.java @@ -338,7 +338,13 @@ public class RobotsTxt { return sb.toString(); } - public static DigestURL robotsURL(final String urlHostPort) { + /** + * generate a robots.txt url. 
+ * @param urlHostPort a string of the form <host>':'<port> or just <host> + * @return the full robots.txt url + */ + public static DigestURL robotsURL(String urlHostPort) { + if (urlHostPort.endsWith(":80")) urlHostPort = urlHostPort.substring(0, urlHostPort.length() - 3); DigestURL robotsURL = null; try { robotsURL = new DigestURL((urlHostPort.endsWith(":443") ? "https://" : "http://") + urlHostPort + "/robots.txt"); diff --git a/source/net/yacy/http/servlets/YaCyDefaultServlet.java b/source/net/yacy/http/servlets/YaCyDefaultServlet.java index 7eb2eb16d..7cd2d2fb6 100644 --- a/source/net/yacy/http/servlets/YaCyDefaultServlet.java +++ b/source/net/yacy/http/servlets/YaCyDefaultServlet.java @@ -75,7 +75,6 @@ import net.yacy.search.SwitchboardConstants; import net.yacy.server.http.HTTPDFileHandler; import net.yacy.server.http.TemplateEngine; import net.yacy.server.serverClassLoader; -import net.yacy.server.serverCore; import net.yacy.server.serverObjects; import net.yacy.server.serverSwitch; import net.yacy.server.servletProperties; diff --git a/source/net/yacy/search/query/QueryGoal.java b/source/net/yacy/search/query/QueryGoal.java index eb7e2d296..455955da2 100644 --- a/source/net/yacy/search/query/QueryGoal.java +++ b/source/net/yacy/search/query/QueryGoal.java @@ -100,7 +100,7 @@ public class QueryGoal { } /** - * Creates a QueryGoal from a serach query string + * Creates a QueryGoal from a search query string * @param query_words search string (the actual search terms, excluding application specific modifier) */ public QueryGoal(String query_words) { diff --git a/source/net/yacy/search/schema/WebgraphConfiguration.java b/source/net/yacy/search/schema/WebgraphConfiguration.java index 6b86fc27f..d1364ec69 100644 --- a/source/net/yacy/search/schema/WebgraphConfiguration.java +++ b/source/net/yacy/search/schema/WebgraphConfiguration.java @@ -51,8 +51,6 @@ import net.yacy.cora.protocol.ResponseHeader; import net.yacy.cora.util.CommonPattern; import net.yacy.cora.util.ConcurrentLog; 
import net.yacy.crawler.HostBalancer; -import net.yacy.document.parser.htmlParser; -import net.yacy.document.parser.html.ContentScraper; import net.yacy.document.parser.html.ImageEntry; import net.yacy.search.schema.CollectionConfiguration.Subgraph;