From 17ae51e741abb7d60593a2e7e378895324e67457 Mon Sep 17 00:00:00 2001 From: orbiter Date: Sun, 17 Mar 2013 22:13:56 +0100 Subject: [PATCH 1/6] increased number of links limitation from 1000 to 10000 for rss feeds and html documents --- defaults/solr.webgraph.schema | 2 +- source/net/yacy/cora/document/RSSFeed.java | 2 +- source/net/yacy/document/parser/htmlParser.java | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/defaults/solr.webgraph.schema b/defaults/solr.webgraph.schema index f7fb37f76..dec0dcb02 100644 --- a/defaults/solr.webgraph.schema +++ b/defaults/solr.webgraph.schema @@ -156,7 +156,7 @@ target_path_folders_sxt ## the values from key-value pairs in the search part of the url (target) #target_parameter_value_sxt -## "depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (target) +## depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (target) #target_clickdepth_i ## host of the url (target) diff --git a/source/net/yacy/cora/document/RSSFeed.java b/source/net/yacy/cora/document/RSSFeed.java index bb65405da..cb688e39a 100644 --- a/source/net/yacy/cora/document/RSSFeed.java +++ b/source/net/yacy/cora/document/RSSFeed.java @@ -31,7 +31,7 @@ import java.util.Set; public class RSSFeed implements Iterable { - public static final int DEFAULT_MAXSIZE = 1000; + public static final int DEFAULT_MAXSIZE = 10000; // class variables private RSSMessage channel; diff --git a/source/net/yacy/document/parser/htmlParser.java b/source/net/yacy/document/parser/htmlParser.java index fe0bd8184..a8d84a39b 100644 --- a/source/net/yacy/document/parser/htmlParser.java +++ b/source/net/yacy/document/parser/htmlParser.java @@ -51,7 +51,7 @@ import com.ibm.icu.text.CharsetDetector; public class htmlParser extends AbstractParser implements Parser { private static final Pattern patternUnderline = 
Pattern.compile("_"); - private static final int maxLinks = 1000; + private static final int maxLinks = 10000; public htmlParser() { super("Streaming HTML Parser"); From 65d73e56523785aa423fd9d23bacd7551a9ddb52 Mon Sep 17 00:00:00 2001 From: orbiter Date: Tue, 19 Mar 2013 00:59:47 +0100 Subject: [PATCH 2/6] renamed callback function to 'callback' because that is a standard for jsonp which is also used in backbone.js/jquery --- .../cora/federate/solr/responsewriter/JsonResponseWriter.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/net/yacy/cora/federate/solr/responsewriter/JsonResponseWriter.java b/source/net/yacy/cora/federate/solr/responsewriter/JsonResponseWriter.java index 7f3a2ca3a..9f14cf6d2 100644 --- a/source/net/yacy/cora/federate/solr/responsewriter/JsonResponseWriter.java +++ b/source/net/yacy/cora/federate/solr/responsewriter/JsonResponseWriter.java @@ -107,7 +107,7 @@ public class JsonResponseWriter implements QueryResponseWriter { resHead.offset = response.offset(); // equal to 'start' resHead.numFound = response.matches(); - String jsonp = request.getParams().get("jsonp"); // check for JSONP + String jsonp = request.getParams().get("callback"); // check for JSONP if (jsonp != null) { writer.write(jsonp.toCharArray()); writer.write("([".toCharArray()); From 342ba1049b1465e65430a5eeef6e7bc6a6be90d2 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Tue, 19 Mar 2013 10:32:01 +0100 Subject: [PATCH 3/6] - callback fix - memory allocation problem in RowCollection: if memory is too low, do not try to increase by 1 because this leads to very long execution time and at the end to the same OOM as if we allocate the memory at the moment we need it even if the resource observer states that this memory is not there. To compensate for this, the increase size is reduced. 
--- htroot/portalsearch/yacy-portalsearch.js | 2 +- source/net/yacy/kelondro/index/RowCollection.java | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/htroot/portalsearch/yacy-portalsearch.js b/htroot/portalsearch/yacy-portalsearch.js index dbaf9d6ed..8121347e5 100644 --- a/htroot/portalsearch/yacy-portalsearch.js +++ b/htroot/portalsearch/yacy-portalsearch.js @@ -207,7 +207,7 @@ function yrun() { function yacysearch(clear) { var url = yconf.url + '/yacysearch.json?callback=?' // JSONP (cross domain) request URL - //var url = yconf.url + '/solr/select?wt=yjson&jsonp=?' // JSONP (cross domain) request URL + //var url = yconf.url + '/solr/select?wt=yjson&callback=?' // JSONP (cross domain) request URL if(clear) { $('#ypopup').empty(); diff --git a/source/net/yacy/kelondro/index/RowCollection.java b/source/net/yacy/kelondro/index/RowCollection.java index 523bad0e4..065c641a0 100644 --- a/source/net/yacy/kelondro/index/RowCollection.java +++ b/source/net/yacy/kelondro/index/RowCollection.java @@ -54,7 +54,7 @@ public class RowCollection implements Sortable, Iterable, private static final byte[] EMPTY_CACHE = new byte[0]; public static final long growfactorLarge100 = 140L; - public static final long growfactorSmall100 = 120L; + public static final long growfactorSmall100 = 110L; private static final int isortlimit = 20; private static final int exp_chunkcount = 0; @@ -246,12 +246,11 @@ public class RowCollection implements Sortable, Iterable, long allocram = needed * growfactorLarge100 / 100L; allocram -= allocram % this.rowdef.objectsize; assert allocram > 0 : "elements = " + elements + ", new = " + allocram; - if (allocram <= Integer.MAX_VALUE && MemoryControl.request(allocram, false)) return allocram; + if (allocram <= Integer.MAX_VALUE && MemoryControl.request(allocram, forcegc)) return allocram; allocram = needed * growfactorSmall100 / 100L; allocram -= allocram % this.rowdef.objectsize; assert allocram >= 0 : "elements = " + elements + ", 
new = " + allocram; - if (allocram <= Integer.MAX_VALUE && MemoryControl.request(allocram, forcegc)) return allocram; - return needed; + return allocram; } private final void ensureSize(final int elements) throws SpaceExceededException { From 5512be6673734a9f017b13a52737a71d2df8a112 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Tue, 19 Mar 2013 10:33:35 +0100 Subject: [PATCH 4/6] fix in GSA result writer which evaluates result context fields as String. After the migration to Solr 4.1.0 'some' of these fields suddenly are stored as String[]; this patch compensates this confusion. --- .../responsewriter/GSAResponseWriter.java | 40 +++++++++++++------ 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/source/net/yacy/cora/federate/solr/responsewriter/GSAResponseWriter.java b/source/net/yacy/cora/federate/solr/responsewriter/GSAResponseWriter.java index 776894c67..a16469788 100644 --- a/source/net/yacy/cora/federate/solr/responsewriter/GSAResponseWriter.java +++ b/source/net/yacy/cora/federate/solr/responsewriter/GSAResponseWriter.java @@ -164,21 +164,26 @@ public class GSAResponseWriter implements QueryResponseWriter { // write header writer.write(XML_START); String query = request.getParams().get("originalQuery"); - String site = (String) context.get("site"); + String site = getContextString(context, "site", ""); + String sort = getContextString(context, "sort", ""); + String client = getContextString(context, "client", ""); + String ip = getContextString(context, "ip", ""); + String access = getContextString(context, "access", ""); + String entqr = getContextString(context, "entqr", ""); OpensearchResponseWriter.solitaireTag(writer, "TM", Long.toString(System.currentTimeMillis() - start)); OpensearchResponseWriter.solitaireTag(writer, "Q", query); - paramTag(writer, "sort", (String) context.get("sort")); + paramTag(writer, "sort", sort); paramTag(writer, "output", "xml_no_dtd"); paramTag(writer, "ie", "UTF-8"); paramTag(writer, "oe", 
"UTF-8"); - paramTag(writer, "client", (String) context.get("client")); + paramTag(writer, "client", client); paramTag(writer, "q", query); paramTag(writer, "site", site); paramTag(writer, "start", Integer.toString(resHead.offset)); paramTag(writer, "num", Integer.toString(resHead.rows)); - paramTag(writer, "ip", (String) context.get("ip")); - paramTag(writer, "access", (String) context.get("access")); // p - search only public content, s - search only secure content, a - search all content, both public and secure - paramTag(writer, "entqr", (String) context.get("entqr")); // query expansion policy; (entqr=1) -- Uses only the search appliance's synonym file, (entqr=1) -- Uses only the search appliance's synonym file, (entqr=3) -- Uses both standard and local synonym files. + paramTag(writer, "ip", ip); + paramTag(writer, "access", access); // p - search only public content, s - search only secure content, a - search all content, both public and secure + paramTag(writer, "entqr", entqr); // query expansion policy; (entqr=1) -- Uses only the search appliance's synonym file, (entqr=1) -- Uses only the search appliance's synonym file, (entqr=3) -- Uses both standard and local synonym files. // body introduction final int responseCount = response.size(); @@ -192,16 +197,16 @@ public class GSAResponseWriter implements QueryResponseWriter { writer.write(""); if (prevStart >= 0) { writer.write(""); - XML.escapeCharData("/gsa/search?q=" + request.getParams().get("q") + "&site=" + (String) context.get("site") + - "&lr=&ie=UTF-8&oe=UTF-8&output=xml_no_dtd&client=" + (String) context.get("client") + "&access=" + (String) context.get("access") + - "&sort=" + (String) context.get("sort") + "&start=" + prevStart + "&sa=N", writer); // a relative URL pointing to the NEXT results page. 
+ XML.escapeCharData("/gsa/search?q=" + request.getParams().get("q") + "&site=" + site + + "&lr=&ie=UTF-8&oe=UTF-8&output=xml_no_dtd&client=" + client + "&access=" + access + + "&sort=" + sort + "&start=" + prevStart + "&sa=N", writer); // a relative URL pointing to the PREVIOUS results page. writer.write(""); } if (nextNum > 0) { writer.write(""); - XML.escapeCharData("/gsa/search?q=" + request.getParams().get("q") + "&site=" + (String) context.get("site") + - "&lr=&ie=UTF-8&oe=UTF-8&output=xml_no_dtd&client=" + (String) context.get("client") + "&access=" + (String) context.get("access") + - "&sort=" + (String) context.get("sort") + "&start=" + nextStart + "&num=" + nextNum + "&sa=N", writer); // a relative URL pointing to the NEXT results page. + XML.escapeCharData("/gsa/search?q=" + request.getParams().get("q") + "&site=" + site + + "&lr=&ie=UTF-8&oe=UTF-8&output=xml_no_dtd&client=" + client + "&access=" + access + + "&sort=" + sort + "&start=" + nextStart + "&num=" + nextNum + "&sa=N", writer); // a relative URL pointing to the NEXT results page. writer.write(""); } writer.write(""); @@ -296,6 +301,17 @@ public class GSAResponseWriter implements QueryResponseWriter { writer.write(XML_STOP); } + private static String getContextString(Map context, String key, String dflt) { + Object v = context.get(key); + if (v == null) return dflt; + if (v instanceof String) return (String) v; + if (v instanceof String[]) { + String[] va = (String[]) v; + return va.length == 0 ? 
dflt : va[0]; + } + return dflt; + } + public static void paramTag(final Writer writer, final String tagname, String value) throws IOException { if (value == null || value.length() == 0) return; writer.write(" Date: Tue, 19 Mar 2013 11:23:18 +0100 Subject: [PATCH 5/6] better search timing; prevents '0 results' for very large local indexes >> 10 mio documents --- htroot/yacysearchitem.java | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/htroot/yacysearchitem.java b/htroot/yacysearchitem.java index b96dd8028..2f4486092 100644 --- a/htroot/yacysearchitem.java +++ b/htroot/yacysearchitem.java @@ -104,11 +104,13 @@ public class yacysearchitem { prop.put("navurlBase", QueryParams.navurlBase("html", theSearch.query, null).toString()); final String target_special_pattern = sb.getConfig(SwitchboardConstants.SEARCH_TARGET_SPECIAL_PATTERN, ""); + long timeout = item == 0 ? 10000 : (theSearch.query.isLocal() ? 1000 : 3000); + if (theSearch.query.contentdom == Classification.ContentDomain.TEXT || theSearch.query.contentdom == Classification.ContentDomain.ALL) { // text search // generate result object - final ResultEntry result = theSearch.oneResult(item, theSearch.query.isLocal() ? 1000 : 3000); + final ResultEntry result = theSearch.oneResult(item, timeout); if (result == null) return prop; // no content final String resultUrlstring = result.urlstring(); final DigestURI resultURL = result.url(); @@ -261,7 +263,7 @@ public class yacysearchitem { prop.put("content", theSearch.query.contentdom.getCode() + 1); // switch on specific content //final MediaSnippet ms = theSearch.result().oneImage(item); - final ResultEntry ms = theSearch.oneResult(item, theSearch.query.isLocal() ? 
1000 : 5000); + final ResultEntry ms = theSearch.oneResult(item, timeout); if (ms == null) { prop.put("content_item", "0"); } else { @@ -297,7 +299,7 @@ public class yacysearchitem { // any other media content // generate result object - final ResultEntry ms = theSearch.oneResult(item, theSearch.query.isLocal() ? 1000 : 5000); + final ResultEntry ms = theSearch.oneResult(item, timeout); prop.put("content", theSearch.query.contentdom.getCode() + 1); // switch on specific content if (ms == null) { prop.put("content_item", "0"); From 870aedf3c6d55536e2f176c6737f9ecc60e472ab Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Wed, 20 Mar 2013 16:19:49 +0100 Subject: [PATCH 6/6] fixes for better search interface integration in yaml templates --- htroot/solr/select.java | 11 ++++++----- htroot/yacysearch.java | 4 ++-- .../solr/responsewriter/JsonResponseWriter.java | 4 +++- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/htroot/solr/select.java b/htroot/solr/select.java index b39638fcf..1b1d2ba8c 100644 --- a/htroot/solr/select.java +++ b/htroot/solr/select.java @@ -177,11 +177,12 @@ public class select { // if this is a call to YaCys special search formats, enhance the query with field assignments if ((responseWriter instanceof JsonResponseWriter || responseWriter instanceof OpensearchResponseWriter) && "true".equals(post.get("hl", "true"))) { // add options for snippet generation - post.put("hl", "true"); - post.put("hl.fl", "text_t,h1,h2"); - post.put("hl.simple.pre", ""); - post.put("hl.simple.post", ""); - post.put("hl.fragsize", Integer.toString(SearchEvent.SNIPPET_MAX_LENGTH)); + if (!post.containsKey("hl.q")) post.put("hl.q", q); + if (!post.containsKey("hl.fl")) post.put("hl.fl", CollectionSchema.h1_txt.getSolrFieldName() + "," + CollectionSchema.h2_txt.getSolrFieldName() + "," + CollectionSchema.text_t.getSolrFieldName()); + if (!post.containsKey("hl.alternateField")) post.put("hl.alternateField", 
CollectionSchema.description.getSolrFieldName()); + if (!post.containsKey("hl.simple.pre")) post.put("hl.simple.pre", ""); + if (!post.containsKey("hl.simple.post")) post.put("hl.simple.post", ""); + if (!post.containsKey("hl.fragsize")) post.put("hl.fragsize", Integer.toString(SearchEvent.SNIPPET_MAX_LENGTH)); } // get the embedded connector diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index cfcd28a8d..46ec5dcb7 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -212,8 +212,8 @@ public class yacysearch { ? 100 : 5000) : (snippetFetchStrategy != null && snippetFetchStrategy.isAllowedToFetchOnline() ? 20 : 1000), - post.getInt("maximumRecords", post.getInt("count", 10))); // SRU syntax with old property as alternative - int startRecord = post.getInt("startRecord", post.getInt("offset", 0)); + post.getInt("maximumRecords", post.getInt("count", post.getInt("rows", 10)))); // SRU syntax with old property as alternative + int startRecord = post.getInt("startRecord", post.getInt("offset", post.getInt("start", 0))); boolean global = post.get("resource", "local").equals("global") && sb.peers.sizeConnected() > 0; final boolean indexof = (post != null && post.get("indexof", "").equals("on")); diff --git a/source/net/yacy/cora/federate/solr/responsewriter/JsonResponseWriter.java b/source/net/yacy/cora/federate/solr/responsewriter/JsonResponseWriter.java index 9f14cf6d2..1dfd38d1c 100644 --- a/source/net/yacy/cora/federate/solr/responsewriter/JsonResponseWriter.java +++ b/source/net/yacy/cora/federate/solr/responsewriter/JsonResponseWriter.java @@ -127,6 +127,7 @@ public class JsonResponseWriter implements QueryResponseWriter { SolrIndexSearcher searcher = request.getSearcher(); DocIterator iterator = response.iterator(); for (int i = 0; i < responseCount; i++) { + try { writer.write("{\n".toCharArray()); int id = iterator.nextDoc(); Document doc = searcher.doc(id, OpensearchResponseWriter.SOLR_FIELDS); @@ -216,6 +217,7 @@ public class 
JsonResponseWriter implements QueryResponseWriter { if (i < responseCount - 1) { writer.write(",\n".toCharArray()); } + } catch (Throwable ee) {} } writer.write("],\n".toCharArray()); @@ -233,7 +235,7 @@ public class JsonResponseWriter implements QueryResponseWriter { NamedList authors = facetFields == null ? null : (NamedList) facetFields.get(CollectionSchema.author_sxt.getSolrFieldName()); if (domains != null) { - writer.write("{\"facetname\":\"domains\",\"displayname\":\"Domains\",\"type\":\"String\",\"min\":\"0\",\"max\":\"0\",\"mean\":\"0\",\"elements\":[\n".toCharArray()); + writer.write("{\"facetname\":\"domains\",\"displayname\":\"Provider\",\"type\":\"String\",\"min\":\"0\",\"max\":\"0\",\"mean\":\"0\",\"elements\":[\n".toCharArray()); for (int i = 0; i < domains.size(); i++) { facetEntry(writer, "site", domains.getName(i), Integer.toString(domains.getVal(i))); if (i < domains.size() - 1) writer.write(',');