From 5e45f1a460f074432980a3ec7c8ff35f90d26d44 Mon Sep 17 00:00:00 2001 From: reger Date: Tue, 1 Sep 2015 21:47:25 +0200 Subject: [PATCH 1/3] enable Solr schema dynamicField _p (type=location) for YaCy coordinate_p field --- defaults/solr/schema.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/defaults/solr/schema.xml b/defaults/solr/schema.xml index a774593ce..9dde45e73 100644 --- a/defaults/solr/schema.xml +++ b/defaults/solr/schema.xml @@ -93,6 +93,7 @@ + @@ -103,7 +104,6 @@ - From dba7f15073b0114e0b56363db01de2ffbbd5ebcb Mon Sep 17 00:00:00 2001 From: reger Date: Tue, 1 Sep 2015 23:22:48 +0200 Subject: [PATCH 2/3] apply same size constraint on result image from doc as for linked images see https://github.com/yacy/yacy_search_server/commit/19f1308bf09172d2be66c58289d52ba2b2c0cf9d --- source/net/yacy/search/query/SearchEvent.java | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/source/net/yacy/search/query/SearchEvent.java b/source/net/yacy/search/query/SearchEvent.java index 49ee55380..b2df72d37 100644 --- a/source/net/yacy/search/query/SearchEvent.java +++ b/source/net/yacy/search/query/SearchEvent.java @@ -1592,8 +1592,17 @@ public final class SearchEvent { // boolean fakeImageHost = ms.url().getHost() != null && ms.url().getHost().indexOf("wikipedia") > 0; // pages with image extension from wikipedia do not contain image files but html files... 
I know this is a bad hack, but many results come from wikipedia and we must handle that // generalize above hack (regarding url with file extension but beeing a html (with html mime) if (doc.doctype() == Response.DT_IMAGE) { - String id = ASCII.String(doc.hash()); - if (!imageViewed.containsKey(id) && !containsSpare(id)) imageSpareGood.put(id, new ImageResult(doc.url(), doc.url(), doc.mime(), doc.title(), 0, 0, 0)); + if (!doc.url().getFileName().endsWith(".ico")) { // we don't want favicons + final String id = ASCII.String(doc.hash()); + // check image size + final Collection height = doc.getFieldValues(CollectionSchema.images_height_val.getSolrFieldName()); + final Collection width = doc.getFieldValues(CollectionSchema.images_width_val.getSolrFieldName()); + int h = height == null ? 0 : (Integer) height.iterator().next(); // might be -1 for unknown + int w = width == null ? 0 : (Integer) width.iterator().next(); + if ((h <= 0 || h > 16) && (w <= 0 || w > 16)) { // we don't want too small images (< 16x16) + if (!imageViewed.containsKey(id) && !containsSpare(id)) imageSpareGood.put(id, new ImageResult(doc.url(), doc.url(), doc.mime(), doc.title(), w, h, 0)); + } + } } else { Collection altO = doc.getFieldValues(CollectionSchema.images_alt_sxt.getSolrFieldName()); Collection imgO = doc.getFieldValues(CollectionSchema.images_urlstub_sxt.getSolrFieldName()); From 802ccaead6089c1305d4ecffc24aafc304582123 Mon Sep 17 00:00:00 2001 From: reger Date: Wed, 2 Sep 2015 02:36:31 +0200 Subject: [PATCH 3/3] fix init of error cache, use latest faildates => load_date_dt --- source/net/yacy/search/index/ErrorCache.java | 3 +-- source/net/yacy/search/schema/CollectionSchema.java | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/source/net/yacy/search/index/ErrorCache.java b/source/net/yacy/search/index/ErrorCache.java index 005ca3e45..0f600e08b 100644 --- a/source/net/yacy/search/index/ErrorCache.java +++ b/source/net/yacy/search/index/ErrorCache.java @@ -42,7 
+42,6 @@ import net.yacy.cora.federate.solr.connector.AbstractSolrConnector; import net.yacy.cora.util.ConcurrentLog; import net.yacy.crawler.data.CrawlProfile; import net.yacy.crawler.robots.RobotsTxt; -import net.yacy.search.index.Fulltext; import net.yacy.search.schema.CollectionConfiguration; import net.yacy.search.schema.CollectionSchema; @@ -67,7 +66,7 @@ public class ErrorCache { params.setStart(0); params.setRows(1000); params.setFacet(false); - params.setSort(new SortClause(CollectionSchema.last_modified.getSolrFieldName(), SolrQuery.ORDER.desc)); + params.setSort(new SortClause(CollectionSchema.load_date_dt.getSolrFieldName(), SolrQuery.ORDER.desc)); // load_date_dt = faildate params.setFields(CollectionSchema.id.getSolrFieldName()); params.setQuery(CollectionSchema.failreason_s.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM); params.set(CommonParams.DF, CollectionSchema.id.getSolrFieldName()); // DisMaxParams.QF or CommonParams.DF must be given diff --git a/source/net/yacy/search/schema/CollectionSchema.java b/source/net/yacy/search/schema/CollectionSchema.java index a4c66cbaf..edb28cc59 100644 --- a/source/net/yacy/search/schema/CollectionSchema.java +++ b/source/net/yacy/search/schema/CollectionSchema.java @@ -34,7 +34,7 @@ public enum CollectionSchema implements SchemaDeclaration { id(SolrType.string, true, true, false, false, false, "primary key of document, the URL hash **mandatory field**"), sku(SolrType.string, true, true, false, true, true, "url of document"), // a 'sku' is a stock-keeping unit, a unique identifier and a default field in unmodified solr. //sku(SolrType.text_en_splitting_tight, true, true, false, true, true, "url of document"), // a 'sku' is a stock-keeping unit, a unique identifier and a default field in unmodified solr. 
- last_modified(SolrType.date, true, true, false, false, false, "last-modified from http header"), + last_modified(SolrType.date, true, true, false, false, false, "last-modified from http header"), // date document was last modified dates_in_content_dts(SolrType.date, true, true, true, false, true, "if date expressions can be found in the content, these dates are listed here as date objects in order of the appearances"), dates_in_content_count_i(SolrType.num_integer, true, true, false, false, false, "the number of entries in dates_in_content_sxt"), startDates_dts(SolrType.date, true, true, true, false, true, "content of itemprop attributes with content='startDate'"),