Merge branch 'master' of ssh://git@github.com/yacy/yacy_search_server

10 years ago · cc8d6ad75f
parent 70e483ecc6 802ccaead6
commit cc8d6ad75f
4 changed files with 14 additions and 6 deletions
--- a/defaults/solr/schema.xml
+++ b/defaults/solr/schema.xml
@@ -93,6 +93,7 @@
 	<dynamicField name="*_dt"  type="date"    indexed="true"  stored="true"/>
 	<dynamicField name="*_dts" type="date"    indexed="true"  stored="true" multiValued="true"/>
 	<dynamicField name="*_t"  type="text_general"    indexed="true"  stored="true"/>
+        <dynamicField name="*_p"  type="location" indexed="true" stored="true"/>
 	<dynamicField name="*_coordinate"  type="tdouble" indexed="true"  stored="true" />
 	<dynamicField name="*_txt" type="text_general"   indexed="true"  stored="true" multiValued="true"/>
 	<dynamicField name="*_val" type="int" indexed="true" stored="true" multiValued="true"/> <!-- YaCy special -->
@@ -103,7 +104,6 @@
 	<dynamicField name="*_fs" type="float"  indexed="true"  stored="true"  multiValued="true"/>
 	<dynamicField name="*_d"  type="double" indexed="true"  stored="true"/>
 	<dynamicField name="*_ds" type="double" indexed="true"  stored="true"  multiValued="true"/>
-	<dynamicField name="*_p"  type="location" indexed="true" stored="true"/>
 	<dynamicField name="*_ti" type="tint"    indexed="true"  stored="true"/>
 	<dynamicField name="*_tl" type="tlong"   indexed="true"  stored="true"/>
 	<dynamicField name="*_tf" type="tfloat"  indexed="true"  stored="true"/>
--- a/source/net/yacy/search/index/ErrorCache.java
+++ b/source/net/yacy/search/index/ErrorCache.java
@@ -42,7 +42,6 @@ import net.yacy.cora.federate.solr.connector.AbstractSolrConnector;
 import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.crawler.data.CrawlProfile;
 import net.yacy.crawler.robots.RobotsTxt;
-import net.yacy.search.index.Fulltext;
 import net.yacy.search.schema.CollectionConfiguration;
 import net.yacy.search.schema.CollectionSchema;

@@ -67,7 +66,7 @@ public class ErrorCache {
                params.setStart(0);
                params.setRows(1000);
                params.setFacet(false);
-                params.setSort(new SortClause(CollectionSchema.last_modified.getSolrFieldName(), SolrQuery.ORDER.desc));
+                params.setSort(new SortClause(CollectionSchema.load_date_dt.getSolrFieldName(), SolrQuery.ORDER.desc)); // load_date_dt = faildate
                params.setFields(CollectionSchema.id.getSolrFieldName());
                params.setQuery(CollectionSchema.failreason_s.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM);
                params.set(CommonParams.DF, CollectionSchema.id.getSolrFieldName()); // DisMaxParams.QF or CommonParams.DF must be given
--- a/source/net/yacy/search/query/SearchEvent.java
+++ b/source/net/yacy/search/query/SearchEvent.java
@@ -1592,8 +1592,17 @@ public final class SearchEvent {
        // boolean fakeImageHost = ms.url().getHost() != null && ms.url().getHost().indexOf("wikipedia") > 0; // pages with image extension from wikipedia do not contain image files but html files... I know this is a bad hack, but many results come from wikipedia and we must handle that
        // generalize above hack (regarding url with file extension but beeing a html (with html mime)
        if (doc.doctype() == Response.DT_IMAGE) {
-            String id = ASCII.String(doc.hash());
-            if (!imageViewed.containsKey(id) && !containsSpare(id)) imageSpareGood.put(id, new ImageResult(doc.url(), doc.url(), doc.mime(), doc.title(), 0, 0, 0));
+            if (!doc.url().getFileName().endsWith(".ico")) { // we don't want favicons
+                final String id = ASCII.String(doc.hash());
+                // check image size
+                final Collection<Object> height = doc.getFieldValues(CollectionSchema.images_height_val.getSolrFieldName());
+                final Collection<Object> width = doc.getFieldValues(CollectionSchema.images_width_val.getSolrFieldName());
+                int h = height == null ? 0 : (Integer) height.iterator().next(); // might be -1 for unknown
+                int w = width == null ? 0 : (Integer) width.iterator().next();
+                if ((h <= 0 || h > 16) && (w <= 0 || w > 16)) { // we don't want too small images (< 16x16)
+                    if (!imageViewed.containsKey(id) && !containsSpare(id)) imageSpareGood.put(id, new ImageResult(doc.url(), doc.url(), doc.mime(), doc.title(), w, h, 0));
+                }
+            }
        } else {
            Collection<Object> altO = doc.getFieldValues(CollectionSchema.images_alt_sxt.getSolrFieldName());
            Collection<Object> imgO = doc.getFieldValues(CollectionSchema.images_urlstub_sxt.getSolrFieldName());
--- a/source/net/yacy/search/schema/CollectionSchema.java
+++ b/source/net/yacy/search/schema/CollectionSchema.java
@@ -34,7 +34,7 @@ public enum CollectionSchema implements SchemaDeclaration {
    id(SolrType.string, true, true, false, false, false, "primary key of document, the URL hash **mandatory field**"),
    sku(SolrType.string, true, true, false, true, true, "url of document"), // a 'sku' is a stock-keeping unit, a unique identifier and a default field in unmodified solr.
    //sku(SolrType.text_en_splitting_tight, true, true, false, true, true, "url of document"), // a 'sku' is a stock-keeping unit, a unique identifier and a default field in unmodified solr.
-    last_modified(SolrType.date, true, true, false, false, false, "last-modified from http header"),
+    last_modified(SolrType.date, true, true, false, false, false, "last-modified from http header"), // date document was last modified
    dates_in_content_dts(SolrType.date, true, true, true, false, true, "if date expressions can be found in the content, these dates are listed here as date objects in order of the appearances"),
    dates_in_content_count_i(SolrType.num_integer, true, true, false, false, false, "the number of entries in dates_in_content_sxt"),
    startDates_dts(SolrType.date, true, true, true, false, true, "content of itemprop attributes with content='startDate'"),