From f1032fb8fe78d2e5a2305db656eeb6c371c8c59f Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Tue, 16 Sep 2014 13:41:01 +0200 Subject: [PATCH] more enhancements to image search in case that a restriction to a single domain is done --- source/net/yacy/cora/protocol/Domains.java | 21 +++++++++++++++++++ source/net/yacy/peers/RemoteSearch.java | 4 ++-- source/net/yacy/search/query/QueryGoal.java | 6 ++++-- source/net/yacy/search/query/QueryParams.java | 2 +- source/net/yacy/search/query/SearchEvent.java | 5 +++-- 5 files changed, 31 insertions(+), 7 deletions(-) diff --git a/source/net/yacy/cora/protocol/Domains.java b/source/net/yacy/cora/protocol/Domains.java index e428e14a0..bf759fd2a 100644 --- a/source/net/yacy/cora/protocol/Domains.java +++ b/source/net/yacy/cora/protocol/Domains.java @@ -1150,6 +1150,7 @@ public class Domains { * @return the TLD or ccSLD+TLD if that is on a list */ public static String getDNC(String host) { + if (host == null || host.length() == 0) return ""; int p0 = host.lastIndexOf('.'); if (p0 < 0) return host.toLowerCase(); int p1 = host.lastIndexOf('.', p0 - 1); @@ -1158,6 +1159,26 @@ public class Domains { return ccSLD_TLD.contains(ccSLDTLD) ? ccSLDTLD : host.substring(p0 + 1).toLowerCase(); } + /** + * Compute the Second Level Domain of a host name excluding a possible use of a ccSLD. 
+ * If the SLD is a ccSLD, then the Third Level Domain is returned + * @param host the host name to analyze; may be null or empty + * @return the lower-cased SLD, or the lower-cased Third Level Domain if the SLD is a ccSLD; an empty string for a null or empty host + */ + public static String getSmartSLD(String host) { + if (host == null || host.length() == 0) return ""; + int p0 = host.lastIndexOf('.'); + if (p0 < 0) return host.toLowerCase(); // no dot at all: the host is a single label, use it as it is + int p1 = host.lastIndexOf('.', p0 - 1); + if (p1 < 0) return host.substring(0, p0).toLowerCase(); // no third-level domain present, just use the second level + String ccSLDTLD = host.substring(p1 + 1).toLowerCase(); + if (!ccSLD_TLD.contains(ccSLDTLD)) return host.substring(p1 + 1, p0).toLowerCase(); // because the ccSLDTLD is not contained in the list of known ccSLDs, we use the SLD from p1 to p0 + // the SLD is a ccSLD, so the third level domain is the correct one + int p2 = host.lastIndexOf('.', p1 - 1); + if (p2 < 0) return host.substring(0, p1).toLowerCase(); + return host.substring(p2 + 1, p1).toLowerCase(); // lower-case like every other return path; callers hash this value + } + public static void main(final String[] args) { /* try { diff --git a/source/net/yacy/peers/RemoteSearch.java b/source/net/yacy/peers/RemoteSearch.java index 02af62cf3..facd3a25b 100644 --- a/source/net/yacy/peers/RemoteSearch.java +++ b/source/net/yacy/peers/RemoteSearch.java @@ -36,6 +36,7 @@ import org.apache.solr.client.solrj.SolrQuery; import net.yacy.cora.document.analysis.Classification.ContentDomain; import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.protocol.Domains; import net.yacy.cora.storage.HandleSet; import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.Memory; @@ -176,8 +177,7 @@ public class RemoteSearch extends Thread { if (event.query.modifier.sitehost != null && event.query.modifier.sitehost.length() > 0) { // select peers according to host name, not the query goal - String[] hp = event.query.modifier.sitehost.split("\\."); - String newGoal = hp.length <= 1 ? event.query.modifier.sitehost : hp.length == 2 ? hp[0] : hp[hp.length - 2].length() == 2 ? 
hp[hp.length - 3] : hp[hp.length - 2]; + String newGoal = Domains.getSmartSLD(event.query.modifier.sitehost); dhtPeers = DHTSelection.selectDHTSearchTargets( event.peers, QueryParams.hashes2Set(ASCII.String(Word.word2hash(newGoal))), diff --git a/source/net/yacy/search/query/QueryGoal.java b/source/net/yacy/search/query/QueryGoal.java index 8f9624754..17f837298 100644 --- a/source/net/yacy/search/query/QueryGoal.java +++ b/source/net/yacy/search/query/QueryGoal.java @@ -38,6 +38,7 @@ import net.yacy.cora.federate.solr.SchemaDeclaration; import net.yacy.cora.federate.solr.SolrType; import net.yacy.cora.federate.solr.connector.AbstractSolrConnector; import net.yacy.cora.order.NaturalOrder; +import net.yacy.cora.protocol.Domains; import net.yacy.cora.storage.HandleSet; import net.yacy.document.parser.html.AbstractScraper; import net.yacy.document.parser.html.CharacterCoding; @@ -360,7 +361,7 @@ public class QueryGoal { return q; } - public StringBuilder collectionImageQueryString() { + public StringBuilder collectionImageQueryString(final QueryModifier modifier) { final StringBuilder q = new StringBuilder(80); // add filter to prevent that results come from failed urls @@ -377,8 +378,9 @@ public class QueryGoal { // combine these queries for all relevant fields if (w.length() > 0) { + String hostname = modifier == null || modifier.sitehost == null || modifier.sitehost.length() == 0 ? null : Domains.getSmartSLD(modifier.sitehost); q.append(" AND ("); - q.append('(').append(CollectionSchema.images_text_t.getSolrFieldName()).append(':').append(w).append("^100.0) OR "); + q.append('(').append(CollectionSchema.images_text_t.getSolrFieldName()).append(':').append(hostname == null ? 
w : "(" + w + " " /*NOT an OR!, the hostname shall only boost*/ + hostname + ")").append("^100.0) OR "); q.append('(').append(CollectionSchema.title.getSolrFieldName()).append(':').append(w).append("^50.0) OR "); q.append('(').append(CollectionSchema.keywords.getSolrFieldName()).append(':').append(w).append("^10.0) OR "); q.append('(').append(CollectionSchema.text_t.getSolrFieldName()).append(':').append(w).append(')'); diff --git a/source/net/yacy/search/query/QueryParams.java b/source/net/yacy/search/query/QueryParams.java index e1a93ba31..f181215f5 100644 --- a/source/net/yacy/search/query/QueryParams.java +++ b/source/net/yacy/search/query/QueryParams.java @@ -382,7 +382,7 @@ public final class QueryParams { // construct query final SolrQuery params = getBasicParams(getFacets); - params.setQuery(this.queryGoal.collectionImageQueryString().toString()); + params.setQuery(this.queryGoal.collectionImageQueryString(this.modifier).toString()); // set boosts StringBuilder bq = new StringBuilder(); diff --git a/source/net/yacy/search/query/SearchEvent.java b/source/net/yacy/search/query/SearchEvent.java index db1e400cf..077eaec61 100644 --- a/source/net/yacy/search/query/SearchEvent.java +++ b/source/net/yacy/search/query/SearchEvent.java @@ -55,6 +55,7 @@ import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.federate.yacy.Distribution; import net.yacy.cora.lod.vocabulary.Tagging; import net.yacy.cora.order.Base64Order; +import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.Scanner; import net.yacy.cora.sorting.ConcurrentScoreMap; import net.yacy.cora.sorting.ReversibleScoreMap; @@ -466,8 +467,7 @@ public final class SearchEvent { SearchEvent.this.query.modifier.sitehost != null && SearchEvent.this.query.modifier.sitehost.length() > 0 ) { // try again with sitehost - String[] hp = SearchEvent.this.query.modifier.sitehost.split("\\."); - String newGoal = hp.length <= 1 ? SearchEvent.this.query.modifier.sitehost : hp.length == 2 ? 
hp[0] : hp[hp.length - 2].length() == 2 ? hp[hp.length - 3] : hp[hp.length - 2]; + String newGoal = Domains.getSmartSLD(SearchEvent.this.query.modifier.sitehost); search = SearchEvent.this.query .getSegment() @@ -1571,6 +1571,7 @@ public final class SearchEvent { List width = widthO == null ? new ArrayList(img.size()) : (List) widthO; for (int c = 0; c < img.size(); c++) { String image_urlstub = (String) img.get(c); + if (image_urlstub.endsWith(".ico")) continue; // we don't want favicons, makes the result look idiotic String image_alt = alt != null && alt.size() > c ? (String) alt.get(c) : ""; boolean match = (query.getQueryGoal().matches(image_urlstub) || query.getQueryGoal().matches(image_alt)); try {