More enhancements to image search for the case where the search is restricted to a single domain
pull/1/head
Michael Peter Christen 10 years ago
parent 19c6b97572
commit f1032fb8fe

@ -1150,6 +1150,7 @@ public class Domains {
* @return the TLD or ccSLD+TLD if that is on a list
*/
public static String getDNC(String host) {
if (host == null || host.length() == 0) return "";
int p0 = host.lastIndexOf('.');
if (p0 < 0) return host.toLowerCase();
int p1 = host.lastIndexOf('.', p0 - 1);
@ -1158,6 +1159,26 @@ public class Domains {
return ccSLD_TLD.contains(ccSLDTLD) ? ccSLDTLD : host.substring(p0 + 1).toLowerCase();
}
/**
 * Compute the Second Level Domain of a host name, skipping over a possible ccSLD.
 * If the last two labels of the host (SLD + TLD) are contained in ccSLD_TLD,
 * the SLD is considered a ccSLD and the Third Level Domain is returned instead.
 * Every returned value is lower-cased, so callers can use the result directly
 * as a normalized token.
 * <p>
 * Illustration (assuming "co.uk" is listed in ccSLD_TLD and "example.com" is not):
 * "www.Example.com" gives "example", "www.Example.co.uk" gives "example".
 * @param host the host name; may be null or empty
 * @return an empty string if host is null or empty; the lower-cased host if it
 *         contains no dot; otherwise the lower-cased SLD, or the lower-cased
 *         Third Level Domain if the SLD is a ccSLD
 */
public static String getSmartSLD(String host) {
    if (host == null || host.length() == 0) return "";
    int p0 = host.lastIndexOf('.');
    if (p0 < 0) return host.toLowerCase(); // no dot at all: the host is a single label
    int p1 = host.lastIndexOf('.', p0 - 1);
    if (p1 < 0) return host.substring(0, p0).toLowerCase(); // only two labels: the first one is the second-level domain
    String ccSLDTLD = host.substring(p1 + 1).toLowerCase();
    // the last two labels are not in the list of known ccSLD+TLD combinations,
    // so the label between p1 and p0 is the SLD we are looking for
    if (!ccSLD_TLD.contains(ccSLDTLD)) return host.substring(p1 + 1, p0).toLowerCase();
    // the SLD is a ccSLD, so the third-level domain is the correct one
    int p2 = host.lastIndexOf('.', p1 - 1);
    if (p2 < 0) return host.substring(0, p1).toLowerCase(); // the third-level label starts at the beginning of the host
    // lower-case here as well; previously this was the only return path that kept the original case
    return host.substring(p2 + 1, p1).toLowerCase();
}
public static void main(final String[] args) {
/*
try {

@ -36,6 +36,7 @@ import org.apache.solr.client.solrj.SolrQuery;
import net.yacy.cora.document.analysis.Classification.ContentDomain;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.storage.HandleSet;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.Memory;
@ -176,8 +177,7 @@ public class RemoteSearch extends Thread {
if (event.query.modifier.sitehost != null && event.query.modifier.sitehost.length() > 0) {
// select peers according to host name, not the query goal
String[] hp = event.query.modifier.sitehost.split("\\.");
String newGoal = hp.length <= 1 ? event.query.modifier.sitehost : hp.length == 2 ? hp[0] : hp[hp.length - 2].length() == 2 ? hp[hp.length - 3] : hp[hp.length - 2];
String newGoal = Domains.getSmartSLD(event.query.modifier.sitehost);
dhtPeers = DHTSelection.selectDHTSearchTargets(
event.peers,
QueryParams.hashes2Set(ASCII.String(Word.word2hash(newGoal))),

@ -38,6 +38,7 @@ import net.yacy.cora.federate.solr.SchemaDeclaration;
import net.yacy.cora.federate.solr.SolrType;
import net.yacy.cora.federate.solr.connector.AbstractSolrConnector;
import net.yacy.cora.order.NaturalOrder;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.storage.HandleSet;
import net.yacy.document.parser.html.AbstractScraper;
import net.yacy.document.parser.html.CharacterCoding;
@ -360,7 +361,7 @@ public class QueryGoal {
return q;
}
public StringBuilder collectionImageQueryString() {
public StringBuilder collectionImageQueryString(final QueryModifier modifier) {
final StringBuilder q = new StringBuilder(80);
// add filter to prevent that results come from failed urls
@ -377,8 +378,9 @@ public class QueryGoal {
// combine these queries for all relevant fields
if (w.length() > 0) {
String hostname = modifier == null || modifier.sitehost == null || modifier.sitehost.length() == 0 ? null : Domains.getSmartSLD(modifier.sitehost);
q.append(" AND (");
q.append('(').append(CollectionSchema.images_text_t.getSolrFieldName()).append(':').append(w).append("^100.0) OR ");
q.append('(').append(CollectionSchema.images_text_t.getSolrFieldName()).append(':').append(hostname == null ? w : "(" + w + " " /*NOT an OR!, the hostname shall only boost*/ + hostname + ")").append("^100.0) OR ");
q.append('(').append(CollectionSchema.title.getSolrFieldName()).append(':').append(w).append("^50.0) OR ");
q.append('(').append(CollectionSchema.keywords.getSolrFieldName()).append(':').append(w).append("^10.0) OR ");
q.append('(').append(CollectionSchema.text_t.getSolrFieldName()).append(':').append(w).append(')');

@ -382,7 +382,7 @@ public final class QueryParams {
// construct query
final SolrQuery params = getBasicParams(getFacets);
params.setQuery(this.queryGoal.collectionImageQueryString().toString());
params.setQuery(this.queryGoal.collectionImageQueryString(this.modifier).toString());
// set boosts
StringBuilder bq = new StringBuilder();

@ -55,6 +55,7 @@ import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.federate.yacy.Distribution;
import net.yacy.cora.lod.vocabulary.Tagging;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.Scanner;
import net.yacy.cora.sorting.ConcurrentScoreMap;
import net.yacy.cora.sorting.ReversibleScoreMap;
@ -466,8 +467,7 @@ public final class SearchEvent {
SearchEvent.this.query.modifier.sitehost != null && SearchEvent.this.query.modifier.sitehost.length() > 0
) {
// try again with sitehost
String[] hp = SearchEvent.this.query.modifier.sitehost.split("\\.");
String newGoal = hp.length <= 1 ? SearchEvent.this.query.modifier.sitehost : hp.length == 2 ? hp[0] : hp[hp.length - 2].length() == 2 ? hp[hp.length - 3] : hp[hp.length - 2];
String newGoal = Domains.getSmartSLD(SearchEvent.this.query.modifier.sitehost);
search =
SearchEvent.this.query
.getSegment()
@ -1571,6 +1571,7 @@ public final class SearchEvent {
List<Object> width = widthO == null ? new ArrayList<Object>(img.size()) : (List<Object>) widthO;
for (int c = 0; c < img.size(); c++) {
String image_urlstub = (String) img.get(c);
if (image_urlstub.endsWith(".ico")) continue; // we don't want favicons, makes the result look idiotic
String image_alt = alt != null && alt.size() > c ? (String) alt.get(c) : "";
boolean match = (query.getQueryGoal().matches(image_urlstub) || query.getQueryGoal().matches(image_alt));
try {

Loading…
Cancel
Save