redesign of the image search process (with much better results;

unfortunately the index schema has changed and p2p image search will not
be much better until many people update)
pull/1/head
Michael Peter Christen 11 years ago
parent 6184fd9d9a
commit cb85b22725

@ -112,7 +112,7 @@ public class searchresult {
// get a solr query string
QueryGoal qg = new QueryGoal(originalQuery, originalQuery);
StringBuilder solrQ = qg.collectionQueryString(sb.index.fulltext().getDefaultConfiguration(), 0);
StringBuilder solrQ = qg.collectionTextQueryString(sb.index.fulltext().getDefaultConfiguration(), 0);
post.put("defType", "edismax");
post.put(CommonParams.Q, solrQ.toString());
post.put(CommonParams.ROWS, post.remove("num"));

@ -168,7 +168,7 @@ public class select {
querystring = modifier.parse(querystring);
modifier.apply(post);
QueryGoal qg = new QueryGoal(querystring, querystring);
StringBuilder solrQ = qg.collectionQueryString(sb.index.fulltext().getDefaultConfiguration(), profileNr);
StringBuilder solrQ = qg.collectionTextQueryString(sb.index.fulltext().getDefaultConfiguration(), profileNr);
post.put(CommonParams.Q, solrQ.toString()); // sru patch
}
String q = post.get(CommonParams.Q, "");

@ -277,32 +277,32 @@ public class yacysearchitem {
// image search; shows thumbnails
prop.put("content", theSearch.query.contentdom.getCode() + 1); // switch on specific content
//final MediaSnippet ms = theSearch.result().oneImage(item);
final ResultEntry ms = theSearch.oneResult(item, timeout);
if (ms == null) {
prop.put("content_item", "0");
} else {
final String resultUrlstring = ms.url().toNormalform(true);
final String target = sb.getConfig(resultUrlstring.matches(target_special_pattern) ? SwitchboardConstants.SEARCH_TARGET_SPECIAL : SwitchboardConstants.SEARCH_TARGET_DEFAULT, "_self");
SearchEvent.ImageResult image = null;
try {
image = theSearch.oneImageResult(item, timeout);
final String imageUrlstring = image.imageUrl.toNormalform(true);
final String target = sb.getConfig(imageUrlstring.matches(target_special_pattern) ? SwitchboardConstants.SEARCH_TARGET_SPECIAL : SwitchboardConstants.SEARCH_TARGET_DEFAULT, "_self");
final String license = URLLicense.aquireLicense(ms.url());
sb.loader.loadIfNotExistBackground(ms.url(), 1024 * 1024 * 10, null, ClientIdentification.yacyIntranetCrawlerAgent);
prop.putHTML("content_item_hrefCache", (auth) ? "/ViewImage.png?url=" + resultUrlstring : resultUrlstring);
prop.putHTML("content_item_href", resultUrlstring);
final String license = URLLicense.aquireLicense(image.imageUrl);
sb.loader.loadIfNotExistBackground(image.imageUrl, 1024 * 1024 * 10, null, ClientIdentification.yacyIntranetCrawlerAgent);
prop.putHTML("content_item_hrefCache", (auth) ? "/ViewImage.png?url=" + imageUrlstring : imageUrlstring);
prop.putHTML("content_item_href", imageUrlstring);
prop.putHTML("content_item_target", target);
prop.put("content_item_code", license);
prop.putHTML("content_item_name", shorten(ms.title(), MAX_NAME_LENGTH));
prop.put("content_item_mimetype", "");
prop.putHTML("content_item_name", shorten(image.imagetext, MAX_NAME_LENGTH));
prop.put("content_item_mimetype", image.mimetype);
prop.put("content_item_fileSize", 0);
prop.put("content_item_width", 0);
prop.put("content_item_height", 0);
prop.put("content_item_width", image.width);
prop.put("content_item_height", image.height);
prop.put("content_item_attr", ""/*(ms.attr.equals("-1 x -1")) ? "" : "(" + ms.attr + ")"*/); // attributes, here: original size of image
prop.put("content_item_urlhash", ASCII.String(ms.url().hash()));
prop.put("content_item_source", ms.url().toNormalform(true));
prop.putXML("content_item_source-xml", ms.url().toNormalform(true));
prop.put("content_item_sourcedom", ms.url().getHost());
prop.put("content_item_urlhash", ASCII.String(image.imageUrl.hash()));
prop.put("content_item_source", image.sourceUrl.toNormalform(true));
prop.putXML("content_item_source-xml", image.sourceUrl.toNormalform(true));
prop.put("content_item_sourcedom", image.sourceUrl.getHost());
prop.put("content_item_nl", (item == theSearch.query.offset) ? 0 : 1);
prop.put("content_item", 1);
} catch (MalformedURLException e) {
prop.put("content_item", "0");
}
theSearch.query.transmitcount = item + 1;
return prop;

@ -559,6 +559,13 @@ public final class SetTools {
return sb.toString();
}
/**
 * Returns the element at position {@code n} of a collection's iteration order.
 *
 * @param c the collection to pick from; may be {@code null}
 * @param n zero-based position in the iteration order of {@code c}
 * @return the n-th element, or {@code null} if {@code c} is {@code null}, {@code n} is negative
 *         or {@code n} is not smaller than the size of {@code c}
 */
public static Object nth(Collection<?> c, int n) {
    if (c == null || n < 0 || c.size() <= n) {
        return null;
    }
    // random-access lists can answer directly; this avoids a linear scan per lookup
    // when callers walk several parallel collections by index
    if (c instanceof java.util.List && c instanceof java.util.RandomAccess) {
        return ((java.util.List<?>) c).get(n);
    }
    int i = 0;
    for (Object o : c) {
        if (i++ == n) {
            return o;
        }
    }
    return null;
}
// ------------------------------------------------------------------------------------------------

@ -69,6 +69,7 @@ import net.yacy.cora.document.RSSFeed;
import net.yacy.cora.document.RSSMessage;
import net.yacy.cora.document.RSSReader;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.document.analysis.Classification;
import net.yacy.cora.federate.opensearch.SRURSSConnector;
import net.yacy.cora.federate.solr.connector.RemoteSolrConnector;
import net.yacy.cora.federate.solr.connector.SolrConnector;
@ -1017,14 +1018,17 @@ public final class Protocol {
solrQuery.setRows(count);
// set highlighting query attributes
solrQuery.setHighlight(true);
solrQuery.setHighlightFragsize(SearchEvent.SNIPPET_MAX_LENGTH);
//solrQuery.setHighlightRequireFieldMatch();
solrQuery.setHighlightSimplePost("</b>");
solrQuery.setHighlightSimplePre("<b>");
solrQuery.setHighlightSnippets(1);
for (CollectionSchema field: snippetFields) solrQuery.addHighlightField(field.getSolrFieldName());
if (event.query.contentdom == Classification.ContentDomain.TEXT || event.query.contentdom == Classification.ContentDomain.ALL) {
solrQuery.setHighlight(true);
solrQuery.setHighlightFragsize(SearchEvent.SNIPPET_MAX_LENGTH);
//solrQuery.setHighlightRequireFieldMatch();
solrQuery.setHighlightSimplePost("</b>");
solrQuery.setHighlightSimplePre("<b>");
solrQuery.setHighlightSnippets(1);
for (CollectionSchema field: snippetFields) solrQuery.addHighlightField(field.getSolrFieldName());
} else {
solrQuery.setHighlight(false);
}
boolean localsearch = target == null || target.equals(event.peers.mySeed());
if (localsearch && Switchboard.getSwitchboard().getConfigBool(SwitchboardConstants.DEBUG_SEARCH_REMOTE_SOLR_TESTLOCAL, false)) {
target = event.peers.mySeed();

@ -172,7 +172,7 @@ public class RemoteSearch extends Thread {
nodePeers.add(event.peers.mySeed());
}
if (!Switchboard.getSwitchboard().getConfigBool(SwitchboardConstants.DEBUG_SEARCH_REMOTE_SOLR_OFF, false)) {
final SolrQuery solrQuery = event.query.solrQuery(start == 0);
final SolrQuery solrQuery = event.query.solrQuery(event.getQuery().contentdom, start == 0);
for (Seed s: nodePeers) {
Thread t = solrRemoteSearch(event, solrQuery, start, count, s, blacklist);
event.nodeSearchThreads.add(t);

@ -32,10 +32,8 @@ import java.io.PrintWriter;
import java.net.InetAddress;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;

@ -179,6 +179,14 @@ public class QueryGoal {
return exclude_strings;
}
/**
 * Tests a text against the include and exclude strings of this goal, ignoring case.
 *
 * @param text the text to test; may be {@code null}
 * @return {@code true} if the text is non-empty, contains every include string
 *         and contains none of the exclude strings; {@code false} otherwise
 */
public boolean matches(String text) {
    if (text == null || text.length() == 0) {
        return false;
    }
    // Locale.ROOT keeps the case folding independent of the JVM default locale
    // (for example the Turkish dotless i would otherwise break ASCII matches)
    final String t = text.toLowerCase(java.util.Locale.ROOT);
    for (String i : this.include_strings) {
        if (t.indexOf(i.toLowerCase(java.util.Locale.ROOT)) < 0) {
            return false;
        }
    }
    for (String e : this.exclude_strings) {
        if (t.indexOf(e.toLowerCase(java.util.Locale.ROOT)) >= 0) {
            return false;
        }
    }
    return true;
}
/**
 * @return the list held in the {@code all_strings} field of this goal (the internal list, not a copy)
 */
public ArrayList<String> getAllStrings() {
    return this.all_strings;
}
@ -189,30 +197,22 @@ public class QueryGoal {
final HandleSet blues = Word.words2hashesHandles(blueList);
for (final byte[] b: blues) this.include_hashes.remove(b);
}
public StringBuilder collectionQueryString(CollectionConfiguration configuration, int rankingProfile) {
public StringBuilder collectionTextQueryString(CollectionConfiguration configuration, int rankingProfile) {
final StringBuilder q = new StringBuilder(80);
// add filter to prevent that results come from failed urls
q.append(CollectionSchema.httpstatus_i.getSolrFieldName()).append(":200").append(" AND (");
// parse special requests
if (include_strings.size() == 1 && exclude_strings.size() == 0) {
String w = include_strings.get(0);
if (Segment.catchallString.equals(w)) return new StringBuilder(AbstractSolrConnector.CATCHALL_TERM);
}
// add text query
// add goal query
int wc = 0;
StringBuilder w = new StringBuilder(80);
for (String s: include_strings) {
if (wc > 0) w.append(" AND ");
w.append(dq).append(s).append(dq);
wc++;
}
for (String s: exclude_strings){
if (wc > 0) w.append(" AND -");
w.append(dq).append(s).append(dq);
wc++;
}
if (wc > 1) {w.insert(0, '('); w.append(')');}
StringBuilder w = getGoalQuery();
// combine these queries for all relevant fields
wc = 0;
@ -231,14 +231,52 @@ public class QueryGoal {
q.append(')');
wc++;
}
q.insert(0, '(');
q.append(')');
return q;
}
/**
 * Builds the solr query string for an image search.
 * The query is restricted to documents with http status 200 that have a value in the
 * image url stub field, and matches the goal terms against the image alt texts (boost 20),
 * the image text field (boost 10) or the document text.
 *
 * @return the query string; if the goal consists of the single catch-all term,
 *         the catch-all solr term is returned instead
 */
public StringBuilder collectionImageQueryString() {
    // parse special requests first, so that nothing is built in vain for a catch-all query
    if (include_strings.size() == 1 && exclude_strings.size() == 0) {
        if (Segment.catchallString.equals(include_strings.get(0))) {
            return new StringBuilder(AbstractSolrConnector.CATCHALL_TERM);
        }
    }

    final StringBuilder q = new StringBuilder(80);

    // add filter to prevent that results come from failed urls
    q.append(CollectionSchema.httpstatus_i.getSolrFieldName()).append(":200").append(" AND ");
    // require a value in the image url field
    q.append(CollectionSchema.images_urlstub_sxt.getSolrFieldName()).append(":[* TO *]").append(" AND (");

    // add goal query
    final StringBuilder w = getGoalQuery();

    // combine these queries for all relevant fields
    q.append('(').append(CollectionSchema.images_alt_txt.getSolrFieldName()).append(':').append(w).append("^20.0) OR ");
    q.append('(').append(CollectionSchema.images_text_t.getSolrFieldName()).append(':').append(w).append("^10.0) OR ");
    q.append('(').append(CollectionSchema.text_t.getSolrFieldName()).append(':').append(w).append(')');
    q.append(')');
    return q;
}
/**
 * Builds the term part of a solr query from the include and exclude strings of this goal.
 * Every term is wrapped in the {@code dq} delimiter (presumably a double quote; defined elsewhere
 * in this class), terms are joined with {@code AND}, and every exclude string is negated with a
 * leading {@code -}.
 * NOTE(review): terms are inserted verbatim; a term containing the delimiter is not escaped.
 *
 * @return the term expression; it is enclosed in parentheses when it has more than one term
 *         or at least one negated term, so that it can be prefixed with {@code field:}
 */
private StringBuilder getGoalQuery() {
    int wc = 0;
    final StringBuilder w = new StringBuilder(80);
    for (String s : include_strings) {
        if (wc > 0) {
            w.append(" AND ");
        }
        w.append(dq).append(s).append(dq);
        wc++;
    }
    for (String s : exclude_strings) {
        if (wc > 0) {
            w.append(" AND ");
        }
        // the negation must be written for every exclude term, also when it is the very first term
        w.append('-').append(dq).append(s).append(dq);
        wc++;
    }
    if (wc > 1 || exclude_strings.size() > 0) {
        w.insert(0, '(');
        w.append(')');
    }
    return w;
}
}

@ -250,7 +250,7 @@ public final class QueryParams {
this.constraint = constraint;
this.allofconstraint = allofconstraint;
this.siteexcludes = siteexcludes != null && siteexcludes.isEmpty() ? null: siteexcludes;
this.snippetCacheStrategy = snippetCacheStrategy;
this.snippetCacheStrategy = contentdom == ContentDomain.TEXT ? snippetCacheStrategy : contentdom == null ? null : CacheStrategy.CACHEONLY;
this.clienthost = host;
this.remotepeer = null;
this.starttime = Long.valueOf(System.currentTimeMillis());
@ -376,26 +376,124 @@ public final class QueryParams {
return SetTools.anymatch(wordhashes, keyhashes);
}
public SolrQuery solrQuery(boolean getFacets) {
/**
 * Produces the solr query for the given content domain.
 *
 * @param cd the content domain; {@code IMAGE} selects the image query, anything else the text query
 * @param getFacets passed through to the selected query builder
 * @return the result of the selected query builder
 */
public SolrQuery solrQuery(ContentDomain cd, boolean getFacets) {
    final boolean imageSearch = cd == ContentDomain.IMAGE;
    return imageSearch ? solrImageQuery(getFacets) : solrTextQuery(getFacets);
}
/**
 * Builds (or re-uses) the solr query for a text search.
 * A previously built query is re-used with its start offset updated to the current offset.
 *
 * @param getFacets passed to {@code getBasicParams}; only evaluated when a new query is built
 * @return the solr query, or {@code null} if the query goal has no include strings
 */
private SolrQuery solrTextQuery(boolean getFacets) {
    if (this.cachedQuery != null) {
        this.cachedQuery.setStart(this.offset);
        return this.cachedQuery;
    }
    if (this.queryGoal.getIncludeStrings().size() == 0) {
        return null;
    }

    // construct query; defType, paging, sorting, filter and facet attributes come from getBasicParams
    final SolrQuery params = getBasicParams(getFacets);
    // for a by-date ranking or a site restriction select a different ranking profile
    final int rankingProfile = this.ranking.coeff_date == RankingProfile.COEFF_MAX ? 1 : (this.modifier.sitehash != null || this.modifier.sitehost != null) ? 2 : 0;
    params.setQuery(this.queryGoal.collectionTextQueryString(this.indexSegment.fulltext().getDefaultConfiguration(), rankingProfile).toString());

    // set boosts from the selected ranking profile
    final Ranking ranking = indexSegment.fulltext().getDefaultConfiguration().getRanking(rankingProfile);
    final String bq = ranking.getBoostQuery();
    final String bf = ranking.getBoostFunction();
    if (bq.length() > 0) {
        params.setParam("bq", bq);
    }
    if (bf.length() > 0) {
        params.setParam("boost", bf); // a boost function extension, see http://wiki.apache.org/solr/ExtendedDisMax#bf_.28Boost_Function.2C_additive.29
    }

    // NOTE: the per-content-domain file-extension filters (image/audio/video/app) that were
    // formerly appended to the filter query are disabled; they are not applied here.

    // prepare result
    ConcurrentLog.info("Protocol", "SOLR QUERY: " + params.toString());
    this.cachedQuery = params;
    return params;
}
/**
 * Builds (or re-uses) the solr query for an image search.
 * A previously built query is re-used with its start offset updated to the current offset.
 * New queries get a boost query on the file extensions jpg, tif, tiff and png.
 *
 * @param getFacets passed to {@code getBasicParams}; only evaluated when a new query is built
 * @return the solr query, or {@code null} if the query goal has no include strings
 */
private SolrQuery solrImageQuery(boolean getFacets) {
    final SolrQuery cached = this.cachedQuery;
    if (cached != null) {
        cached.setStart(this.offset);
        return cached;
    }
    if (this.queryGoal.getIncludeStrings().size() == 0) {
        return null;
    }

    // construct query
    final SolrQuery imageQuery = getBasicParams(getFacets);
    imageQuery.setQuery(this.queryGoal.collectionImageQueryString().toString());

    // set boosts: one OR-ed clause per file extension
    final String[] boostedExtensions = {"jpg", "tif", "tiff", "png"};
    final StringBuilder boost = new StringBuilder();
    for (int i = 0; i < boostedExtensions.length; i++) {
        if (i > 0) {
            boost.append(" OR ");
        }
        boost.append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"").append(boostedExtensions[i]).append("\"");
    }
    imageQuery.setParam("bq", boost.toString());

    // prepare result
    ConcurrentLog.info("Protocol", "SOLR QUERY: " + imageQuery.toString());
    this.cachedQuery = imageQuery;
    return imageQuery;
}
/**
 * Creates a solr query object with the attributes that text and image queries have in common:
 * the edismax parser, paging, an optional most-recent ordering, the filter query,
 * the facet attributes and the returned fields.
 *
 * @param getFacets if {@code true} and facet fields are configured, faceting is switched on
 * @return a new query object without a query string
 */
private SolrQuery getBasicParams(boolean getFacets) {
    final SolrQuery basic = new SolrQuery();
    basic.setParam("defType", "edismax");
    basic.setStart(this.offset);
    basic.setRows(this.itemsPerPage);
    basic.setFacet(false);

    // a maximal date coefficient asks for a most-recent ordering
    final boolean newestFirst = this.ranking.coeff_date == RankingProfile.COEFF_MAX;
    if (newestFirst) {
        basic.setSort(new SortClause(CollectionSchema.last_modified.getSolrFieldName(), SolrQuery.ORDER.desc));
    }

    // add the filter query, if there is one
    final String filter = getFacets();
    if (!filter.isEmpty()) {
        basic.setFilterQueries(filter);
    }

    // set facet query attributes
    final boolean facetsWanted = getFacets && this.facetfields.size() > 0;
    if (!facetsWanted) {
        basic.setFacet(false);
    } else {
        basic.setFacet(true);
        basic.setFacetLimit(this.maxfacets);
        basic.setFacetSort(FacetParams.FACET_SORT_COUNT);
        basic.setParam(FacetParams.FACET_METHOD, FacetParams.FACET_METHOD_fcs);
        for (String facetField : this.facetfields) {
            basic.addFacetField(facetField);
        }
    }

    // we need the score for post-ranking
    basic.setFields("*", "score");
    return basic;
}
private String getFacets() {
// add site facets
final StringBuilder fq = new StringBuilder();
@ -441,34 +539,6 @@ public final class QueryParams {
fq.append(" AND ").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"").append(this.modifier.filetype).append('\"');
}
if (this.contentdom == ContentDomain.IMAGE) {
fq.append(" AND (").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"jpg\"");
fq.append(" OR ").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"tif\"");
fq.append(" OR ").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"tiff\"");
fq.append(" OR ").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"png\")");
}
if (this.contentdom == ContentDomain.AUDIO) {
fq.append(" AND (").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"aif\"");
fq.append(" OR ").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"aiff\"");
fq.append(" OR ").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"mp3\"");
fq.append(" OR ").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"ogg\")");
}
if (this.contentdom == ContentDomain.VIDEO) {
fq.append(" AND (").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"mpg\"");
fq.append(" OR ").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"avi\"");
fq.append(" OR ").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"mp4\"");
fq.append(" OR ").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"mkv\")");
}
if (this.contentdom == ContentDomain.APP) {
fq.append(" AND (").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"apk\"");
fq.append(" OR ").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"exe\"");
fq.append(" OR ").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"dmg\"");
fq.append(" OR ").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"gz\")");
}
if (this.inlink != null) {
fq.append(" AND ").append(CollectionSchema.outboundlinks_urlstub_sxt.getSolrFieldName()).append(":\"").append(this.inlink).append('\"');
}
@ -495,45 +565,15 @@ public final class QueryParams {
//params.set("d", GeoLocation.degreeToKm(this.radius));
fq.append(" AND ").append("{!bbox sfield=" + CollectionSchema.coordinate_p.getSolrFieldName() + " pt=" + Double.toString(this.lat) + "," + Double.toString(this.lon) + " d=" + GeoLocation.degreeToKm(this.radius) + "}");
//params.setRows(Integer.MAX_VALUE);
} else {
// set ranking
if (this.ranking.coeff_date == RankingProfile.COEFF_MAX) {
// set a most-recent ordering
params.setSort(new SortClause(CollectionSchema.last_modified.getSolrFieldName(), SolrQuery.ORDER.desc));
//params.setSortField(CollectionSchema.last_modified.getSolrFieldName(), ORDER.desc); // deprecated in Solr 4.2
}
}
if (this.modifier.collection != null && this.modifier.collection.length() > 0) {
fq.append(" AND ").append(QueryModifier.parseCollectionExpression(this.modifier.collection));
}
if (fq.length() > 0) {
params.setFilterQueries(fq.substring(5));
}
params.setStart(offset);
params.setRows(itemsPerPage);
// set facet query attributes
if (getFacets && this.facetfields.size() > 0) {
params.setFacet(true);
params.setFacetLimit(this.maxfacets);
params.setFacetSort(FacetParams.FACET_SORT_COUNT);
params.setParam(FacetParams.FACET_METHOD, FacetParams.FACET_METHOD_fcs);
for (String field: this.facetfields) params.addFacetField(field);
} else {
params.setFacet(false);
}
params.setFields("*", "score"); // we need the score for post-ranking
// prepare result
ConcurrentLog.info("Protocol", "SOLR QUERY: " + params.toString());
this.cachedQuery = params;
return params;
return fq.length() > 0 ? fq.substring(5) : fq.toString();
}
/**
 * @return the query goal object held by these query parameters
 */
public QueryGoal getQueryGoal() {
    return queryGoal;
}

@ -26,11 +26,13 @@
package net.yacy.search.query;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.ConcurrentModificationException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.SortedMap;
@ -66,6 +68,7 @@ import net.yacy.document.Condenser;
import net.yacy.document.LargeNumberCache;
import net.yacy.document.LibraryProvider;
import net.yacy.document.TextParser;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.Word;
@ -77,6 +80,7 @@ import net.yacy.kelondro.rwi.ReferenceContainer;
import net.yacy.kelondro.rwi.TermSearch;
import net.yacy.kelondro.util.Bitfield;
import net.yacy.kelondro.util.MemoryControl;
import net.yacy.kelondro.util.SetTools;
import net.yacy.peers.RemoteSearch;
import net.yacy.peers.SeedDB;
import net.yacy.peers.graphics.ProfilingGraph;
@ -278,7 +282,7 @@ public final class SearchEvent {
// start a local solr search
if (!Switchboard.getSwitchboard().getConfigBool(SwitchboardConstants.DEBUG_SEARCH_LOCAL_SOLR_OFF, false)) {
this.localsolrsearch = RemoteSearch.solrRemoteSearch(this, this.query.solrQuery(true), 0, this.query.itemsPerPage, null /*this peer*/, Switchboard.urlBlacklist);
this.localsolrsearch = RemoteSearch.solrRemoteSearch(this, this.query.solrQuery(this.query.contentdom, true), 0, this.query.itemsPerPage, null /*this peer*/, Switchboard.urlBlacklist);
}
this.localsolroffset = this.query.itemsPerPage;
@ -1322,7 +1326,6 @@ public final class SearchEvent {
return new ResultEntry(page, this.query.getSegment(), this.peers, null, null, 0); // result without snippet
}
public ResultEntry oneResult(final int item, final long timeout) {
// check if we already retrieved this item
// (happens if a search pages is accessed a second time)
@ -1337,7 +1340,7 @@ public final class SearchEvent {
int nextitems = item - this.localsolroffset + this.query.itemsPerPage; // example: suddenly switch to item 60, just 10 had been shown, 20 loaded.
if (this.localsolrsearch != null && this.localsolrsearch.isAlive()) {try {this.localsolrsearch.join();} catch (final InterruptedException e) {}}
if (!Switchboard.getSwitchboard().getConfigBool(SwitchboardConstants.DEBUG_SEARCH_LOCAL_SOLR_OFF, false)) {
this.localsolrsearch = RemoteSearch.solrRemoteSearch(this, this.query.solrQuery(this.localsolroffset == 0), this.localsolroffset, nextitems, null /*this peer*/, Switchboard.urlBlacklist);
this.localsolrsearch = RemoteSearch.solrRemoteSearch(this, this.query.solrQuery(this.query.contentdom, this.localsolroffset == 0), this.localsolroffset, nextitems, null /*this peer*/, Switchboard.urlBlacklist);
}
this.localsolroffset += nextitems;
}
@ -1358,7 +1361,7 @@ public final class SearchEvent {
if (this.localsolrsearch == null || !this.localsolrsearch.isAlive() && this.local_solr_stored.get() > this.localsolroffset && (item + 1) % this.query.itemsPerPage == 0) {
// at the end of a list, trigger a next solr search
if (!Switchboard.getSwitchboard().getConfigBool(SwitchboardConstants.DEBUG_SEARCH_LOCAL_SOLR_OFF, false)) {
this.localsolrsearch = RemoteSearch.solrRemoteSearch(this, this.query.solrQuery(this.localsolroffset == 0), this.localsolroffset, this.query.itemsPerPage, null /*this peer*/, Switchboard.urlBlacklist);
this.localsolrsearch = RemoteSearch.solrRemoteSearch(this, this.query.solrQuery(this.query.contentdom, this.localsolroffset == 0), this.localsolroffset, this.query.itemsPerPage, null /*this peer*/, Switchboard.urlBlacklist);
}
this.localsolroffset += this.query.itemsPerPage;
}
@ -1370,6 +1373,81 @@ public final class SearchEvent {
return null;
}
private LinkedHashMap<String, ImageResult> imageViewed = new LinkedHashMap<String, ImageResult>();
private LinkedHashMap<String, ImageResult> imageSpare = new LinkedHashMap<String, ImageResult>();
/**
 * Looks up an already viewed image by its position in the insertion order of the viewed images.
 *
 * @param item zero-based position
 * @return the image at that position, or {@code null} if {@code SetTools.nth} finds none
 */
private ImageResult nthImage(int item) {
    // casting a null reference is harmless, so no separate null check is needed
    return (ImageResult) SetTools.nth(this.imageViewed.values(), item);
}
/**
 * Moves the oldest entry of the spare images to the viewed images and returns it.
 * The spare map must not be empty; the iterator throws otherwise.
 *
 * @return the image that was moved
 */
private ImageResult nextSpare() {
    final Iterator<Map.Entry<String, ImageResult>> spares = this.imageSpare.entrySet().iterator();
    final Map.Entry<String, ImageResult> oldest = spares.next();
    final String id = oldest.getKey();
    final ImageResult promoted = oldest.getValue();
    spares.remove();
    this.imageViewed.put(id, promoted);
    return promoted;
}
/**
 * Returns the image result for a result position.
 * Already viewed images are served from the viewed map and pending ones from the spare map.
 * Otherwise the document result at {@code item} is fetched and its images are collected into the
 * spare map: every embedded image whose url stub or alt text matches the query goal, the document
 * itself if its file name has an image extension, and the first embedded image of the document.
 *
 * @param item the result position
 * @param timeout passed to {@code oneResult}
 * @return the next image result
 * @throws MalformedURLException if no document result is available or no image could be derived from it
 */
public ImageResult oneImageResult(final int item, final long timeout) throws MalformedURLException {
    if (item < imageViewed.size()) return nthImage(item);
    if (imageSpare.size() > 0) return nextSpare();
    final ResultEntry ms = oneResult(item, timeout);
    if (ms == null) throw new MalformedURLException("nUll");

    // the image attributes are stored as parallel multi-valued fields; copy them once for indexed access
    final SolrDocument doc = ms.getNode().getDocument();
    final List<Object> alt = fieldValueList(doc, CollectionSchema.images_alt_txt);
    final List<Object> img = fieldValueList(doc, CollectionSchema.images_urlstub_sxt);
    final List<Object> prt = fieldValueList(doc, CollectionSchema.images_protocol_sxt);
    final List<Object> heights = fieldValueList(doc, CollectionSchema.images_height_val);
    final List<Object> widths = fieldValueList(doc, CollectionSchema.images_width_val);
    final int fileSize = 0; // not available from the index fields read here

    // check if the match was made in the url or in the image links
    for (int c = 0; c < img.size(); c++) {
        final String stub = (String) img.get(c);
        final String a = c < alt.size() ? (String) alt.get(c) : "";
        if (!query.getQueryGoal().matches(stub) && !query.getQueryGoal().matches(a)) continue;
        try {
            final DigestURI imageUrl = imageUrlAt(img, prt, c);
            // width and height are read per image so that values never leak from one image to another
            addSpareImage(ASCII.String(imageUrl.hash()), new ImageResult(ms.url(), imageUrl, "", a, intAt(widths, c), intAt(heights, c), fileSize));
        } catch (final MalformedURLException e) {
            // skip this image only; the index c still advances, which keeps the parallel fields aligned
        }
    }

    // the document itself may be an image; its dimensions are unknown here
    if (MultiProtocolURI.isImage(MultiProtocolURI.getFileExtension(ms.url().getFileName()))) {
        addSpareImage(ASCII.String(ms.hash()), new ImageResult(ms.url(), ms.url(), "", ms.title(), 0, 0, fileSize));
    }

    // always offer the first image of the document
    if (img.size() > 0) {
        try {
            final DigestURI imageUrl = imageUrlAt(img, prt, 0);
            final String imagetext = alt.size() > 0 ? (String) alt.get(0) : "";
            addSpareImage(ASCII.String(imageUrl.hash()), new ImageResult(ms.url(), imageUrl, "", imagetext, intAt(widths, 0), intAt(heights, 0), fileSize));
        } catch (final MalformedURLException e) {
            // a broken first image url must not discard the images that were collected above
        }
    }

    if (imageSpare.size() > 0) return nextSpare();
    throw new MalformedURLException("no image url found");
}

/** Copies the values of a multi-valued document field into a list; a missing field gives an empty list. */
private static List<Object> fieldValueList(final SolrDocument doc, final CollectionSchema field) {
    final Collection<Object> values = doc.getFieldValues(field.getSolrFieldName());
    return values == null ? new ArrayList<Object>(0) : new ArrayList<Object>(values);
}

/** Reads a numeric value at an index; a missing or non-numeric value gives 0. */
private static int intAt(final List<Object> values, final int index) {
    if (index >= values.size()) return 0;
    final Object value = values.get(index);
    return value instanceof Number ? ((Number) value).intValue() : 0;
}

/** Joins protocol and url stub at an index to an image url; the protocol defaults to http. */
private static DigestURI imageUrlAt(final List<Object> stubs, final List<Object> protocols, final int index) throws MalformedURLException {
    final Object protocol = index < protocols.size() ? protocols.get(index) : "http";
    return new DigestURI(protocol + "://" + stubs.get(index));
}

/** Stores an image as spare unless an image with the same id is already viewed or spare. */
private void addSpareImage(final String id, final ImageResult image) {
    if (!imageViewed.containsKey(id) && !imageSpare.containsKey(id)) imageSpare.put(id, image);
}
public class ImageResult {
public DigestURI imageUrl, sourceUrl;
public String mimetype = "", imagetext = "";
public int width = 0, height = 0, fileSize = 0;
public ImageResult(DigestURI sourceUrl, DigestURI imageUrl, String mimetype, String imagetext, int width, int height, int fileSize) {
this.sourceUrl = sourceUrl;
this.imageUrl = imageUrl;
this.mimetype = mimetype;
this.imagetext = imagetext;
this.width = width;
this.height = height;
this.fileSize = fileSize;
}
}
public ArrayList<WeakPriorityBlockingQueue.Element<ResultEntry>> completeResults(final long waitingtime) {
final long timeout = waitingtime == Long.MAX_VALUE ? Long.MAX_VALUE : System.currentTimeMillis() + waitingtime;
int i = 0;

@ -122,6 +122,9 @@ public class ResultEntry implements Comparable<ResultEntry>, Comparator<ResultEn
ResultEntry other = (ResultEntry) obj;
return Base64Order.enhancedCoder.equal(this.urlentry.hash(), other.urlentry.hash());
}
/**
 * @return the metadata node ({@code urlentry}) that backs this result entry
 */
public URIMetadataNode getNode() {
    return urlentry;
}
/**
 * @return the hash of the metadata node that backs this result entry
 */
public byte[] hash() {
    final byte[] urlhash = this.urlentry.hash();
    return urlhash;
}

Loading…
Cancel
Save