enhanced search result processing behavior

- query less at one time; query more often
- evaluate the results in between the small queries
- remove fields from search results that are not needed
pull/1/head
Michael Peter Christen 12 years ago
parent bf512e6350
commit d48e9788d2

@@ -115,7 +115,7 @@ public class searchresult {
         post.put(CommonParams.ROWS, post.remove("num"));
         post.put(CommonParams.ROWS, Math.min(post.getInt(CommonParams.ROWS, 10), (authenticated) ? 5000 : 100));
         post.put("defType", "edismax");
-        post.put("bq", YaCySchema.fuzzy_signature_unique_b.getSolrFieldName() + ":true^100000.0"); // a bost query that moves double content to the back
+        post.put("bq", YaCySchema.fuzzy_signature_unique_b.getSolrFieldName() + ":true^100000.0"); // a boost query that moves double content to the back
         post.put(CommonParams.FL,
                 YaCySchema.content_type.getSolrFieldName() + ',' +
                 YaCySchema.id.getSolrFieldName() + ',' +
@@ -145,7 +145,7 @@ public class searchresult {
         // add sites operator
         if (site != null && site.length() > 0) {
-            String[] s0 = site.split(Pattern.quote("|"));
+            String[] s0 = Pattern.compile(Pattern.quote("|")).split(site, 0);
             ArrayList<String> sites = new ArrayList<String>(2);
             for (String s: s0) {
                 s = s.trim().toLowerCase();
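
The "|" separator of the site parameter is a regex metacharacter, which is why both the old and the new form wrap it in Pattern.quote before splitting. A small illustration, not part of the commit (the site value is a made-up example):

    import java.util.regex.Pattern;

    public class SiteSplitDemo {
        public static void main(String[] args) {
            String site = "example.com|example.org|example.net";
            // without quoting, "|" is an alternation of two empty patterns and splits between every character
            String[] unquoted = site.split("|");
            // quoting treats "|" as a literal separator
            String[] quoted = Pattern.compile(Pattern.quote("|")).split(site, 0);
            System.out.println(unquoted.length + " pieces vs. " + quoted.length + " pieces");
        }
    }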

@@ -83,9 +83,10 @@ public class GSAResponseWriter implements QueryResponseWriter {
     // pre-select a set of YaCy schema fields for the solr searcher which should cause a better caching
     private static final YaCySchema[] extrafields = new YaCySchema[]{
-        YaCySchema.id, YaCySchema.title, YaCySchema.description, YaCySchema.text_t,
-        YaCySchema.h1_txt, YaCySchema.h2_txt, YaCySchema.h3_txt, YaCySchema.h4_txt, YaCySchema.h5_txt, YaCySchema.h6_txt,
-        };
+        YaCySchema.id, YaCySchema.sku, YaCySchema.title, YaCySchema.description,
+        YaCySchema.last_modified, YaCySchema.load_date_dt, YaCySchema.size_i, YaCySchema.language_s
+        };
     private static final Set<String> SOLR_FIELDS = new HashSet<String>();
     static {
         field2tag.put(YaCySchema.language_s.getSolrFieldName(), GSAToken.LANG.name());
@@ -278,17 +279,6 @@ public class GSAResponseWriter implements QueryResponseWriter {
                 //texts.add(value.stringValue());
                 continue;
             }
-            if (YaCySchema.text_t.getSolrFieldName().equals(fieldName)) {
-                //texts.add(value.stringValue());
-                continue;
-            }
-            if (YaCySchema.h1_txt.getSolrFieldName().equals(fieldName) || YaCySchema.h2_txt.getSolrFieldName().equals(fieldName) ||
-                YaCySchema.h3_txt.getSolrFieldName().equals(fieldName) || YaCySchema.h4_txt.getSolrFieldName().equals(fieldName) ||
-                YaCySchema.h5_txt.getSolrFieldName().equals(fieldName) || YaCySchema.h6_txt.getSolrFieldName().equals(fieldName)) {
-                // because these are multi-valued fields, there can be several of each
-                //texts.add(value.stringValue());
-                continue;
-            }
             if (YaCySchema.size_i.getSolrFieldName().equals(fieldName)) {
                 size = value.stringValue() != null && value.stringValue().length() > 0 ? Integer.parseInt(value.stringValue()) : -1;
                 continue;
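
The two GSAResponseWriter hunks implement the third bullet of the commit message: fields that the response writer never renders (the full text and the h1..h6 headings) are no longer requested from Solr. A minimal sketch of the same idea in plain SolrJ, outside the YaCy code base; the field names mirror the YaCySchema entries above and the query string is a placeholder:

    import org.apache.solr.client.solrj.SolrQuery;

    public class FieldTrimSketch {
        // request only the fields the response writer actually needs;
        // everything else stays on the Solr side and is never transferred
        public static SolrQuery buildTrimmedQuery(String queryString) {
            SolrQuery query = new SolrQuery(queryString);
            query.setFields("id", "sku", "title", "description",
                            "last_modified", "load_date_dt", "size_i", "language_s");
            query.setRows(10); // small result window, see the paging change further down
            return query;
        }
    }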

@@ -513,7 +513,7 @@ public final class CrawlSwitchboard {
             String handle = r.profileHandle();
             RowHandleSet us = this.profilesActiveCrawlsCounter.get(handle);
             if (us == null) {us = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0); this.profilesActiveCrawlsCounter.put(handle, us);}
-            us.put(r.url().hash());
+            if (us.size() < 100) us.put(r.url().hash()); // store the hash, but not too many
             deletionCandidate.remove(handle);
             if (deletionCandidate.size() == 0) return 0;
             if (System.currentTimeMillis() > timeout) return 0; // give up; this is too large
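
The CrawlSwitchboard change bounds the per-profile bookkeeping kept while scanning active crawl entries. A generic illustration of the pattern, not tied to RowHandleSet (the class and field names here are assumptions; the limit of 100 mirrors the hunk above):

    import java.util.HashSet;
    import java.util.Set;

    public class BoundedSample {
        private static final int LIMIT = 100; // same cap as in the hunk above
        private final Set<String> hashes = new HashSet<String>();

        // remember a URL hash for later checks, but never let the sample grow without bound
        public void record(String urlHash) {
            if (hashes.size() < LIMIT) hashes.add(urlHash);
        }
    }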

@@ -1030,6 +1030,7 @@ public final class Protocol
             final SearchEvent event,
             final int offset,
             final int count,
+            boolean getFacets,
             final Seed target,
             final Blacklist blacklist) {
@@ -1044,11 +1045,13 @@ public final class Protocol
         solrQuery.setRows(count);
         // set facet query attributes
-        if (event.query.facetfields.length > 0) {
+        if (getFacets && event.query.facetfields.length > 0) {
             solrQuery.setFacet(true);
             solrQuery.setFacetLimit(event.query.maxfacets);
             solrQuery.setFacetSort(FacetParams.FACET_SORT_COUNT);
             for (String field: event.query.facetfields) solrQuery.addFacetField(field);
+        } else {
+            solrQuery.setFacet(false);
         }
         // set highlightning query attributes
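
Facet counting is comparatively expensive on the remote side, so the new getFacets flag lets a caller request facets only when they are needed; with the paging change below, that is only the first window of a remote query. A self-contained sketch of the same toggle in plain SolrJ (field names and the limit are placeholders, not YaCy values):

    import org.apache.solr.client.solrj.SolrQuery;
    import org.apache.solr.common.params.FacetParams;

    public class FacetToggleSketch {
        public static SolrQuery build(String queryString, boolean getFacets, String[] facetFields, int maxFacets) {
            SolrQuery solrQuery = new SolrQuery(queryString);
            if (getFacets && facetFields.length > 0) {
                solrQuery.setFacet(true);
                solrQuery.setFacetLimit(maxFacets);
                solrQuery.setFacetSort(FacetParams.FACET_SORT_COUNT);
                for (String field : facetFields) solrQuery.addFacetField(field);
            } else {
                solrQuery.setFacet(false); // follow-up windows skip facet counting entirely
            }
            return solrQuery;
        }
    }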

@@ -253,36 +253,40 @@ public class RemoteSearch extends Thread {
             final Seed targetPeer,
             final Blacklist blacklist) {
         // check own peer status
         if (event.peers.mySeed() == null || event.peers.mySeed().getPublicAddress() == null) { return null; }
         // prepare seed targets and threads
         if (targetPeer != null && targetPeer.hash != null && event.preselectedPeerHashes != null) targetPeer.setAlternativeAddress(event.preselectedPeerHashes.get(ASCII.getBytes(targetPeer.hash)));
         Thread solr = new Thread() {
             @Override
             public void run() {
-                event.rankingProcess.oneFeederStarted();
-                try {
-                    int urls = Protocol.solrQuery(
-                            event,
-                            0,
-                            count,
-                            targetPeer,
-                            blacklist);
-                    if (urls >= 0) {
-                        // urls is an array of url hashes. this is only used for log output
-                        event.peers.mySeed().incRI(urls);
-                        event.peers.mySeed().incRU(urls);
-                    } else {
-                        if (targetPeer != null) {
-                            Network.log.logInfo("REMOTE SEARCH - no answer from remote peer " + targetPeer.hash + ":" + targetPeer.getName());
+                int tmpoffset = 0;
+                int tmpcount = 10;
+                while (tmpoffset + tmpcount <= count) {
+                    try {
+                        event.rankingProcess.oneFeederStarted();
+                        int urls = Protocol.solrQuery(
+                                event,
+                                tmpoffset,
+                                tmpcount,
+                                tmpoffset == 0,
+                                targetPeer,
+                                blacklist);
+                        if (urls >= 0) {
+                            // urls is an array of url hashes. this is only used for log output
+                            event.peers.mySeed().incRI(urls);
+                            event.peers.mySeed().incRU(urls);
+                        } else {
+                            if (targetPeer != null) {
+                                Network.log.logInfo("REMOTE SEARCH - no answer from remote peer " + targetPeer.hash + ":" + targetPeer.getName());
+                            }
                         }
+                    } catch (final Exception e) {
+                        Log.logException(e);
+                    } finally {
+                        event.rankingProcess.oneFeederTerminated();
                     }
-                } catch (final Exception e) {
-                    Log.logException(e);
-                } finally {
-                    event.rankingProcess.oneFeederTerminated();
+                    tmpoffset += tmpcount;
                 }
             }
         };
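
This hunk is the core of the first two bullets of the commit message: instead of asking the remote peer for all `count` results at once, the thread requests windows of 10, and the ranking process can evaluate each window as it arrives. A condensed sketch of the pattern with the YaCy plumbing replaced by a hypothetical fetch callback (ChunkFetcher and the method names are assumptions, not part of the commit):

    public class WindowedQuerySketch {

        /** stands in for Protocol.solrQuery(...); returns the number of URLs received, or -1 on no answer */
        public interface ChunkFetcher {
            int fetch(int offset, int count, boolean withFacets) throws Exception;
        }

        public static void run(ChunkFetcher fetcher, int count) {
            int tmpoffset = 0;
            int tmpcount = 10;                      // small window: query less at one time, query more often
            while (tmpoffset + tmpcount <= count) {
                try {
                    // facets are only worth computing once, on the first window
                    int urls = fetcher.fetch(tmpoffset, tmpcount, tmpoffset == 0);
                    if (urls < 0) System.out.println("no answer for window at offset " + tmpoffset);
                } catch (Exception e) {
                    e.printStackTrace();            // an error in one window does not abort the others
                }
                tmpoffset += tmpcount;              // advance to the next window
            }
        }
    }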

@@ -430,7 +430,7 @@ public final class QueryParams {
         // construct query
         final SolrQuery params = new SolrQuery();
         params.setParam("defType", "edismax");
-        params.setParam("bq", YaCySchema.fuzzy_signature_unique_b.getSolrFieldName() + ":true^100000.0"); // a bost query that moves double content to the back
+        params.setParam("bq", YaCySchema.fuzzy_signature_unique_b.getSolrFieldName() + ":true^100000.0"); // a boost query that moves double content to the back
         params.setStart(this.offset);
         params.setRows(this.itemsPerPage);
         params.setFacet(false);

@@ -878,7 +878,12 @@ public final class SearchEvent {
     }
     public ResultEntry oneResult(final int item, final long timeout) {
-        if (this.localsearch != null && this.localsearch.isAlive()) try {this.localsearch.join();} catch (InterruptedException e) {}
+        // if there is not yet a worker alive, start one
+        if (!anyWorkerAlive()) {
+            deployWorker(Math.min(SNIPPET_WORKER_THREADS, this.query.itemsPerPage), this.query.neededResults());
+        }
+        // wait until local data is there
+        while (this.localsearch != null && this.localsearch.isAlive() && this.result.sizeAvailable() < item) try {this.localsearch.join(10);} catch (InterruptedException e) {}
         // check if we already retrieved this item
         // (happens if a search pages is accessed a second time)
         final long finishTime = System.currentTimeMillis() + timeout;
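
The SearchEvent change fits the second bullet: before a result item is handed out, a snippet worker is started if none is running, and the caller waits in short slices until the local search has produced enough entries instead of blocking on an unbounded join(). A stripped-down sketch of that wait loop; the AtomicInteger stands in for this.result.sizeAvailable(), which is YaCy-internal:

    import java.util.concurrent.atomic.AtomicInteger;

    public class LocalResultWaitSketch {
        // wait in 10 ms slices while the producer thread is alive and has not yet delivered `item` results
        public static void await(Thread localsearch, AtomicInteger available, int item) {
            while (localsearch != null && localsearch.isAlive() && available.get() < item) {
                try {
                    localsearch.join(10);
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                    return;
                }
            }
        }
    }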
