hack to get more results when doing a remote site search

pull/1/head
Michael Peter Christen 11 years ago
parent 81f9b34da7
commit 475125f9d7

@ -915,6 +915,7 @@ public final class Protocol {
}
final HTTPClient httpClient = new HTTPClient(ClientIdentification.yacyInternetCrawlerAgent, 8000);
//System.out.println("Protocol: http://" + hostaddress + "/yacy/search.html" + requestPartsToString(parts)); // DEBUG
byte[] a = httpClient.POSTbytes(new MultiProtocolURL("http://" + hostaddress + "/yacy/search.html"), hostname, parts, false, true);
if (a != null && a.length > 200000) {
// there is something wrong. This is too large, maybe a hack on the other side?
@ -1762,4 +1763,17 @@ public final class Protocol {
return parts;
}
/**
 * Renders multipart request parts as a URL-style query string of the form
 * {@code ?key1=value1&key2=value2}, intended for debug output of a request.
 * <p>
 * Each part body is written into a memory buffer and converted with
 * {@code ASCII.String}; keys and values are appended verbatim (no URL-encoding
 * is applied). Parts whose body cannot be written are left out of the result.
 *
 * @param parts the request parts keyed by parameter name; may be {@code null} or empty
 * @return the query string, always starting with {@code ?}; just {@code "?"} when
 *         there is no part to render
 */
public static String requestPartsToString(Map<String, ContentBody> parts) {
    // Start with the '?' prefix so that an empty result never needs substring(1),
    // which threw StringIndexOutOfBoundsException when nothing had been appended.
    final StringBuilder sb = new StringBuilder("?");
    if (parts == null) {
        return sb.toString();
    }
    for (final Map.Entry<String, ContentBody> part : parts.entrySet()) {
        try {
            final ByteArrayOutputStream baos = new ByteArrayOutputStream();
            part.getValue().writeTo(baos);
            // Convert before appending so a failing part leaves no partial "key=" behind.
            final String value = ASCII.String(baos.toByteArray());
            if (sb.length() > 1) {
                sb.append('&');
            }
            sb.append(part.getKey()).append('=').append(value);
        } catch (final IOException ignored) {
            // Best effort: this string is diagnostic only, so a part that
            // cannot be serialized is skipped instead of failing the caller.
        }
    }
    return sb.toString();
}
}

@ -447,7 +447,7 @@ public final class SearchEvent {
// so following sortings together with the global results will be fast
try {
final long timer = System.currentTimeMillis();
final TermSearch<WordReference> search =
TermSearch<WordReference> search =
SearchEvent.this.query
.getSegment()
.termIndex()
@ -458,18 +458,41 @@ public final class SearchEvent {
Segment.wordReferenceFactory,
SearchEvent.this.query.maxDistance);
SearchEvent.this.localSearchInclusion = search.inclusion();
final ReferenceContainer<WordReference> index = search.joined();
EventTracker.update(
EventTracker.EClass.SEARCH,
new ProfilingGraph.EventSearch(
SearchEvent.this.query.id(true),
SearchEventType.JOIN,
SearchEvent.this.query.getQueryGoal().getQueryString(false),
index.size(),
System.currentTimeMillis() - timer),
false);
ReferenceContainer<WordReference> index = search.joined();
if ( !index.isEmpty() ) {
addRWIs(index, true, "local index: " + SearchEvent.this.query.getSegment().getLocation(), index.size(), SearchEvent.this.maxtime);
int successcount = addRWIs(index, true, "local index: " + SearchEvent.this.query.getSegment().getLocation(), index.size(), SearchEvent.this.maxtime);
if (successcount == 0 &&
SearchEvent.this.query.getQueryGoal().getIncludeHashes().has(Segment.catchallHash) &&
SearchEvent.this.query.modifier.sitehost != null && SearchEvent.this.query.modifier.sitehost.length() > 0
) {
// try again with sitehost
String[] hp = SearchEvent.this.query.modifier.sitehost.split("\\.");
String newGoal = hp.length <= 1 ? SearchEvent.this.query.modifier.sitehost : hp.length == 2 ? hp[0] : hp[hp.length - 2].length() == 2 ? hp[hp.length - 3] : hp[hp.length - 2];
search =
SearchEvent.this.query
.getSegment()
.termIndex()
.query(
QueryParams.hashes2Set(ASCII.String(Word.word2hash(newGoal))),
SearchEvent.this.query.getQueryGoal().getExcludeHashes(),
null,
Segment.wordReferenceFactory,
SearchEvent.this.query.maxDistance);
SearchEvent.this.localSearchInclusion = search.inclusion();
index = search.joined();
if (!index.isEmpty()) {
successcount = addRWIs(index, true, "local index: " + SearchEvent.this.query.getSegment().getLocation(), index.size(), SearchEvent.this.maxtime);
}
}
EventTracker.update(
EventTracker.EClass.SEARCH,
new ProfilingGraph.EventSearch(
SearchEvent.this.query.id(true),
SearchEventType.JOIN,
SearchEvent.this.query.getQueryGoal().getQueryString(false),
successcount,
System.currentTimeMillis() - timer),
false);
SearchEvent.this.addFinalize();
}
} catch (final Exception e ) {
@ -480,7 +503,7 @@ public final class SearchEvent {
}
}
public void addRWIs(
public int addRWIs(
final ReferenceContainer<WordReference> index,
final boolean local,
final String resourceName,
@ -492,7 +515,7 @@ public final class SearchEvent {
this.addRunning = true;
assert (index != null);
if (index.isEmpty()) return;
if (index.isEmpty()) return 0;
if (local) {
assert fullResource >= 0 : "fullResource = " + fullResource;
this.local_rwi_stored.addAndGet(fullResource);
@ -519,9 +542,14 @@ public final class SearchEvent {
// apply all constraints
long timeout = maxtime == Long.MAX_VALUE ? Long.MAX_VALUE : System.currentTimeMillis() + maxtime;
int successcounter = 0;
try {
WordReferenceVars iEntry;
long remaining;
String acceptableAlternativeSitehash = null;
if (this.query.modifier.sitehost != null && this.query.modifier.sitehost.length() > 0) try {
acceptableAlternativeSitehash = DigestURL.hosthash(this.query.modifier.sitehost.startsWith("www.") ? this.query.modifier.sitehost.substring(4) : "www." + this.query.modifier.sitehost, 80);
} catch (MalformedURLException e1) {}
pollloop: while ( true ) {
remaining = timeout - System.currentTimeMillis();
if (remaining <= 0) {
@ -578,7 +606,7 @@ public final class SearchEvent {
}
} else {
// filter out all domains that do not match with the site constraint
if (!hosthash.equals(this.query.modifier.sitehash)) {
if (!hosthash.equals(this.query.modifier.sitehash) && (acceptableAlternativeSitehash == null || !hosthash.equals(acceptableAlternativeSitehash))) {
if (log.isFine()) log.fine("dropped RWI: modifier.sitehash");
continue pollloop;
}
@ -598,6 +626,7 @@ public final class SearchEvent {
}
// increase counter for statistics
if (local) this.local_rwi_available.incrementAndGet(); else this.remote_rwi_available.incrementAndGet();
successcounter++;
}
if (System.currentTimeMillis() >= timeout) ConcurrentLog.warn("SearchEvent", "rwi normalization ended with timeout = " + maxtime);
@ -612,6 +641,7 @@ public final class SearchEvent {
resourceName,
index.size(),
System.currentTimeMillis() - timer), false);
return successcounter;
}
public long getEventTime() {

Loading…
Cancel
Save