better ranking because we wait a little more time during the search process to get better remote search results into the ranking priority stack
Michael Christen 13 years ago
parent 89dc04115f
commit f14faf503b
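
The functional core of this commit is in RWIProcess.takeRWI: before polling the ranking stack, the search process now sleeps a short, adaptive time while remote search results are still outstanding, so that late-arriving remote references can enter the priority stack before results are taken off it. The following is a minimal, self-contained sketch of that wait computation. The constant maxWaitPerResult and the fields initialExpectedRemoteReferences, expectedRemoteReferences and receivedRemoteReferences are taken from the diff below; the harness class RemoteWaitSketch and its methods addReceivedRemoteReferences, computeWait and main are hypothetical.

import java.util.concurrent.atomic.AtomicInteger;

public class RemoteWaitSketch {

    // from the diff: at most 30 ms per result, i.e. ~300 ms in total for 10 results
    private static final long maxWaitPerResult = 30;

    private volatile int initialExpectedRemoteReferences = 0;
    private final AtomicInteger expectedRemoteReferences = new AtomicInteger(0);
    private final AtomicInteger receivedRemoteReferences = new AtomicInteger(0);

    public void setExpectedRemoteReferences(final int expected) {
        this.initialExpectedRemoteReferences = expected;
        this.expectedRemoteReferences.set(expected);
    }

    public void decExpectedRemoteReferences(final int x) {
        this.expectedRemoteReferences.addAndGet(-x);
    }

    // hypothetical helper; in the diff this counter is incremented in add()
    public void addReceivedRemoteReferences(final int x) {
        this.receivedRemoteReferences.addAndGet(x);
    }

    // the wait taken before each poll: the full 30 ms while nothing has arrived yet
    // or while fewer references than initially expected were received, tapering
    // toward zero once the received count exceeds the initial expectation
    public long computeWait() {
        final int received = this.receivedRemoteReferences.get();
        return received == 0 ? maxWaitPerResult : Math.min(
            maxWaitPerResult,
            maxWaitPerResult * this.initialExpectedRemoteReferences / received);
    }

    public static void main(final String[] args) {
        final RemoteWaitSketch s = new RemoteWaitSketch();
        s.setExpectedRemoteReferences(100);
        System.out.println(s.computeWait()); // 30: nothing received yet
        s.addReceivedRemoteReferences(50);
        System.out.println(s.computeWait()); // 30: still below the expectation
        s.addReceivedRemoteReferences(250);
        System.out.println(s.computeWait()); // 10: expectation exceeded, wait tapers off
    }
}

Because the sleep happens once per takeRWI call and each call delivers one result, the added latency stays bounded at roughly maxWaitPerResult per result, which matches the "300 milliseconds for 10 results" figure in the comment further down in the diff.
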

@@ -75,9 +75,13 @@ import de.anomic.http.client.Cache;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
public class IndexControlRWIs_p {
public class IndexControlRWIs_p
{
public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) throws IOException {
public static serverObjects respond(
final RequestHeader header,
final serverObjects post,
final serverSwitch env) throws IOException {
// return variable that accumulates replacements
final Switchboard sb = (Switchboard) env;
final serverObjects prop = new serverObjects();
@@ -87,7 +91,8 @@ public class IndexControlRWIs_p {
prop.put("keyhash", "");
prop.put("result", "");
prop.put("cleanup", post == null || post.containsKey("maxReferencesLimit") ? 1 : 0);
prop.put("cleanup_solr", sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).getSolr() == null || !sb.getConfigBool("federated.service.solr.indexing.enabled", false) ? 0 : 1);
prop.put("cleanup_solr", sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).getSolr() == null
|| !sb.getConfigBool("federated.service.solr.indexing.enabled", false) ? 0 : 1);
String segmentName = sb.getConfig(SwitchboardConstants.SEGMENT_PUBLIC, "default");
int i = 0;
@@ -126,8 +131,20 @@ public class IndexControlRWIs_p {
// read values from checkboxes
final String[] urls = post.getAll("urlhx.*");
HandleSet urlb = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, urls.length);
if (urls != null) for (final String s: urls) try { urlb.put(s.getBytes()); } catch (final RowSpaceExceededException e) { Log.logException(e); }
HandleSet urlb =
new HandleSet(
URIMetadataRow.rowdef.primaryKeyLength,
URIMetadataRow.rowdef.objectOrder,
urls.length);
if ( urls != null ) {
for ( final String s : urls ) {
try {
urlb.put(s.getBytes());
} catch ( final RowSpaceExceededException e ) {
Log.logException(e);
}
}
}
final boolean delurl = post.containsKey("delurl");
final boolean delurlref = post.containsKey("delurlref");
@@ -157,11 +174,14 @@ public class IndexControlRWIs_p {
if ( post.get("deleteIndex", "").equals("on") ) {
segment.clear();
}
if (post.get("deleteSolr", "").equals("on") && sb.getConfigBool("federated.service.solr.indexing.enabled", false)) try {
if ( post.get("deleteSolr", "").equals("on")
&& sb.getConfigBool("federated.service.solr.indexing.enabled", false) ) {
try {
sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).getSolr().clear();
} catch ( final Exception e ) {
Log.logException(e);
}
}
if ( post.get("deleteCrawlQueues", "").equals("on") ) {
sb.crawlQueues.clear();
sb.crawlStacker.clear();
@@ -190,14 +210,25 @@ public class IndexControlRWIs_p {
}
// delete word
if (post.containsKey("keyhashdeleteall")) try {
if ( post.containsKey("keyhashdeleteall") ) {
try {
if ( delurl || delurlref ) {
// generate urlx: an array of url hashes to be deleted
ReferenceContainer<WordReference> index = null;
index = segment.termIndex().get(keyhash, null);
final Iterator<WordReference> en = index.entries();
urlb = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, index.size());
while (en.hasNext()) try { urlb.put(en.next().urlhash()); } catch (final RowSpaceExceededException e) { Log.logException(e); }
urlb =
new HandleSet(
URIMetadataRow.rowdef.primaryKeyLength,
URIMetadataRow.rowdef.objectOrder,
index.size());
while ( en.hasNext() ) {
try {
urlb.put(en.next().urlhash());
} catch ( final RowSpaceExceededException e ) {
Log.logException(e);
}
}
index = null;
}
if ( delurlref ) {
@@ -207,24 +238,40 @@ public class IndexControlRWIs_p {
segment.termIndex().delete(keyhash);
// now delete all urls if demanded
if ( delurl || delurlref ) {
for (final byte[] b: urlb) sb.urlRemove(segment, b);
for ( final byte[] b : urlb ) {
sb.urlRemove(segment, b);
}
}
post.remove("keyhashdeleteall");
post.put("urllist", "generated");
} catch ( final IOException e ) {
Log.logException(e);
}
}
// delete selected URLs
if (post.containsKey("keyhashdelete")) try {
if ( post.containsKey("keyhashdelete") ) {
try {
if ( delurlref ) {
segment.removeAllUrlReferences(urlb, sb.loader, CacheStrategy.IFEXIST);
}
if ( delurl || delurlref ) {
for (final byte[] b: urlb) sb.urlRemove(segment, b);
for ( final byte[] b : urlb ) {
sb.urlRemove(segment, b);
}
}
final HandleSet urlHashes =
new HandleSet(
URIMetadataRow.rowdef.primaryKeyLength,
URIMetadataRow.rowdef.objectOrder,
0);
for ( final byte[] b : urlb ) {
try {
urlHashes.put(b);
} catch ( final RowSpaceExceededException e ) {
Log.logException(e);
}
}
final HandleSet urlHashes = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0);
for (final byte[] b: urlb) try { urlHashes.put(b); } catch (final RowSpaceExceededException e) { Log.logException(e); }
segment.termIndex().remove(keyhash, urlHashes);
// this shall lead to a presentation of the list; so handle that the remaining program
// thinks that it was called for a list presentation
@@ -233,6 +280,7 @@ public class IndexControlRWIs_p {
} catch ( final IOException e ) {
Log.logException(e);
}
}
if ( post.containsKey("urllist") ) {
if ( keystring.length() == 0 || !ByteBuffer.equals(Word.word2hash(keystring), keyhash) ) {
@@ -245,7 +293,8 @@ public class IndexControlRWIs_p {
}
// transfer to other peer
if (post.containsKey("keyhashtransfer")) try {
if ( post.containsKey("keyhashtransfer") ) {
try {
if ( keystring.length() == 0 || !ByteBuffer.equals(Word.word2hash(keystring), keyhash) ) {
prop.put("keystring", "&lt;not possible to compute word from hash&gt;");
}
@@ -272,8 +321,13 @@ public class IndexControlRWIs_p {
index = segment.termIndex().get(keyhash, null);
// built urlCache
final Iterator<WordReference> urlIter = index.entries();
final TreeMap<byte[], URIMetadataRow> knownURLs = new TreeMap<byte[], URIMetadataRow>(Base64Order.enhancedCoder);
final HandleSet unknownURLEntries = new HandleSet(WordReferenceRow.urlEntryRow.primaryKeyLength, WordReferenceRow.urlEntryRow.objectOrder, index.size());
final TreeMap<byte[], URIMetadataRow> knownURLs =
new TreeMap<byte[], URIMetadataRow>(Base64Order.enhancedCoder);
final HandleSet unknownURLEntries =
new HandleSet(
WordReferenceRow.urlEntryRow.primaryKeyLength,
WordReferenceRow.urlEntryRow.objectOrder,
index.size());
Reference iEntry;
URIMetadataRow lurl;
while ( urlIter.hasNext() ) {
@@ -292,7 +346,11 @@ public class IndexControlRWIs_p {
}
// make an indexContainerCache
final ReferenceContainerCache<WordReference> icc = new ReferenceContainerCache<WordReference>(Segment.wordReferenceFactory, Segment.wordOrder, Word.commonHashLength);
final ReferenceContainerCache<WordReference> icc =
new ReferenceContainerCache<WordReference>(
Segment.wordReferenceFactory,
Segment.wordOrder,
Word.commonHashLength);
try {
icc.add(index);
} catch ( final RowSpaceExceededException e ) {
@@ -302,28 +360,33 @@ public class IndexControlRWIs_p {
// transport to other peer
final boolean gzipBody = sb.getConfigBool("indexControl.gzipBody", false);
final int timeout = (int) sb.getConfigLong("indexControl.timeout", 60000);
final String error = Protocol.transferIndex(
seed,
icc,
knownURLs,
gzipBody,
timeout);
prop.put("result", (error == null) ? ("Successfully transferred " + knownURLs.size() + " words in " + ((System.currentTimeMillis() - starttime) / 1000) + " seconds, " + unknownURLEntries.size() + " URL not found") : "error: " + error);
final String error = Protocol.transferIndex(seed, icc, knownURLs, gzipBody, timeout);
prop.put("result", (error == null) ? ("Successfully transferred "
+ knownURLs.size()
+ " words in "
+ ((System.currentTimeMillis() - starttime) / 1000)
+ " seconds, "
+ unknownURLEntries.size() + " URL not found") : "error: " + error);
index = null;
} catch ( final IOException e ) {
Log.logException(e);
}
}
// generate list
if (post.containsKey("keyhashsimilar")) try {
final Iterator<ReferenceContainer<WordReference>> containerIt = segment.termIndex().referenceContainer(keyhash, true, 256, false).iterator();
if ( post.containsKey("keyhashsimilar") ) {
try {
final Iterator<ReferenceContainer<WordReference>> containerIt =
segment.termIndex().referenceContainer(keyhash, true, 256, false).iterator();
ReferenceContainer<WordReference> container;
i = 0;
int rows = 0, cols = 0;
prop.put("keyhashsimilar", "1");
while ( containerIt.hasNext() && i < 256 ) {
container = containerIt.next();
prop.put("keyhashsimilar_rows_"+rows+"_cols_"+cols+"_wordHash", container.getTermHash());
prop.put(
"keyhashsimilar_rows_" + rows + "_cols_" + cols + "_wordHash",
container.getTermHash());
cols++;
if ( cols == 8 ) {
prop.put("keyhashsimilar_rows_" + rows + "_cols", cols);
@@ -338,15 +401,22 @@ public class IndexControlRWIs_p {
} catch ( final IOException e ) {
Log.logException(e);
}
}
if ( post.containsKey("blacklist") ) {
final String blacklist = post.get("blacklist", "");
final HandleSet urlHashes = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, urlb.size());
final HandleSet urlHashes =
new HandleSet(
URIMetadataRow.rowdef.primaryKeyLength,
URIMetadataRow.rowdef.objectOrder,
urlb.size());
if ( post.containsKey("blacklisturls") ) {
PrintWriter pw;
try {
final String[] supportedBlacklistTypes = env.getConfig("BlackLists.types", "").split(",");
pw = new PrintWriter(new FileWriter(new File(ListManager.listsPath, blacklist), true));
final String[] supportedBlacklistTypes =
env.getConfig("BlackLists.types", "").split(",");
pw =
new PrintWriter(new FileWriter(new File(ListManager.listsPath, blacklist), true));
DigestURI url;
for ( final byte[] b : urlb ) {
try {
@@ -360,7 +430,9 @@ public class IndexControlRWIs_p {
url = e.metadata().url();
pw.println(url.getHost() + "/" + url.getFile());
for ( final String supportedBlacklistType : supportedBlacklistTypes ) {
if (ListManager.listSetContains(supportedBlacklistType + ".BlackLists", blacklist)) {
if ( ListManager.listSetContains(
supportedBlacklistType + ".BlackLists",
blacklist) ) {
Switchboard.urlBlacklist.add(
supportedBlacklistType,
url.getHost(),
@@ -379,7 +451,8 @@ public class IndexControlRWIs_p {
PrintWriter pw;
try {
final String[] supportedBlacklistTypes = Blacklist.BLACKLIST_TYPES_STRING.split(",");
pw = new PrintWriter(new FileWriter(new File(ListManager.listsPath, blacklist), true));
pw =
new PrintWriter(new FileWriter(new File(ListManager.listsPath, blacklist), true));
DigestURI url;
for ( final byte[] b : urlb ) {
try {
@@ -393,10 +466,13 @@ public class IndexControlRWIs_p {
url = e.metadata().url();
pw.println(url.getHost() + "/.*");
for ( final String supportedBlacklistType : supportedBlacklistTypes ) {
if (ListManager.listSetContains(supportedBlacklistType + ".BlackLists", blacklist)) {
if ( ListManager.listSetContains(
supportedBlacklistType + ".BlackLists",
blacklist) ) {
Switchboard.urlBlacklist.add(
supportedBlacklistType,
url.getHost(), ".*");
url.getHost(),
".*");
}
}
}
@@ -412,20 +488,29 @@ public class IndexControlRWIs_p {
}
}
if (prop.getInt("searchresult", 0) == 3) listHosts(prop, keyhash, sb);
if ( prop.getInt("searchresult", 0) == 3 ) {
listHosts(prop, keyhash, sb);
}
}
// insert constants
prop.putNum("wcount", segment.termIndex().sizesMax());
prop.put("cleanup_maxReferencesRadioChecked", ReferenceContainer.maxReferences > 0 ? 1 : 0);
prop.put("cleanup_maxReferences", ReferenceContainer.maxReferences > 0 ? ReferenceContainer.maxReferences : 100000);
prop.put("cleanup_maxReferences", ReferenceContainer.maxReferences > 0
? ReferenceContainer.maxReferences
: 100000);
// return rewrite properties
return prop;
}
public static void genURLList(final serverObjects prop, final byte[] keyhash, final String keystring, final RWIProcess ranked, final Bitfield flags, final int maxlines) {
public static void genURLList(
final serverObjects prop,
final byte[] keyhash,
final String keystring,
final RWIProcess ranked,
final Bitfield flags,
final int maxlines) {
// search for a word hash and generate a list of url links
final String keyhashs = ASCII.String(keyhash);
prop.put("genUrlList_keyHash", keyhashs);
@@ -445,27 +530,48 @@ public class IndexControlRWIs_p {
String us;
long rn = -1;
while ( !ranked.isEmpty() && (entry = ranked.takeURL(false, 1000)) != null ) {
if ((entry == null) || (entry.metadata() == null)) continue;
if ( (entry == null) || (entry.metadata() == null) ) {
continue;
}
url = entry.metadata().url();
if (url == null) continue;
if ( url == null ) {
continue;
}
us = url.toNormalform(false, false);
if (rn == -1) rn = entry.ranking();
if ( rn == -1 ) {
rn = entry.ranking();
}
prop.put("genUrlList_urlList_" + i + "_urlExists", "1");
prop.put("genUrlList_urlList_" + i + "_urlExists_urlhxCount", i);
prop.putHTML("genUrlList_urlList_" + i + "_urlExists_urlhxValue", entry.word().urlhash());
prop.putHTML("genUrlList_urlList_" + i + "_urlExists_keyString", keystring);
prop.put("genUrlList_urlList_" + i + "_urlExists_keyHash", keyhashs);
prop.putHTML("genUrlList_urlList_" + i + "_urlExists_urlString", us);
prop.put("genUrlList_urlList_"+i+"_urlExists_urlStringShort", (us.length() > 40) ? (us.substring(0, 20) + "<br>" + us.substring(20, 40) + "...") : ((us.length() > 30) ? (us.substring(0, 20) + "<br>" + us.substring(20)) : us));
prop.put(
"genUrlList_urlList_" + i + "_urlExists_urlStringShort",
(us.length() > 40) ? (us.substring(0, 20) + "<br>" + us.substring(20, 40) + "...") : ((us
.length() > 30) ? (us.substring(0, 20) + "<br>" + us.substring(20)) : us));
prop.putNum("genUrlList_urlList_" + i + "_urlExists_ranking", (entry.ranking() - rn));
prop.putNum("genUrlList_urlList_"+i+"_urlExists_domlength", DigestURI.domLengthEstimation(entry.hash()));
prop.putNum(
"genUrlList_urlList_" + i + "_urlExists_domlength",
DigestURI.domLengthEstimation(entry.hash()));
prop.putNum("genUrlList_urlList_" + i + "_urlExists_ybr", BlockRank.ranking(entry.hash()));
prop.putNum("genUrlList_urlList_"+i+"_urlExists_tf", 1000.0 * entry.word().termFrequency());
prop.putNum("genUrlList_urlList_"+i+"_urlExists_authority", (ranked.getOrder() == null) ? -1 : ranked.getOrder().authority(ASCII.String(entry.hash(), 6, 6)));
prop.put("genUrlList_urlList_"+i+"_urlExists_date", GenericFormatter.SHORT_DAY_FORMATTER.format(new Date(entry.word().lastModified())));
prop.putNum("genUrlList_urlList_"+i+"_urlExists_wordsintitle", entry.word().wordsintitle());
prop.putNum("genUrlList_urlList_" + i + "_urlExists_tf", 1000.0 * entry
.word()
.termFrequency());
prop.putNum("genUrlList_urlList_" + i + "_urlExists_authority", (ranked.getOrder() == null)
? -1
: ranked.getOrder().authority(ASCII.String(entry.hash(), 6, 6)));
prop.put(
"genUrlList_urlList_" + i + "_urlExists_date",
GenericFormatter.SHORT_DAY_FORMATTER.format(new Date(entry.word().lastModified())));
prop.putNum("genUrlList_urlList_" + i + "_urlExists_wordsintitle", entry
.word()
.wordsintitle());
prop.putNum("genUrlList_urlList_" + i + "_urlExists_wordsintext", entry.word().wordsintext());
prop.putNum("genUrlList_urlList_"+i+"_urlExists_phrasesintext", entry.word().phrasesintext());
prop.putNum("genUrlList_urlList_" + i + "_urlExists_phrasesintext", entry
.word()
.phrasesintext());
prop.putNum("genUrlList_urlList_" + i + "_urlExists_llocal", entry.word().llocal());
prop.putNum("genUrlList_urlList_" + i + "_urlExists_lother", entry.word().lother());
prop.putNum("genUrlList_urlList_" + i + "_urlExists_hitcount", entry.word().hitcount());
@@ -476,25 +582,50 @@ public class IndexControlRWIs_p {
prop.putNum("genUrlList_urlList_" + i + "_urlExists_posinphrase", entry.word().posinphrase());
prop.putNum("genUrlList_urlList_" + i + "_urlExists_urlcomps", entry.word().urlcomps());
prop.putNum("genUrlList_urlList_" + i + "_urlExists_urllength", entry.word().urllength());
prop.put("genUrlList_urlList_"+i+"_urlExists_props",
((entry.word().flags().get(Condenser.flag_cat_indexof)) ? "appears on index page, " : "") +
((entry.word().flags().get(Condenser.flag_cat_hasimage)) ? "contains images, " : "") +
((entry.word().flags().get(Condenser.flag_cat_hasaudio)) ? "contains audio, " : "") +
((entry.word().flags().get(Condenser.flag_cat_hasvideo)) ? "contains video, " : "") +
((entry.word().flags().get(Condenser.flag_cat_hasapp)) ? "contains applications, " : "") +
((entry.word().flags().get(WordReferenceRow.flag_app_dc_identifier)) ? "appears in url, " : "") +
((entry.word().flags().get(WordReferenceRow.flag_app_dc_title)) ? "appears in title, " : "") +
((entry.word().flags().get(WordReferenceRow.flag_app_dc_creator)) ? "appears in author, " : "") +
((entry.word().flags().get(WordReferenceRow.flag_app_dc_subject)) ? "appears in subject, " : "") +
((entry.word().flags().get(WordReferenceRow.flag_app_dc_description)) ? "appears in description, " : "") +
((entry.word().flags().get(WordReferenceRow.flag_app_emphasized)) ? "appears emphasized, " : "") +
((DigestURI.probablyRootURL(entry.word().urlhash())) ? "probably root url" : "")
);
prop
.put(
"genUrlList_urlList_" + i + "_urlExists_props",
((entry.word().flags().get(Condenser.flag_cat_indexof))
? "appears on index page, "
: "")
+ ((entry.word().flags().get(Condenser.flag_cat_hasimage))
? "contains images, "
: "")
+ ((entry.word().flags().get(Condenser.flag_cat_hasaudio))
? "contains audio, "
: "")
+ ((entry.word().flags().get(Condenser.flag_cat_hasvideo))
? "contains video, "
: "")
+ ((entry.word().flags().get(Condenser.flag_cat_hasapp))
? "contains applications, "
: "")
+ ((entry.word().flags().get(WordReferenceRow.flag_app_dc_identifier))
? "appears in url, "
: "")
+ ((entry.word().flags().get(WordReferenceRow.flag_app_dc_title))
? "appears in title, "
: "")
+ ((entry.word().flags().get(WordReferenceRow.flag_app_dc_creator))
? "appears in author, "
: "")
+ ((entry.word().flags().get(WordReferenceRow.flag_app_dc_subject))
? "appears in subject, "
: "")
+ ((entry.word().flags().get(WordReferenceRow.flag_app_dc_description))
? "appears in description, "
: "")
+ ((entry.word().flags().get(WordReferenceRow.flag_app_emphasized))
? "appears emphasized, "
: "")
+ ((DigestURI.probablyRootURL(entry.word().urlhash())) ? "probably root url" : ""));
if ( Switchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_DHT, url) ) {
prop.put("genUrlList_urlList_" + i + "_urlExists_urlhxChecked", "1");
}
i++;
if ((maxlines >= 0) && (i >= maxlines)) break;
if ( (maxlines >= 0) && (i >= maxlines) ) {
break;
}
}
final Iterator<byte[]> iter = ranked.miss(); // iterates url hash strings
byte[] b;
@@ -515,28 +646,55 @@ public class IndexControlRWIs_p {
public static void putBlacklists(final serverObjects prop, final List<String> lists) {
prop.put("genUrlList_blacklists", lists.size());
int i = 0;
for (final String list : lists)
for ( final String list : lists ) {
prop.put("genUrlList_blacklists_" + i++ + "_name", list);
}
}
public static Bitfield compileFlags(final serverObjects post) {
final Bitfield b = new Bitfield(4);
if (post.get("allurl", "").equals("on")) return null;
if ( post.get("allurl", "").equals("on") ) {
return null;
}
if ( post.get("flags") != null ) {
if (post.get("flags","").length() == 0) return null;
if ( post.get("flags", "").length() == 0 ) {
return null;
}
return new Bitfield(4, post.get("flags"));
}
if (post.get("description", "").equals("on")) b.set(WordReferenceRow.flag_app_dc_description, true);
if (post.get("title", "").equals("on")) b.set(WordReferenceRow.flag_app_dc_title, true);
if (post.get("creator", "").equals("on")) b.set(WordReferenceRow.flag_app_dc_creator, true);
if (post.get("subject", "").equals("on")) b.set(WordReferenceRow.flag_app_dc_subject, true);
if (post.get("url", "").equals("on")) b.set(WordReferenceRow.flag_app_dc_identifier, true);
if (post.get("emphasized", "").equals("on")) b.set(WordReferenceRow.flag_app_emphasized, true);
if (post.get("image", "").equals("on")) b.set(Condenser.flag_cat_hasimage, true);
if (post.get("audio", "").equals("on")) b.set(Condenser.flag_cat_hasaudio, true);
if (post.get("video", "").equals("on")) b.set(Condenser.flag_cat_hasvideo, true);
if (post.get("app", "").equals("on")) b.set(Condenser.flag_cat_hasapp, true);
if (post.get("indexof", "").equals("on")) b.set(Condenser.flag_cat_indexof, true);
if ( post.get("description", "").equals("on") ) {
b.set(WordReferenceRow.flag_app_dc_description, true);
}
if ( post.get("title", "").equals("on") ) {
b.set(WordReferenceRow.flag_app_dc_title, true);
}
if ( post.get("creator", "").equals("on") ) {
b.set(WordReferenceRow.flag_app_dc_creator, true);
}
if ( post.get("subject", "").equals("on") ) {
b.set(WordReferenceRow.flag_app_dc_subject, true);
}
if ( post.get("url", "").equals("on") ) {
b.set(WordReferenceRow.flag_app_dc_identifier, true);
}
if ( post.get("emphasized", "").equals("on") ) {
b.set(WordReferenceRow.flag_app_emphasized, true);
}
if ( post.get("image", "").equals("on") ) {
b.set(Condenser.flag_cat_hasimage, true);
}
if ( post.get("audio", "").equals("on") ) {
b.set(Condenser.flag_cat_hasaudio, true);
}
if ( post.get("video", "").equals("on") ) {
b.set(Condenser.flag_cat_hasvideo, true);
}
if ( post.get("app", "").equals("on") ) {
b.set(Condenser.flag_cat_hasapp, true);
}
if ( post.get("indexof", "").equals("on") ) {
b.set(Condenser.flag_cat_indexof, true);
}
return b;
}
@@ -545,20 +703,29 @@ public class IndexControlRWIs_p {
Seed seed;
int hc = 0;
prop.put("searchresult_keyhash", startHash);
final Iterator<Seed> e = PeerSelection.getAcceptRemoteIndexSeeds(sb.peers, startHash, sb.peers.sizeConnected(), true);
final Iterator<Seed> e =
PeerSelection.getAcceptRemoteIndexSeeds(sb.peers, startHash, sb.peers.sizeConnected(), true);
while ( e.hasNext() ) {
seed = e.next();
if ( seed != null ) {
prop.put("searchresult_hosts_" + hc + "_hosthash", seed.hash);
prop.putHTML("searchresult_hosts_" + hc + "_hostname", seed.hash + " " + seed.get(Seed.NAME, "nameless"));
prop.putHTML(
"searchresult_hosts_" + hc + "_hostname",
seed.hash + " " + seed.get(Seed.NAME, "nameless"));
hc++;
}
}
prop.put("searchresult_hosts", hc);
}
public static RWIProcess genSearchresult(final serverObjects prop, final Switchboard sb, final Segment segment, final byte[] keyhash, final Bitfield filter) {
final QueryParams query = new QueryParams(ASCII.String(keyhash), -1, filter, segment, sb.getRanking(), "IndexControlRWIs_p");
public static RWIProcess genSearchresult(
final serverObjects prop,
final Switchboard sb,
final Segment segment,
final byte[] keyhash,
final Bitfield filter) {
final QueryParams query =
new QueryParams(ASCII.String(keyhash), -1, filter, segment, sb.getRanking(), "IndexControlRWIs_p");
final ReferenceOrder order = new ReferenceOrder(query.ranking, UTF8.getBytes(query.targetlang));
final RWIProcess ranked = new RWIProcess(query, order, Integer.MAX_VALUE);
ranked.run();
@@ -569,7 +736,8 @@ public class IndexControlRWIs_p {
} else {
prop.put("searchresult", 3);
prop.put("searchresult_allurl", ranked.filteredCount());
prop.put("searchresult_description", ranked.flagCount()[WordReferenceRow.flag_app_dc_description]);
prop
.put("searchresult_description", ranked.flagCount()[WordReferenceRow.flag_app_dc_description]);
prop.put("searchresult_title", ranked.flagCount()[WordReferenceRow.flag_app_dc_title]);
prop.put("searchresult_creator", ranked.flagCount()[WordReferenceRow.flag_app_dc_creator]);
prop.put("searchresult_subject", ranked.flagCount()[WordReferenceRow.flag_app_dc_subject]);

File diff suppressed because it is too large

@@ -24,7 +24,6 @@
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package net.yacy.search.index;
import java.io.File;
@@ -51,13 +50,13 @@ import net.yacy.search.ranking.RankingProfile;
import net.yacy.search.ranking.ReferenceOrder;
import net.yacy.search.snippet.ContentDomain;
/**
* convenience class to access the yacycore library from outside of yacy to put files into the index
* @author Michael Christen
*
* @author Michael Christen
*/
public class DocumentIndex extends Segment {
public class DocumentIndex extends Segment
{
private static final RankingProfile textRankingDefault = new RankingProfile(ContentDomain.TEXT);
//private Bitfield zeroConstraint = new Bitfield(4);
@@ -66,7 +65,8 @@ public class DocumentIndex extends Segment {
static {
try {
poison = new DigestURI("file://.");
} catch (final MalformedURLException e) {}
} catch ( final MalformedURLException e ) {
}
}
BlockingQueue<DigestURI> queue; // a queue of document ID's
private final Worker[] worker;
@@ -74,8 +74,8 @@ public class DocumentIndex extends Segment {
static final ThreadGroup workerThreadGroup = new ThreadGroup("workerThreadGroup");
public DocumentIndex(final File segmentPath, final CallbackListener callback, final int cachesize) throws IOException {
public DocumentIndex(final File segmentPath, final CallbackListener callback, final int cachesize)
throws IOException {
super(new Log("DocumentIndex"), segmentPath, cachesize, targetFileSize * 4 - 1, false, false);
final int cores = Runtime.getRuntime().availableProcessors() + 1;
this.callback = callback;
@@ -87,7 +87,8 @@ public class DocumentIndex extends Segment {
}
}
class Worker extends Thread {
class Worker extends Thread
{
public Worker(final int count) {
super(workerThreadGroup, "query-" + count);
}
@@ -97,7 +98,8 @@ public class DocumentIndex extends Segment {
DigestURI f;
URIMetadataRow[] resultRows;
try {
while ((f = DocumentIndex.this.queue.take()) != poison) try {
while ( (f = DocumentIndex.this.queue.take()) != poison ) {
try {
resultRows = add(f);
for ( final URIMetadataRow resultRow : resultRows ) {
if ( DocumentIndex.this.callback != null ) {
@@ -109,10 +111,14 @@ public class DocumentIndex extends Segment {
}
}
} catch ( final IOException e ) {
if (e.getMessage().indexOf("cannot parse",0) < 0) Log.logException(e);
if ( e.getMessage().indexOf("cannot parse", 0) < 0 ) {
Log.logException(e);
}
DocumentIndex.this.callback.fail(f, e.getMessage());
}
} catch (final InterruptedException e) {}
}
} catch ( final InterruptedException e ) {
}
}
}
@@ -128,9 +134,15 @@ public class DocumentIndex extends Segment {
}
private URIMetadataRow[] add(final DigestURI url) throws IOException {
if (url == null) throw new IOException("file = null");
if (url.isDirectory()) throw new IOException("file should be a document, not a path");
if (!url.canRead()) throw new IOException("cannot read file");
if ( url == null ) {
throw new IOException("file = null");
}
if ( url.isDirectory() ) {
throw new IOException("file should be a document, not a path");
}
if ( !url.canRead() ) {
throw new IOException("cannot read file");
}
Document[] documents;
long length;
try {
@@ -148,7 +160,8 @@ public class DocumentIndex extends Segment {
int c = 0;
for ( final Document document : documents ) {
final Condenser condenser = new Condenser(document, true, true, LibraryProvider.dymLib);
rows[c++] = super.storeDocument(
rows[c++] =
super.storeDocument(
url,
null,
new Date(url.lastModified()),
@ -157,15 +170,15 @@ public class DocumentIndex extends Segment {
document,
condenser,
null,
DocumentIndex.class.getName() + ".add"
);
DocumentIndex.class.getName() + ".add");
}
return rows;
}
/**
* add a file or a directory of files to the index
* If the given file is a path to a directory, the complete sub-tree is indexed
* add a file or a directory of files to the index If the given file is a path to a directory, the
* complete sub-tree is indexed
*
* @param start
*/
public void addConcurrent(final DigestURI start) throws IOException {
@@ -174,7 +187,8 @@ public class DocumentIndex extends Segment {
if ( !start.isDirectory() ) {
try {
this.queue.put(start);
} catch (final InterruptedException e) {}
} catch ( final InterruptedException e ) {
}
return;
}
final String[] s = start.list();
@@ -188,7 +202,8 @@ public class DocumentIndex extends Segment {
} else {
try {
this.queue.put(w);
} catch (final InterruptedException e) {}
} catch ( final InterruptedException e ) {
}
}
}
} catch ( final MalformedURLException e1 ) {
@@ -199,13 +214,15 @@ public class DocumentIndex extends Segment {
/**
* do a full-text search of a given string and return a specific number of results
*
* @param querystring
* @param count
* @return a list of files that contain the given string
*/
public ArrayList<DigestURI> find(final String querystring, int count) {
// make a query and start a search
final QueryParams query = new QueryParams(querystring, count, null, this, textRankingDefault, "DocumentIndex");
final QueryParams query =
new QueryParams(querystring, count, null, this, textRankingDefault, "DocumentIndex");
final ReferenceOrder order = new ReferenceOrder(query.ranking, UTF8.getBytes(query.targetlang));
final RWIProcess rankedCache = new RWIProcess(query, order, SearchEvent.max_results_preparation);
rankedCache.start();
@@ -216,38 +233,46 @@ public class DocumentIndex extends Segment {
Components metadata;
while ( (row = rankedCache.takeURL(false, 1000)) != null ) {
metadata = row.metadata();
if (metadata == null) continue;
if ( metadata == null ) {
continue;
}
files.add(metadata.url());
count--;
if (count == 0) break;
if ( count == 0 ) {
break;
}
}
return files;
}
/**
* close the index.
* This terminates all worker threads and then closes the segment.
* close the index. This terminates all worker threads and then closes the segment.
*/
@Override
public void close() {
// send termination signal to worker threads
for (@SuppressWarnings("unused") final Worker element : this.worker) {
for ( @SuppressWarnings("unused")
final Worker element : this.worker ) {
try {
this.queue.put(poison);
} catch (final InterruptedException e) {}
} catch ( final InterruptedException e ) {
}
}
// wait for termination
for ( final Worker element : this.worker ) {
try {
element.join();
} catch (final InterruptedException e) {}
} catch ( final InterruptedException e ) {
}
}
// close the segment
super.close();
}
public interface CallbackListener {
public interface CallbackListener
{
public void commit(DigestURI f, URIMetadataRow resultRow);
public void fail(DigestURI f, String failReason);
}
@@ -260,13 +285,18 @@ public class DocumentIndex extends Segment {
// DocumentIndex yacyindex add test/parsertest
// DocumentIndex yacyindex search steht
System.setProperty("java.awt.headless", "true");
if (args.length < 3) return;
if ( args.length < 3 ) {
return;
}
final File segmentPath = new File(args[0]);
System.out.println("using index files at " + segmentPath.getAbsolutePath());
final CallbackListener callback = new CallbackListener() {
@Override
public void commit(final DigestURI f, final URIMetadataRow resultRow) {
System.out.println("indexed: " + f.toString());
}
@Override
public void fail(final DigestURI f, final String failReason) {
System.out.println("not indexed " + f.toString() + ": " + failReason);
}
@@ -279,12 +309,16 @@ public class DocumentIndex extends Segment {
di.close();
} else {
String query = "";
for (int i = 2; i < args.length; i++) query += args[i];
for ( int i = 2; i < args.length; i++ ) {
query += args[i];
}
query.trim();
final DocumentIndex di = new DocumentIndex(segmentPath, callback, 100000);
final ArrayList<DigestURI> results = di.find(query, 100);
for ( final DigestURI f : results ) {
if (f != null) System.out.println(f.toString());
if ( f != null ) {
System.out.println(f.toString());
}
}
di.close();
}

@@ -65,8 +65,10 @@ import net.yacy.search.ranking.ReferenceOrder;
import net.yacy.search.snippet.ContentDomain;
import net.yacy.search.snippet.ResultEntry;
public final class RWIProcess extends Thread {
public final class RWIProcess extends Thread
{
private static final long maxWaitPerResult = 30;
private static final int maxDoubleDomAll = 1000, maxDoubleDomSpecial = 10000;
private final QueryParams query;
@@ -79,6 +81,8 @@ public final class RWIProcess extends Thread {
private int remote_resourceSize, remote_indexCount, remote_peerCount;
private int local_indexCount;
private int initialExpectedRemoteReferences;
private final AtomicInteger expectedRemoteReferences, receivedRemoteReferences;
private final WeakPriorityBlockingQueue<WordReferenceVars> stack;
private final AtomicInteger feeders;
private final ConcurrentHashMap<String, WeakPriorityBlockingQueue<WordReferenceVars>> doubleDomCache; // key = domhash (6 bytes); value = like stack
@@ -97,7 +101,6 @@ public final class RWIProcess extends Thread {
private final ScoreMap<String> protocolNavigator; // a counter for protocol types
private final ScoreMap<String> filetypeNavigator; // a counter for file types
public RWIProcess(final QueryParams query, final ReferenceOrder order, final int maxentries) {
// we collect the urlhashes and construct a list with urlEntry objects
// attention: if minEntries is too high, this method will not terminate within the maxTime
@@ -112,11 +115,15 @@ public final class RWIProcess extends Thread {
this.remote_resourceSize = 0;
this.remote_indexCount = 0;
this.local_indexCount = 0;
this.urlhashes = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 100);
this.misses = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 100);
this.urlhashes =
new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 100);
this.misses =
new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 100);
this.sortout = 0;
this.flagcount = new int[32];
for (int i = 0; i < 32; i++) {this.flagcount[i] = 0;}
for ( int i = 0; i < 32; i++ ) {
this.flagcount[i] = 0;
}
this.hostNavigator = new ConcurrentScoreMap<String>();
this.hostResolver = new ConcurrentHashMap<String, byte[]>();
this.authorNavigator = new ConcurrentScoreMap<String>();
@@ -126,6 +133,18 @@ public final class RWIProcess extends Thread {
this.ref = new ConcurrentScoreMap<String>();
this.feeders = new AtomicInteger(1);
this.startTime = System.currentTimeMillis();
this.initialExpectedRemoteReferences = 0;
this.expectedRemoteReferences = new AtomicInteger(0);
this.receivedRemoteReferences = new AtomicInteger(0);
}
public void setExpectedRemoteReferences(int expectedRemoteReferences) {
this.initialExpectedRemoteReferences = expectedRemoteReferences;
this.expectedRemoteReferences.set(expectedRemoteReferences);
}
public void decExpectedRemoteReferences(int x) {
this.expectedRemoteReferences.addAndGet(-x);
}
public QueryParams getQuery() {
@@ -144,7 +163,11 @@ public final class RWIProcess extends Thread {
// so following sortings together with the global results will be fast
try {
final long timer = System.currentTimeMillis();
final TermSearch<WordReference> search = this.query.getSegment().termIndex().query(
final TermSearch<WordReference> search =
this.query
.getSegment()
.termIndex()
.query(
this.query.queryHashes,
this.query.excludeHashes,
null,
@@ -152,7 +175,15 @@ public final class RWIProcess extends Thread {
this.query.maxDistance);
this.localSearchInclusion = search.inclusion();
final ReferenceContainer<WordReference> index = search.joined();
EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch(this.query.id(true), SearchEvent.Type.JOIN, this.query.queryString, index.size(), System.currentTimeMillis() - timer), false);
EventTracker.update(
EventTracker.EClass.SEARCH,
new ProfilingGraph.EventSearch(
this.query.id(true),
SearchEvent.Type.JOIN,
this.query.queryString,
index.size(),
System.currentTimeMillis() - timer),
false);
if ( !index.isEmpty() ) {
add(index, true, "local index: " + this.query.getSegment().getLocation(), -1, true);
}
@@ -176,7 +207,9 @@ public final class RWIProcess extends Thread {
this.addRunning = true;
assert (index != null);
if (index.isEmpty()) return;
if ( index.isEmpty() ) {
return;
}
if ( !local ) {
assert fullResource >= 0 : "fullResource = " + fullResource;
@@ -188,27 +221,42 @@ public final class RWIProcess extends Thread {
// normalize entries
final BlockingQueue<WordReferenceVars> decodedEntries = this.order.normalizeWith(index);
EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch(this.query.id(true), SearchEvent.Type.NORMALIZING, resourceName, index.size(), System.currentTimeMillis() - timer), false);
EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch(
this.query.id(true),
SearchEvent.Type.NORMALIZING,
resourceName,
index.size(),
System.currentTimeMillis() - timer), false);
this.receivedRemoteReferences.addAndGet(index.size());
// iterate over normalized entries and select some that are better than currently stored
timer = System.currentTimeMillis();
final boolean nav_hosts = this.query.navigators.equals("all") || this.query.navigators.indexOf("hosts",0) >= 0;
final boolean nav_hosts =
this.query.navigators.equals("all") || this.query.navigators.indexOf("hosts", 0) >= 0;
// apply all constraints
try {
WordReferenceVars iEntry;
final String pattern = this.query.urlMask.pattern();
final boolean httpPattern = pattern.equals("http://.*");
final boolean noHttpButProtocolPattern = pattern.equals("https://.*") || pattern.equals("ftp://.*") || pattern.equals("smb://.*") || pattern.equals("file://.*");
final boolean noHttpButProtocolPattern =
pattern.equals("https://.*")
|| pattern.equals("ftp://.*")
|| pattern.equals("smb://.*")
|| pattern.equals("file://.*");
pollloop: while ( true ) {
iEntry = decodedEntries.poll(1, TimeUnit.SECONDS);
if (iEntry == null || iEntry == WordReferenceVars.poison) break pollloop;
if ( iEntry == null || iEntry == WordReferenceVars.poison ) {
break pollloop;
}
assert (iEntry.urlhash().length == index.row().primaryKeyLength);
//if (iEntry.urlHash().length() != index.row().primaryKeyLength) continue;
// increase flag counts
for ( int j = 0; j < 32; j++ ) {
if (iEntry.flags().get(j)) {this.flagcount[j]++;}
if ( iEntry.flags().get(j) ) {
this.flagcount[j]++;
}
}
// check constraints
@@ -218,10 +266,22 @@ public final class RWIProcess extends Thread {
// check document domain
if ( this.query.contentdom != ContentDomain.TEXT ) {
if ((this.query.contentdom == ContentDomain.AUDIO) && (!(iEntry.flags().get(Condenser.flag_cat_hasaudio)))) { continue pollloop; }
if ((this.query.contentdom == ContentDomain.VIDEO) && (!(iEntry.flags().get(Condenser.flag_cat_hasvideo)))) { continue pollloop; }
if ((this.query.contentdom == ContentDomain.IMAGE) && (!(iEntry.flags().get(Condenser.flag_cat_hasimage)))) { continue pollloop; }
if ((this.query.contentdom == ContentDomain.APP ) && (!(iEntry.flags().get(Condenser.flag_cat_hasapp )))) { continue pollloop; }
if ( (this.query.contentdom == ContentDomain.AUDIO)
&& (!(iEntry.flags().get(Condenser.flag_cat_hasaudio))) ) {
continue pollloop;
}
if ( (this.query.contentdom == ContentDomain.VIDEO)
&& (!(iEntry.flags().get(Condenser.flag_cat_hasvideo))) ) {
continue pollloop;
}
if ( (this.query.contentdom == ContentDomain.IMAGE)
&& (!(iEntry.flags().get(Condenser.flag_cat_hasimage))) ) {
continue pollloop;
}
if ( (this.query.contentdom == ContentDomain.APP)
&& (!(iEntry.flags().get(Condenser.flag_cat_hasapp))) ) {
continue pollloop;
}
}
// check tld domain
@@ -254,8 +314,12 @@ public final class RWIProcess extends Thread {
// check protocol
if ( !this.query.urlMask_isCatchall ) {
final boolean httpFlagSet = DigestURI.flag4HTTPset(iEntry.urlHash);
if (httpPattern && !httpFlagSet) continue pollloop;
if (noHttpButProtocolPattern && httpFlagSet) continue pollloop;
if ( httpPattern && !httpFlagSet ) {
continue pollloop;
}
if ( noHttpButProtocolPattern && httpFlagSet ) {
continue pollloop;
}
}
// finally make a double-check and insert result to stack
@ -264,7 +328,8 @@ public final class RWIProcess extends Thread {
this.urlhashes.putUnique(iEntry.urlhash());
rankingtryloop: while ( true ) {
try {
this.stack.put(new ReverseElement<WordReferenceVars>(iEntry, this.order.cardinal(iEntry))); // inserts the element and removes the worst (which is smallest)
this.stack.put(new ReverseElement<WordReferenceVars>(iEntry, this.order
.cardinal(iEntry))); // inserts the element and removes the worst (which is smallest)
break rankingtryloop;
} catch ( final ArithmeticException e ) {
// this may happen if the concurrent normalizer changes values during cardinal computation
@@ -272,16 +337,29 @@ public final class RWIProcess extends Thread {
}
}
// increase counter for statistics
if (local) this.local_indexCount++; else this.remote_indexCount++;
if ( local ) {
this.local_indexCount++;
} else {
this.remote_indexCount++;
//}
}
}
} catch (final InterruptedException e) {} catch (final RowSpaceExceededException e) {} finally {
if (finalizeAddAtEnd) this.addRunning = false;
} catch ( final InterruptedException e ) {
} catch ( final RowSpaceExceededException e ) {
} finally {
if ( finalizeAddAtEnd ) {
this.addRunning = false;
}
}
//if ((query.neededResults() > 0) && (container.size() > query.neededResults())) remove(true, true);
EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch(this.query.id(true), SearchEvent.Type.PRESORT, resourceName, index.size(), System.currentTimeMillis() - timer), false);
EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch(
this.query.id(true),
SearchEvent.Type.PRESORT,
resourceName,
index.size(),
System.currentTimeMillis() - timer), false);
}
/**
@@ -301,18 +379,24 @@ public final class RWIProcess extends Thread {
}
private boolean testFlags(final WordReference ientry) {
if (this.query.constraint == null) return true;
if ( this.query.constraint == null ) {
return true;
}
// test if ientry matches with filter
// if all = true: let only entries pass that has all matching bits
// if all = false: let all entries pass that has at least one matching bit
if ( this.query.allofconstraint ) {
for ( int i = 0; i < 32; i++ ) {
if ((this.query.constraint.get(i)) && (!ientry.flags().get(i))) return false;
if ( (this.query.constraint.get(i)) && (!ientry.flags().get(i)) ) {
return false;
}
}
return true;
}
for ( int i = 0; i < 32; i++ ) {
if ((this.query.constraint.get(i)) && (ientry.flags().get(i))) return true;
if ( (this.query.constraint.get(i)) && (ientry.flags().get(i)) ) {
return true;
}
}
return false;
}
@@ -323,7 +407,9 @@ public final class RWIProcess extends Thread {
return this.localSearchInclusion;
}
private WeakPriorityBlockingQueue.Element<WordReferenceVars> takeRWI(final boolean skipDoubleDom, final long waitingtime) {
private WeakPriorityBlockingQueue.Element<WordReferenceVars> takeRWI(
final boolean skipDoubleDom,
final long waitingtime) {
// returns from the current RWI list the best entry and removes this entry from the list
WeakPriorityBlockingQueue<WordReferenceVars> m;
@@ -334,16 +420,36 @@ public final class RWIProcess extends Thread {
//System.out.println("stack.poll: feeders = " + this.feeders + ", stack.sizeQueue = " + stack.sizeQueue());
int loops = 0; // a loop counter to terminate the reading if all the results are from the same domain
final long timeout = System.currentTimeMillis() + waitingtime;
while (((!feedingIsFinished() && this.addRunning) || this.stack.sizeQueue() > 0) &&
(this.query.itemsPerPage < 1 || loops++ < this.query.itemsPerPage)) {
// wait some time if we did not get many remote results so far, to get a better ranking over remote results
// we wait at most 30 milliseconds per result, for a maximum total waiting time of 300 milliseconds for 10 results
long wait =
this.receivedRemoteReferences.get() == 0 ? maxWaitPerResult : Math.min(
maxWaitPerResult,
maxWaitPerResult
* this.initialExpectedRemoteReferences
/ this.receivedRemoteReferences.get());
if ( wait > 0 ) {
Thread.sleep(wait);
}
// loop as long as we can expect that we should get more results
while ( ((!feedingIsFinished() && this.addRunning) || this.stack.sizeQueue() > 0)
&& (this.query.itemsPerPage < 1 || loops++ < this.query.itemsPerPage) ) {
if ( waitingtime <= 0 ) {
rwi = this.stack.poll();
} else timeoutloop:while (System.currentTimeMillis() < timeout) {
if (feedingIsFinished() && this.stack.sizeQueue() == 0) break timeoutloop;
} else {
timeoutloop: while ( System.currentTimeMillis() < timeout ) {
if ( feedingIsFinished() && this.stack.sizeQueue() == 0 ) {
break timeoutloop;
}
rwi = this.stack.poll(50);
if (rwi != null) break timeoutloop;
if ( rwi != null ) {
break timeoutloop;
}
}
}
if ( rwi == null ) {
break;
}
if (rwi == null) break;
if ( !skipDoubleDom ) {
//System.out.println("!skipDoubleDom");
return rwi;
@@ -355,7 +461,10 @@ public final class RWIProcess extends Thread {
m = this.doubleDomCache.get(hosthash);
if ( m == null ) {
// first appearance of dom. we create an entry to signal that one of that domain was already returned
m = new WeakPriorityBlockingQueue<WordReferenceVars>((this.query.specialRights) ? maxDoubleDomSpecial : maxDoubleDomAll);
m =
new WeakPriorityBlockingQueue<WordReferenceVars>((this.query.specialRights)
? maxDoubleDomSpecial
: maxDoubleDomAll);
this.doubleDomCache.put(hosthash, m);
return rwi;
}
@@ -363,15 +472,19 @@ public final class RWIProcess extends Thread {
m.put(rwi);
}
}
} catch (final InterruptedException e1) {}
if (this.doubleDomCache.isEmpty()) return null;
} catch ( final InterruptedException e1 ) {
}
if ( this.doubleDomCache.isEmpty() ) {
return null;
}
// no more entries in sorted RWI entries. Now take Elements from the doubleDomCache
// find best entry from all caches
WeakPriorityBlockingQueue.Element<WordReferenceVars> bestEntry = null;
WeakPriorityBlockingQueue.Element<WordReferenceVars> o;
synchronized ( this.doubleDomCache ) {
final Iterator<WeakPriorityBlockingQueue<WordReferenceVars>> i = this.doubleDomCache.values().iterator();
final Iterator<WeakPriorityBlockingQueue<WordReferenceVars>> i =
this.doubleDomCache.values().iterator();
while ( i.hasNext() ) {
try {
m = i.next();
@@ -379,19 +492,27 @@ public final class RWIProcess extends Thread {
Log.logException(e);
continue; // not the best solution...
}
if (m == null) continue;
if (m.isEmpty()) continue;
if ( m == null ) {
continue;
}
if ( m.isEmpty() ) {
continue;
}
if ( bestEntry == null ) {
bestEntry = m.peek();
continue;
}
o = m.peek();
if (o == null) continue;
if ( o == null ) {
continue;
}
if ( o.getWeight() < bestEntry.getWeight() ) {
bestEntry = o;
}
}
if (bestEntry == null) return null;
if ( bestEntry == null ) {
return null;
}
// finally remove the best entry from the doubledom cache
m = this.doubleDomCache.get(bestEntry.getElement().hosthash());
@@ -401,10 +522,11 @@ public final class RWIProcess extends Thread {
}
/**
* get one metadata entry from the ranked results. This will be the 'best' entry so far
* according to the applied ranking. If there are no more entries left or the timeout
* limit is reached then null is returned. The caller may distinguish the timeout case
* from the case where there will be no more also in the future by calling this.feedingIsFinished()
* get one metadata entry from the ranked results. This will be the 'best' entry so far according to the
* applied ranking. If there are no more entries left or the timeout limit is reached then null is
* returned. The caller may distinguish the timeout case from the case where there will be no more also in
* the future by calling this.feedingIsFinished()
*
* @param skipDoubleDom should be true if it is wanted that double domain entries are skipped
* @param waitingtime the time this method may take for a result computation
* @return a metadata entry for a url
@@ -416,8 +538,11 @@ public final class RWIProcess extends Thread {
long timeleft;
while ( (timeleft = timeout - System.currentTimeMillis()) > 0 ) {
//System.out.println("timeleft = " + timeleft);
final WeakPriorityBlockingQueue.Element<WordReferenceVars> obrwi = takeRWI(skipDoubleDom, timeleft);
if (obrwi == null) return null; // all time was already wasted in takeRWI to get another element
final WeakPriorityBlockingQueue.Element<WordReferenceVars> obrwi =
takeRWI(skipDoubleDom, timeleft);
if ( obrwi == null ) {
return null; // all time was already wasted in takeRWI to get another element
}
final URIMetadataRow page = this.query.getSegment().urlMetadata().load(obrwi);
if ( page == null ) {
try {
@@ -462,17 +587,17 @@ public final class RWIProcess extends Thread {
final String pagetitle = metadata.dc_title().toLowerCase();
// check exclusion
if ((QueryParams.anymatch(pagetitle, this.query.excludeHashes)) ||
(QueryParams.anymatch(pageurl.toLowerCase(), this.query.excludeHashes)) ||
(QueryParams.anymatch(pageauthor.toLowerCase(), this.query.excludeHashes))) {
if ( (QueryParams.anymatch(pagetitle, this.query.excludeHashes))
|| (QueryParams.anymatch(pageurl.toLowerCase(), this.query.excludeHashes))
|| (QueryParams.anymatch(pageauthor.toLowerCase(), this.query.excludeHashes)) ) {
this.sortout++;
continue;
}
// check index-of constraint
if ((this.query.constraint != null) &&
(this.query.constraint.get(Condenser.flag_cat_indexof)) &&
(!(pagetitle.startsWith("index of")))) {
if ( (this.query.constraint != null)
&& (this.query.constraint.get(Condenser.flag_cat_indexof))
&& (!(pagetitle.startsWith("index of"))) ) {
final Iterator<byte[]> wi = this.query.queryHashes.iterator();
while ( wi.hasNext() ) {
this.query.getSegment().termIndex().removeDelayed(wi.next(), page.hash());
@@ -482,18 +607,18 @@ public final class RWIProcess extends Thread {
}
// check location constraint
if ((this.query.constraint != null) &&
(this.query.constraint.get(Condenser.flag_cat_haslocation)) &&
(metadata.lat() == 0.0f || metadata.lon() == 0.0f)) {
if ( (this.query.constraint != null)
&& (this.query.constraint.get(Condenser.flag_cat_haslocation))
&& (metadata.lat() == 0.0f || metadata.lon() == 0.0f) ) {
this.sortout++;
continue;
}
// check content domain
if ((this.query.contentdom == ContentDomain.AUDIO && page.laudio() == 0) ||
(this.query.contentdom == ContentDomain.VIDEO && page.lvideo() == 0) ||
(this.query.contentdom == ContentDomain.IMAGE && page.limage() == 0) ||
(this.query.contentdom == ContentDomain.APP && page.lapp() == 0)) {
if ( (this.query.contentdom == ContentDomain.AUDIO && page.laudio() == 0)
|| (this.query.contentdom == ContentDomain.VIDEO && page.lvideo() == 0)
|| (this.query.contentdom == ContentDomain.IMAGE && page.limage() == 0)
|| (this.query.contentdom == ContentDomain.APP && page.lapp() == 0) ) {
this.sortout++;
continue;
}
@@ -534,7 +659,9 @@ public final class RWIProcess extends Thread {
// file type navigation
final String fileext = metadata.url().getFileExtension();
if (fileext.length() > 0) this.filetypeNavigator.inc(fileext);
if ( fileext.length() > 0 ) {
this.filetypeNavigator.inc(fileext);
}
// check Scanner
if ( !Scanner.acceptURL(metadata.url()) ) {
@ -565,9 +692,13 @@ public final class RWIProcess extends Thread {
}
public boolean isEmpty() {
if (!this.stack.isEmpty()) return false;
if ( !this.stack.isEmpty() ) {
return false;
}
for ( final WeakPriorityBlockingQueue<WordReferenceVars> s : this.doubleDomCache.values() ) {
if (!s.isEmpty()) return false;
if ( !s.isEmpty() ) {
return false;
}
}
return true;
}
@@ -616,22 +747,31 @@ public final class RWIProcess extends Thread {
}
public ScoreMap<String> getNamespaceNavigator() {
if (!this.query.navigators.equals("all") && this.query.navigators.indexOf("namespace",0) < 0) return new ClusteredScoreMap<String>();
if (this.namespaceNavigator.sizeSmaller(2)) this.namespaceNavigator.clear(); // navigators with one entry are not useful
if ( !this.query.navigators.equals("all") && this.query.navigators.indexOf("namespace", 0) < 0 ) {
return new ClusteredScoreMap<String>();
}
if ( this.namespaceNavigator.sizeSmaller(2) ) {
this.namespaceNavigator.clear(); // navigators with one entry are not useful
}
return this.namespaceNavigator;
}
public ScoreMap<String> getHostNavigator() {
final ScoreMap<String> result = new ConcurrentScoreMap<String>();
if (!this.query.navigators.equals("all") && this.query.navigators.indexOf("hosts",0) < 0) return result;
if ( !this.query.navigators.equals("all") && this.query.navigators.indexOf("hosts", 0) < 0 ) {
return result;
}
final Iterator<String> domhashs = this.hostNavigator.keys(false);
URIMetadataRow row;
byte[] urlhash;
String hosthash, hostname;
if (this.hostResolver != null) while (domhashs.hasNext() && result.sizeSmaller(30)) {
if ( this.hostResolver != null ) {
while ( domhashs.hasNext() && result.sizeSmaller(30) ) {
hosthash = domhashs.next();
if (hosthash == null) continue;
if ( hosthash == null ) {
continue;
}
urlhash = this.hostResolver.get(hosthash);
row = urlhash == null ? null : this.query.getSegment().urlMetadata().load(urlhash);
hostname = row == null ? null : row.metadata().url().getHost();
@@ -639,26 +779,43 @@ public final class RWIProcess extends Thread {
result.set(hostname, this.hostNavigator.get(hosthash));
}
}
if (result.sizeSmaller(2)) result.clear(); // navigators with one entry are not useful
}
if ( result.sizeSmaller(2) ) {
result.clear(); // navigators with one entry are not useful
}
return result;
}
public ScoreMap<String> getProtocolNavigator() {
if (!this.query.navigators.equals("all") && this.query.navigators.indexOf("protocol",0) < 0) return new ClusteredScoreMap<String>();
if (this.protocolNavigator.sizeSmaller(2)) this.protocolNavigator.clear(); // navigators with one entry are not useful
if ( !this.query.navigators.equals("all") && this.query.navigators.indexOf("protocol", 0) < 0 ) {
return new ClusteredScoreMap<String>();
}
if ( this.protocolNavigator.sizeSmaller(2) ) {
this.protocolNavigator.clear(); // navigators with one entry are not useful
}
return this.protocolNavigator;
}
public ScoreMap<String> getFiletypeNavigator() {
if (!this.query.navigators.equals("all") && this.query.navigators.indexOf("filetype",0) < 0) return new ClusteredScoreMap<String>();
if (this.filetypeNavigator.sizeSmaller(2)) this.filetypeNavigator.clear(); // navigators with one entry are not useful
if ( !this.query.navigators.equals("all") && this.query.navigators.indexOf("filetype", 0) < 0 ) {
return new ClusteredScoreMap<String>();
}
if ( this.filetypeNavigator.sizeSmaller(2) ) {
this.filetypeNavigator.clear(); // navigators with one entry are not useful
}
return this.filetypeNavigator;
}
public static final Comparator<Map.Entry<String, Integer>> mecomp = new Comparator<Map.Entry<String, Integer>>() {
public static final Comparator<Map.Entry<String, Integer>> mecomp =
new Comparator<Map.Entry<String, Integer>>() {
@Override
public int compare(final Map.Entry<String, Integer> o1, final Map.Entry<String, Integer> o2) {
if (o1.getValue().intValue() < o2.getValue().intValue()) return 1;
if (o2.getValue().intValue() < o1.getValue().intValue()) return -1;
if ( o1.getValue().intValue() < o2.getValue().intValue() ) {
return 1;
}
if ( o2.getValue().intValue() < o1.getValue().intValue() ) {
return -1;
}
return 0;
}
};
@@ -667,8 +824,12 @@ public final class RWIProcess extends Thread {
// create a list of words that had been computed by statistics over all
// words that appeared in the url or the description of all urls
final ScoreMap<String> result = new ConcurrentScoreMap<String>();
if (!this.query.navigators.equals("all") && this.query.navigators.indexOf("topics",0) < 0) return result;
if (this.ref.sizeSmaller(2)) this.ref.clear(); // navigators with one entry are not useful
if ( !this.query.navigators.equals("all") && this.query.navigators.indexOf("topics", 0) < 0 ) {
return result;
}
if ( this.ref.sizeSmaller(2) ) {
this.ref.clear(); // navigators with one entry are not useful
}
final Map<String, Float> counts = new HashMap<String, Float>();
final Iterator<String> i = this.ref.keys(false);
String word;
@ -678,7 +839,9 @@ public final class RWIProcess extends Thread {
int ic = count;
while ( ic-- > 0 && i.hasNext() ) {
word = i.next();
if (word == null) continue;
if ( word == null ) {
continue;
}
termHash = Word.word2hash(word);
c = this.query.getSegment().termIndex().count(termHash);
if ( c > 0 ) {
@ -688,9 +851,11 @@ public final class RWIProcess extends Thread {
counts.put(word, q);
}
}
if (max > min) for (final Map.Entry<String, Float> ce: counts.entrySet()) {
if ( max > min ) {
for ( final Map.Entry<String, Float> ce : counts.entrySet() ) {
result.set(ce.getKey(), (int) (((double) count) * (ce.getValue() - min) / (max - min)));
}
}
return this.ref;
}
@@ -700,12 +865,13 @@ public final class RWIProcess extends Thread {
String word;
for ( final String w : words ) {
word = w.toLowerCase();
if (word.length() > 2 &&
"http_html_php_ftp_www_com_org_net_gov_edu_index_home_page_for_usage_the_and_zum_der_die_das_und_the_zur_bzw_mit_blog_wiki_aus_bei_off".indexOf(word) < 0 &&
!this.query.queryHashes.has(Word.word2hash(word)) &&
lettermatch.matcher(word).matches() &&
!Switchboard.badwords.contains(word) &&
!Switchboard.stopwords.contains(word)) {
if ( word.length() > 2
&& "http_html_php_ftp_www_com_org_net_gov_edu_index_home_page_for_usage_the_and_zum_der_die_das_und_the_zur_bzw_mit_blog_wiki_aus_bei_off"
.indexOf(word) < 0
&& !this.query.queryHashes.has(Word.word2hash(word))
&& lettermatch.matcher(word).matches()
&& !Switchboard.badwords.contains(word)
&& !Switchboard.stopwords.contains(word) ) {
this.ref.inc(word);
}
}
@ -713,7 +879,9 @@ public final class RWIProcess extends Thread {
public void addTopics(final ResultEntry resultEntry) {
// take out relevant information for reference computation
if ((resultEntry.url() == null) || (resultEntry.title() == null)) return;
if ( (resultEntry.url() == null) || (resultEntry.title() == null) ) {
return;
}
//final String[] urlcomps = htmlFilterContentScraper.urlComps(resultEntry.url().toNormalform(true, true)); // word components of the url
final String[] descrcomps = MultiProtocolURI.splitpattern.split(resultEntry.title().toLowerCase()); // words in the description
@ -725,8 +893,12 @@ public final class RWIProcess extends Thread {
public ScoreMap<String> getAuthorNavigator() {
// create a list of words that had been computed by statistics over all
// words that appeared in the url or the description of all urls
if (!this.query.navigators.equals("all") && this.query.navigators.indexOf("authors",0) < 0) return new ConcurrentScoreMap<String>();
if (this.authorNavigator.sizeSmaller(2)) this.authorNavigator.clear(); // navigators with one entry are not useful
if ( !this.query.navigators.equals("all") && this.query.navigators.indexOf("authors", 0) < 0 ) {
return new ConcurrentScoreMap<String>();
}
if ( this.authorNavigator.sizeSmaller(2) ) {
this.authorNavigator.clear(); // navigators with one entry are not useful
}
return this.authorNavigator;
}
@ -58,11 +58,25 @@ import net.yacy.search.ranking.ReferenceOrder;
import net.yacy.search.snippet.ResultEntry;
import de.anomic.data.WorkTables;
public final class SearchEvent {
public final class SearchEvent
{
public enum Type {
INITIALIZATION, COLLECTION, JOIN, PRESORT, URLFETCH, NORMALIZING, FINALIZATION,
REMOTESEARCH_START, REMOTESEARCH_TERMINATE, ABSTRACTS, CLEANUP, SNIPPETFETCH_START, ONERESULT, REFERENCECOLLECTION, RESULTLIST;
INITIALIZATION,
COLLECTION,
JOIN,
PRESORT,
URLFETCH,
NORMALIZING,
FINALIZATION,
REMOTESEARCH_START,
REMOTESEARCH_TERMINATE,
ABSTRACTS,
CLEANUP,
SNIPPETFETCH_START,
ONERESULT,
REFERENCECOLLECTION,
RESULTLIST;
}
public static final int max_results_preparation = 3000;
@ -87,7 +101,8 @@ public final class SearchEvent {
private byte[] IAmaxcounthash, IAneardhthash;
private final ReferenceOrder order;
protected SearchEvent(final QueryParams query,
protected SearchEvent(
final QueryParams query,
final SeedDB peers,
final WorkTables workTables,
final SortedMap<byte[], String> preselectedPeerHashes,
@ -98,13 +113,18 @@ public final class SearchEvent {
final int burstRobinsonPercent,
final int burstMultiwordPercent,
final boolean deleteIfSnippetFail) {
if (MemoryControl.available() < 1024 * 1024 * 100) SearchEventCache.cleanupEvents(true);
if ( MemoryControl.available() < 1024 * 1024 * 100 ) {
SearchEventCache.cleanupEvents(true);
}
this.eventTime = System.currentTimeMillis(); // for lifetime check
this.peers = peers;
this.workTables = workTables;
this.query = query;
this.secondarySearchSuperviser = (this.query.queryHashes.size() > 1) ? new SecondarySearchSuperviser() : null; // generate abstracts only for combined searches
if (this.secondarySearchSuperviser != null) this.secondarySearchSuperviser.start();
this.secondarySearchSuperviser =
(this.query.queryHashes.size() > 1) ? new SecondarySearchSuperviser() : null; // generate abstracts only for combined searches
if ( this.secondarySearchSuperviser != null ) {
this.secondarySearchSuperviser.start();
}
this.primarySearchThreads = null;
this.secondarySearchThreads = null;
this.preselectedPeerHashes = preselectedPeerHashes;
@ -115,7 +135,11 @@ public final class SearchEvent {
this.IAneardhthash = null;
this.localSearchThread = null;
this.order = new ReferenceOrder(this.query.ranking, UTF8.getBytes(this.query.targetlang));
final boolean remote = (peers != null && peers.sizeConnected() > 0) && (this.query.domType == QueryParams.Searchdom.CLUSTER || (this.query.domType == QueryParams.Searchdom.GLOBAL && peers.mySeed().getFlagAcceptRemoteIndex()));
final boolean remote =
(peers != null && peers.sizeConnected() > 0)
&& (this.query.domType == QueryParams.Searchdom.CLUSTER || (this.query.domType == QueryParams.Searchdom.GLOBAL && peers
.mySeed()
.getFlagAcceptRemoteIndex()));
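The remote flag reduces to: at least one peer is connected, and either a cluster search was requested or a global search is running on a peer that accepts remote index requests. As a detached predicate (hypothetical helper, not YaCy API):
    // sketch: the remote-search precondition in isolation
    static boolean remoteAllowed(final int connectedPeers, final boolean cluster,
            final boolean global, final boolean acceptRemoteIndex) {
        return connectedPeers > 0 && (cluster || (global && acceptRemoteIndex));
    }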
final long start = System.currentTimeMillis();
// initialize a ranking process that is the target for data
@ -128,7 +152,8 @@ public final class SearchEvent {
if ( remote ) {
// start global searches
final long timer = System.currentTimeMillis();
this.primarySearchThreads = (this.query.queryHashes.isEmpty()) ? null : RemoteSearch.primaryRemoteSearches(
this.primarySearchThreads =
(this.query.queryHashes.isEmpty()) ? null : RemoteSearch.primaryRemoteSearches(
QueryParams.hashSet2hashString(this.query.queryHashes),
QueryParams.hashSet2hashString(this.query.excludeHashes),
this.query.prefer,
@ -152,11 +177,29 @@ public final class SearchEvent {
burstRobinsonPercent,
burstMultiwordPercent);
if ( this.primarySearchThreads != null ) {
Log.logFine("SEARCH_EVENT", "STARTING " + this.primarySearchThreads.length + " THREADS TO CATCH EACH " + remote_maxcount + " URLs");
Log.logFine("SEARCH_EVENT", "STARTING "
+ this.primarySearchThreads.length
+ " THREADS TO CATCH EACH "
+ remote_maxcount
+ " URLs");
this.rankingProcess.moreFeeders(this.primarySearchThreads.length);
EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch(this.query.id(true), Type.REMOTESEARCH_START, "", this.primarySearchThreads.length, System.currentTimeMillis() - timer), false);
this.rankingProcess.setExpectedRemoteReferences(this.primarySearchThreads.length
* remote_maxcount);
EventTracker.update(
EventTracker.EClass.SEARCH,
new ProfilingGraph.EventSearch(
this.query.id(true),
Type.REMOTESEARCH_START,
"",
this.primarySearchThreads.length,
System.currentTimeMillis() - timer),
false);
// finished searching
Log.logFine("SEARCH_EVENT", "SEARCH TIME AFTER GLOBAL-TRIGGER TO " + this.primarySearchThreads.length + " PEERS: " + ((System.currentTimeMillis() - start) / 1000) + " seconds");
Log.logFine("SEARCH_EVENT", "SEARCH TIME AFTER GLOBAL-TRIGGER TO "
+ this.primarySearchThreads.length
+ " PEERS: "
+ ((System.currentTimeMillis() - start) / 1000)
+ " seconds");
} else {
// no search since query is empty, user might have entered no data or filters have removed all search words
Log.logFine("SEARCH_EVENT", "NO SEARCH STARTED DUE TO EMPTY SEARCH REQUEST.");
@ -174,10 +217,15 @@ public final class SearchEvent {
long mindhtdistance = Long.MAX_VALUE, l;
byte[] wordhash;
assert this.rankingProcess.searchContainerMap() != null;
for (final Map.Entry<byte[], ReferenceContainer<WordReference>> entry : this.rankingProcess.searchContainerMap().entrySet()) {
for ( final Map.Entry<byte[], ReferenceContainer<WordReference>> entry : this.rankingProcess
.searchContainerMap()
.entrySet() ) {
wordhash = entry.getKey();
final ReferenceContainer<WordReference> container = entry.getValue();
assert (Base64Order.enhancedCoder.equal(container.getTermHash(), wordhash)) : "container.getTermHash() = " + ASCII.String(container.getTermHash()) + ", wordhash = " + ASCII.String(wordhash);
assert (Base64Order.enhancedCoder.equal(container.getTermHash(), wordhash)) : "container.getTermHash() = "
+ ASCII.String(container.getTermHash())
+ ", wordhash = "
+ ASCII.String(wordhash);
if ( container.size() > maxcount ) {
this.IAmaxcounthash = wordhash;
maxcount = container.size();
@ -189,9 +237,19 @@ public final class SearchEvent {
this.IAneardhthash = wordhash;
}
this.IACount.put(wordhash, LargeNumberCache.valueOf(container.size()));
this.IAResults.put(wordhash, WordReferenceFactory.compressIndex(container, null, 1000).toString());
}
EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch(this.query.id(true), Type.ABSTRACTS, "", this.rankingProcess.searchContainerMap().size(), System.currentTimeMillis() - timer), false);
this.IAResults.put(wordhash, WordReferenceFactory
.compressIndex(container, null, 1000)
.toString());
}
EventTracker.update(
EventTracker.EClass.SEARCH,
new ProfilingGraph.EventSearch(
this.query.id(true),
Type.ABSTRACTS,
"",
this.rankingProcess.searchContainerMap().size(),
System.currentTimeMillis() - timer),
false);
} else {
// give process time to accumulate a certain amount of data
// before a reading process wants to get results from it
@ -205,18 +263,32 @@ public final class SearchEvent {
}
// start worker threads to fetch urls and snippets
this.resultFetcher = new SnippetProcess(loader, this.rankingProcess, this.query, this.peers, this.workTables, 5000, deleteIfSnippetFail);
this.resultFetcher =
new SnippetProcess(
loader,
this.rankingProcess,
this.query,
this.peers,
this.workTables,
5000,
deleteIfSnippetFail);
// clean up events
SearchEventCache.cleanupEvents(false);
EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch(this.query.id(true), Type.CLEANUP, "", 0, 0), false);
EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch(
this.query.id(true),
Type.CLEANUP,
"",
0,
0), false);
// store this search to a cache so it can be re-used
if (MemoryControl.available() < 1024 * 1024 * 100) SearchEventCache.cleanupEvents(true);
if ( MemoryControl.available() < 1024 * 1024 * 100 ) {
SearchEventCache.cleanupEvents(true);
}
SearchEventCache.put(this.query.id(false), this);
}
public ReferenceOrder getOrder() {
return this.order;
}
@ -244,15 +316,23 @@ public final class SearchEvent {
// stop all threads
if ( this.primarySearchThreads != null ) {
for ( final RemoteSearch search : this.primarySearchThreads ) {
if (search != null) synchronized (search) {
if (search.isAlive()) search.interrupt();
if ( search != null ) {
synchronized ( search ) {
if ( search.isAlive() ) {
search.interrupt();
}
}
}
}
}
if ( this.secondarySearchThreads != null ) {
for ( final RemoteSearch search : this.secondarySearchThreads ) {
if (search != null) synchronized (search) {
if (search.isAlive()) search.interrupt();
if ( search != null ) {
synchronized ( search ) {
if ( search.isAlive() ) {
search.interrupt();
}
}
}
}
}
@ -265,16 +345,30 @@ public final class SearchEvent {
// the interrupt may occur during a MD5 computation which is resistant against interruption
// therefore set some more interrupts on the process
int ic = 10;
while (ic-- > 0 & w.isAlive()) w.interrupt();
while ( ic-- > 0 & w.isAlive() ) {
w.interrupt();
}
}
}
// clear all data structures
if (this.preselectedPeerHashes != null) this.preselectedPeerHashes.clear();
if (this.localSearchThread != null) if (this.localSearchThread.isAlive()) this.localSearchThread.interrupt();
if (this.IACount != null) this.IACount.clear();
if (this.IAResults != null) this.IAResults.clear();
if (this.heuristics != null) this.heuristics.clear();
if ( this.preselectedPeerHashes != null ) {
this.preselectedPeerHashes.clear();
}
if ( this.localSearchThread != null ) {
if ( this.localSearchThread.isAlive() ) {
this.localSearchThread.interrupt();
}
}
if ( this.IACount != null ) {
this.IACount.clear();
}
if ( this.IAResults != null ) {
this.IAResults.clear();
}
if ( this.heuristics != null ) {
this.heuristics.clear();
}
}
public Iterator<Map.Entry<byte[], String>> abstractsString() {
@ -291,7 +385,9 @@ public final class SearchEvent {
public int abstractsCount(final byte[] hash) {
final Integer i = this.IACount.get(hash);
if (i == null) return -1;
if ( i == null ) {
return -1;
}
return i.intValue();
}
@ -307,13 +403,17 @@ public final class SearchEvent {
// check primary search threads
if ( (this.primarySearchThreads != null) && (this.primarySearchThreads.length != 0) ) {
for ( final RemoteSearch primarySearchThread : this.primarySearchThreads ) {
if ((primarySearchThread != null) && (primarySearchThread.isAlive())) return true;
if ( (primarySearchThread != null) && (primarySearchThread.isAlive()) ) {
return true;
}
}
}
// maybe a secondary search thread is alive, check this
if ( (this.secondarySearchThreads != null) && (this.secondarySearchThreads.length != 0) ) {
for ( final RemoteSearch secondarySearchThread : this.secondarySearchThreads ) {
if ((secondarySearchThread != null) && (secondarySearchThread.isAlive())) return true;
if ( (secondarySearchThread != null) && (secondarySearchThread.isAlive()) ) {
return true;
}
}
}
return false;
@ -386,10 +486,16 @@ public final class SearchEvent {
//boolean secondarySearchStartet = false;
public static class HeuristicResult /*implements Comparable<HeuristicResult>*/ {
private final byte[] urlhash; public final String heuristicName; public final boolean redundant;
public static class HeuristicResult /*implements Comparable<HeuristicResult>*/
{
private final byte[] urlhash;
public final String heuristicName;
public final boolean redundant;
private HeuristicResult(final byte[] urlhash, final String heuristicName, final boolean redundant) {
this.urlhash = urlhash; this.heuristicName = heuristicName; this.redundant = redundant;
this.urlhash = urlhash;
this.heuristicName = heuristicName;
this.redundant = redundant;
}/*
public int compareTo(HeuristicResult o) {
return Base64Order.enhancedCoder.compare(this.urlhash, o.urlhash);
@ -402,7 +508,8 @@ public final class SearchEvent {
}*/
}
public class SecondarySearchSuperviser extends Thread {
public class SecondarySearchSuperviser extends Thread
{
// cache for index abstracts; word:TreeMap mapping where the embedded TreeMap is a urlhash:peerlist relation
// this relation contains the information where specific urls can be found in specific peers
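A minimal sketch of the cache shape the comment describes, filled with hypothetical 12-character hashes:
    // sketch: word hash -> ( url hash -> concatenated 12-character peer hashes )
    final SortedMap<String, SortedMap<String, StringBuilder>> abstractsCache =
        new TreeMap<String, SortedMap<String, StringBuilder>>();
    final SortedMap<String, StringBuilder> urlsForWord = new TreeMap<String, StringBuilder>();
    urlsForWord.put("urlhashAAAAA", new StringBuilder("peerhash0001peerhash0002")); // hypothetical hashes
    abstractsCache.put("wordhashBBBB", urlsForWord);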
@ -418,6 +525,7 @@ public final class SearchEvent {
/**
* add a single abstract to the existing set of abstracts
*
* @param wordhash
* @param singleAbstract // a mapping from url-hashes to a string of peer-hashes
*/
@ -433,13 +541,16 @@ public final class SearchEvent {
}
// extend the abstracts in the cache: join the single abstracts
new Thread() {
@Override
public void run() {
for ( final Map.Entry<String, StringBuilder> oneref : singleAbstract.entrySet() ) {
final String urlhash = oneref.getKey();
final StringBuilder peerlistNew = oneref.getValue();
synchronized ( oldAbstract ) {
final StringBuilder peerlistOld = oldAbstract.put(urlhash, peerlistNew);
if (peerlistOld != null) peerlistOld.append(peerlistNew);
if ( peerlistOld != null ) {
peerlistOld.append(peerlistNew);
}
}
}
}
@ -459,7 +570,8 @@ public final class SearchEvent {
int p;
boolean hasURL;
synchronized ( this ) {
final Iterator<Map.Entry <String, SortedMap<String, StringBuilder>>> i = this.abstractsCache.entrySet().iterator();
final Iterator<Map.Entry<String, SortedMap<String, StringBuilder>>> i =
this.abstractsCache.entrySet().iterator();
while ( i.hasNext() ) {
entry = i.next();
word = entry.getKey();
@ -474,7 +586,9 @@ public final class SearchEvent {
break;
}
}
if (hasURL) wordlist += word;
if ( hasURL ) {
wordlist += word;
}
}
}
return wordlist;
@ -488,7 +602,9 @@ public final class SearchEvent {
// a trigger was released
prepareSecondarySearch();
t++;
if (t > 10) break;
if ( t > 10 ) {
break;
}
}
} catch ( final InterruptedException e ) {
// the thread was interrupted
@ -498,7 +614,10 @@ public final class SearchEvent {
}
private void prepareSecondarySearch() {
if (this.abstractsCache == null || this.abstractsCache.size() != SearchEvent.this.query.queryHashes.size()) return; // secondary search not possible (yet)
if ( this.abstractsCache == null
|| this.abstractsCache.size() != SearchEvent.this.query.queryHashes.size() ) {
return; // secondary search not possible (yet)
}
// catch up index abstracts and join them; then call peers again to submit their urls
@ -510,12 +629,17 @@ public final class SearchEvent {
*/
// find out if there are enough references for all words that are searched
if (this.abstractsCache.size() != SearchEvent.this.query.queryHashes.size()) return;
if ( this.abstractsCache.size() != SearchEvent.this.query.queryHashes.size() ) {
return;
}
// join all the urlhash:peerlist relations: the resulting map has values with a combined peer-list list
final SortedMap<String, StringBuilder> abstractJoin = SetTools.joinConstructive(this.abstractsCache.values(), true);
if (abstractJoin.isEmpty()) return;
final SortedMap<String, StringBuilder> abstractJoin =
SetTools.joinConstructive(this.abstractsCache.values(), true);
if ( abstractJoin.isEmpty() ) {
return;
}
// the join result is now a urlhash: peer-list relation
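SetTools.joinConstructive intersects the per-word maps by url hash and merges their peer lists; a hedged sketch of that semantics (not the actual YaCy implementation):
    // sketch: join urlhash -> peerlist maps by key intersection, concatenating peer lists
    // (assumes at least one input map; copies the builders so the inputs stay untouched)
    static SortedMap<String, StringBuilder> joinSketch(final Collection<SortedMap<String, StringBuilder>> maps) {
        final Iterator<SortedMap<String, StringBuilder>> i = maps.iterator();
        final SortedMap<String, StringBuilder> result = new TreeMap<String, StringBuilder>();
        for ( final Map.Entry<String, StringBuilder> e : i.next().entrySet() ) {
            result.put(e.getKey(), new StringBuilder(e.getValue()));
        }
        while ( i.hasNext() ) {
            final SortedMap<String, StringBuilder> m = i.next();
            result.keySet().retainAll(m.keySet()); // keep only urls known for every word
            for ( final Map.Entry<String, StringBuilder> e : result.entrySet() ) {
                e.getValue().append(m.get(e.getKey())); // merge peer lists for the shared url
            }
        }
        return result;
    }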
// generate a list of peers that have the urls for the joined search result
final SortedMap<String, StringBuilder> secondarySearchURLs = new TreeMap<String, StringBuilder>(); // a (peerhash:urlhash-liststring) mapping
@ -531,7 +655,9 @@ public final class SearchEvent {
mypeercount = 0;
for ( int j = 0; j < peerlist.length(); j += 12 ) {
peer = peerlist.substring(j, j + 12);
if ((peer.equals(mypeerhash)) && (mypeercount++ > 1)) continue;
if ( (peer.equals(mypeerhash)) && (mypeercount++ > 1) ) {
continue;
}
//if (peers.indexOf(peer) < j) continue; // avoid doubles that may appear in the abstractJoin
urls = secondarySearchURLs.get(peer);
if ( urls == null ) {
@ -543,27 +669,47 @@ public final class SearchEvent {
}
secondarySearchURLs.put(peer, urls);
}
if (mypeercount == 1) mypeerinvolved = true;
if ( mypeercount == 1 ) {
mypeerinvolved = true;
}
}
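Peer hashes are fixed-width (12 characters), so a peer list is just their concatenation and can be sliced positionally, as the loop above does; a tiny example with hypothetical hashes:
    // sketch: slicing a concatenated peer list into 12-character peer hashes
    final String peerlist = "peerhash0001peerhash0002"; // hypothetical
    for ( int j = 0; j < peerlist.length(); j += 12 ) {
        System.out.println(peerlist.substring(j, j + 12)); // peerhash0001, then peerhash0002
    }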
// compute words for secondary search and start the secondary searches
String words;
SearchEvent.this.secondarySearchThreads = new RemoteSearch[(mypeerinvolved) ? secondarySearchURLs.size() - 1 : secondarySearchURLs.size()];
SearchEvent.this.secondarySearchThreads =
new RemoteSearch[(mypeerinvolved) ? secondarySearchURLs.size() - 1 : secondarySearchURLs
.size()];
int c = 0;
for ( final Map.Entry<String, StringBuilder> entry : secondarySearchURLs.entrySet() ) {
peer = entry.getKey();
if (peer.equals(mypeerhash)) continue; // we don't need to ask ourself
if (this.checkedPeers.contains(peer)) continue; // do not ask a peer again
if ( peer.equals(mypeerhash) ) {
continue; // we don't need to ask ourself
}
if ( this.checkedPeers.contains(peer) ) {
continue; // do not ask a peer again
}
urls = entry.getValue();
words = wordsFromPeer(peer, urls);
if (words.length() == 0) continue; // ???
if ( words.length() == 0 ) {
continue; // ???
}
assert words.length() >= 12 : "words = " + words;
//System.out.println("DEBUG-INDEXABSTRACT ***: peer " + peer + " has urls: " + urls + " from words: " + words);
SearchEvent.this.rankingProcess.moreFeeders(1);
this.checkedPeers.add(peer);
SearchEvent.this.secondarySearchThreads[c++] = RemoteSearch.secondaryRemoteSearch(
words, urls.toString(), 6000, SearchEvent.this.query.getSegment(), SearchEvent.this.peers, SearchEvent.this.rankingProcess, peer, Switchboard.urlBlacklist,
SearchEvent.this.query.ranking, SearchEvent.this.query.constraint, SearchEvent.this.preselectedPeerHashes);
SearchEvent.this.secondarySearchThreads[c++] =
RemoteSearch.secondaryRemoteSearch(
words,
urls.toString(),
6000,
SearchEvent.this.query.getSegment(),
SearchEvent.this.peers,
SearchEvent.this.rankingProcess,
peer,
Switchboard.urlBlacklist,
SearchEvent.this.query.ranking,
SearchEvent.this.query.constraint,
SearchEvent.this.preselectedPeerHashes);
}
}
@ -575,8 +721,14 @@ public final class SearchEvent {
}
public boolean workerAlive() {
if (this.resultFetcher== null || this.resultFetcher.workerThreads == null) return false;
for (final Worker w: this.resultFetcher.workerThreads) if (w != null && w.isAlive()) return true;
if ( this.resultFetcher == null || this.resultFetcher.workerThreads == null ) {
return false;
}
for ( final Worker w : this.resultFetcher.workerThreads ) {
if ( w != null && w.isAlive() ) {
return true;
}
}
return false;
}