From f14faf503b6bb49c10c5aff6f548c943998a3a92 Mon Sep 17 00:00:00 2001 From: Michael Christen Date: Tue, 6 Dec 2011 02:24:51 +0100 Subject: [PATCH] better ranking because we wait a very little time during the search process more to get better remote sear results into the ranking priority stack --- htroot/IndexControlRWIs_p.java | 638 ++++++---- source/net/yacy/peers/Protocol.java | 1132 +++++++++++------ .../net/yacy/search/index/DocumentIndex.java | 162 ++- source/net/yacy/search/query/RWIProcess.java | 546 +++++--- source/net/yacy/search/query/SearchEvent.java | 510 +++++--- 5 files changed, 1945 insertions(+), 1043 deletions(-) diff --git a/htroot/IndexControlRWIs_p.java b/htroot/IndexControlRWIs_p.java index 9ed49367b..619309d3d 100644 --- a/htroot/IndexControlRWIs_p.java +++ b/htroot/IndexControlRWIs_p.java @@ -75,9 +75,13 @@ import de.anomic.http.client.Cache; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; -public class IndexControlRWIs_p { +public class IndexControlRWIs_p +{ - public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) throws IOException { + public static serverObjects respond( + final RequestHeader header, + final serverObjects post, + final serverSwitch env) throws IOException { // return variable that accumulates replacements final Switchboard sb = (Switchboard) env; final serverObjects prop = new serverObjects(); @@ -87,11 +91,12 @@ public class IndexControlRWIs_p { prop.put("keyhash", ""); prop.put("result", ""); prop.put("cleanup", post == null || post.containsKey("maxReferencesLimit") ? 1 : 0); - prop.put("cleanup_solr", sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).getSolr() == null || !sb.getConfigBool("federated.service.solr.indexing.enabled", false) ? 0 : 1); + prop.put("cleanup_solr", sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).getSolr() == null + || !sb.getConfigBool("federated.service.solr.indexing.enabled", false) ? 
0 : 1); String segmentName = sb.getConfig(SwitchboardConstants.SEGMENT_PUBLIC, "default"); int i = 0; - for (final String s: sb.indexSegments.segmentNames()) { + for ( final String s : sb.indexSegments.segmentNames() ) { prop.put("segments_" + i + "_name", s); prop.put("segments_" + i + "_selected", (segmentName.equals(s)) ? 1 : 0); i++; @@ -107,11 +112,11 @@ public class IndexControlRWIs_p { // clean up all search events SearchEventCache.cleanupEvents(true); - if (post != null) { + if ( post != null ) { // default values segmentName = post.get("segment", segmentName).trim(); - i= 0; - for (final String s: sb.indexSegments.segmentNames()) { + i = 0; + for ( final String s : sb.indexSegments.segmentNames() ) { prop.put("segments_" + i + "_name", s); prop.put("segments_" + i + "_selected", (segmentName.equals(s)) ? 1 : 0); i++; @@ -126,62 +131,77 @@ public class IndexControlRWIs_p { // read values from checkboxes final String[] urls = post.getAll("urlhx.*"); - HandleSet urlb = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, urls.length); - if (urls != null) for (final String s: urls) try { urlb.put(s.getBytes()); } catch (final RowSpaceExceededException e) { Log.logException(e); } - final boolean delurl = post.containsKey("delurl"); + HandleSet urlb = + new HandleSet( + URIMetadataRow.rowdef.primaryKeyLength, + URIMetadataRow.rowdef.objectOrder, + urls.length); + if ( urls != null ) { + for ( final String s : urls ) { + try { + urlb.put(s.getBytes()); + } catch ( final RowSpaceExceededException e ) { + Log.logException(e); + } + } + } + final boolean delurl = post.containsKey("delurl"); final boolean delurlref = post.containsKey("delurlref"); - if (post.containsKey("keystringsearch")) { + if ( post.containsKey("keystringsearch") ) { keyhash = Word.word2hash(keystring); prop.put("keyhash", keyhash); final RWIProcess ranking = genSearchresult(prop, sb, segment, keyhash, null); - if (ranking.filteredCount() == 0) { + if ( 
ranking.filteredCount() == 0 ) { prop.put("searchresult", 1); prop.putHTML("searchresult_word", keystring); } } - if (post.containsKey("keyhashsearch")) { - if (keystring.length() == 0 || !ByteBuffer.equals(Word.word2hash(keystring), keyhash)) { + if ( post.containsKey("keyhashsearch") ) { + if ( keystring.length() == 0 || !ByteBuffer.equals(Word.word2hash(keystring), keyhash) ) { prop.put("keystring", "<not possible to compute word from hash>"); } final RWIProcess ranking = genSearchresult(prop, sb, segment, keyhash, null); - if (ranking.filteredCount() == 0) { + if ( ranking.filteredCount() == 0 ) { prop.put("searchresult", 2); prop.putHTML("searchresult_wordhash", ASCII.String(keyhash)); } } - // delete everything - if (post.containsKey("deletecomplete")) { - if (post.get("deleteIndex", "").equals("on")) { + // delete everything + if ( post.containsKey("deletecomplete") ) { + if ( post.get("deleteIndex", "").equals("on") ) { segment.clear(); } - if (post.get("deleteSolr", "").equals("on") && sb.getConfigBool("federated.service.solr.indexing.enabled", false)) try { - sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).getSolr().clear(); - } catch (final Exception e) { - Log.logException(e); + if ( post.get("deleteSolr", "").equals("on") + && sb.getConfigBool("federated.service.solr.indexing.enabled", false) ) { + try { + sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).getSolr().clear(); + } catch ( final Exception e ) { + Log.logException(e); + } } - if (post.get("deleteCrawlQueues", "").equals("on")) { + if ( post.get("deleteCrawlQueues", "").equals("on") ) { sb.crawlQueues.clear(); sb.crawlStacker.clear(); ResultURLs.clearStacks(); } - if (post.get("deleteCache", "").equals("on")) { + if ( post.get("deleteCache", "").equals("on") ) { Cache.clear(); } - if (post.get("deleteRobots", "").equals("on")) { + if ( post.get("deleteRobots", "").equals("on") ) { sb.robots.clear(); } - if (post.get("deleteSearchFl", "").equals("on")) { + if ( 
post.get("deleteSearchFl", "").equals("on") ) { sb.tables.clear(WorkTables.TABLE_SEARCH_FAILURE_NAME); } post.remove("deletecomplete"); } // set reference limitation - if (post.containsKey("maxReferencesLimit")) { - if (post.get("maxReferencesRadio", "").equals("on")) { + if ( post.containsKey("maxReferencesLimit") ) { + if ( post.get("maxReferencesRadio", "").equals("on") ) { ReferenceContainer.maxReferences = post.getInt("maxReferences", 0); } else { ReferenceContainer.maxReferences = 0; @@ -190,52 +210,80 @@ public class IndexControlRWIs_p { } // delete word - if (post.containsKey("keyhashdeleteall")) try { - if (delurl || delurlref) { - // generate urlx: an array of url hashes to be deleted - ReferenceContainer index = null; - index = segment.termIndex().get(keyhash, null); - final Iterator en = index.entries(); - urlb = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, index.size()); - while (en.hasNext()) try { urlb.put(en.next().urlhash()); } catch (final RowSpaceExceededException e) { Log.logException(e); } - index = null; - } - if (delurlref) { - segment.removeAllUrlReferences(urlb, sb.loader, CacheStrategy.IFEXIST); - } - // delete the word first because that is much faster than the deletion of the urls from the url database - segment.termIndex().delete(keyhash); - // now delete all urls if demanded - if (delurl || delurlref) { - for (final byte[] b: urlb) sb.urlRemove(segment, b); + if ( post.containsKey("keyhashdeleteall") ) { + try { + if ( delurl || delurlref ) { + // generate urlx: an array of url hashes to be deleted + ReferenceContainer index = null; + index = segment.termIndex().get(keyhash, null); + final Iterator en = index.entries(); + urlb = + new HandleSet( + URIMetadataRow.rowdef.primaryKeyLength, + URIMetadataRow.rowdef.objectOrder, + index.size()); + while ( en.hasNext() ) { + try { + urlb.put(en.next().urlhash()); + } catch ( final RowSpaceExceededException e ) { + Log.logException(e); + } + } + 
index = null; + } + if ( delurlref ) { + segment.removeAllUrlReferences(urlb, sb.loader, CacheStrategy.IFEXIST); + } + // delete the word first because that is much faster than the deletion of the urls from the url database + segment.termIndex().delete(keyhash); + // now delete all urls if demanded + if ( delurl || delurlref ) { + for ( final byte[] b : urlb ) { + sb.urlRemove(segment, b); + } + } + post.remove("keyhashdeleteall"); + post.put("urllist", "generated"); + } catch ( final IOException e ) { + Log.logException(e); } - post.remove("keyhashdeleteall"); - post.put("urllist", "generated"); - } catch (final IOException e) { - Log.logException(e); } // delete selected URLs - if (post.containsKey("keyhashdelete")) try { - if (delurlref) { - segment.removeAllUrlReferences(urlb, sb.loader, CacheStrategy.IFEXIST); - } - if (delurl || delurlref) { - for (final byte[] b: urlb) sb.urlRemove(segment, b); + if ( post.containsKey("keyhashdelete") ) { + try { + if ( delurlref ) { + segment.removeAllUrlReferences(urlb, sb.loader, CacheStrategy.IFEXIST); + } + if ( delurl || delurlref ) { + for ( final byte[] b : urlb ) { + sb.urlRemove(segment, b); + } + } + final HandleSet urlHashes = + new HandleSet( + URIMetadataRow.rowdef.primaryKeyLength, + URIMetadataRow.rowdef.objectOrder, + 0); + for ( final byte[] b : urlb ) { + try { + urlHashes.put(b); + } catch ( final RowSpaceExceededException e ) { + Log.logException(e); + } + } + segment.termIndex().remove(keyhash, urlHashes); + // this shall lead to a presentation of the list; so handle that the remaining program + // thinks that it was called for a list presentation + post.remove("keyhashdelete"); + post.put("urllist", "generated"); + } catch ( final IOException e ) { + Log.logException(e); } - final HandleSet urlHashes = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0); - for (final byte[] b: urlb) try { urlHashes.put(b); } catch (final RowSpaceExceededException e) { 
Log.logException(e); } - segment.termIndex().remove(keyhash, urlHashes); - // this shall lead to a presentation of the list; so handle that the remaining program - // thinks that it was called for a list presentation - post.remove("keyhashdelete"); - post.put("urllist", "generated"); - } catch (final IOException e) { - Log.logException(e); } - if (post.containsKey("urllist")) { - if (keystring.length() == 0 || !ByteBuffer.equals(Word.word2hash(keystring), keyhash)) { + if ( post.containsKey("urllist") ) { + if ( keystring.length() == 0 || !ByteBuffer.equals(Word.word2hash(keystring), keyhash) ) { prop.put("keystring", "<not possible to compute word from hash>"); } final Bitfield flags = compileFlags(post); @@ -245,192 +293,229 @@ public class IndexControlRWIs_p { } // transfer to other peer - if (post.containsKey("keyhashtransfer")) try { - if (keystring.length() == 0 || !ByteBuffer.equals(Word.word2hash(keystring), keyhash)) { - prop.put("keystring", "<not possible to compute word from hash>"); - } + if ( post.containsKey("keyhashtransfer") ) { + try { + if ( keystring.length() == 0 || !ByteBuffer.equals(Word.word2hash(keystring), keyhash) ) { + prop.put("keystring", "<not possible to compute word from hash>"); + } - // find host & peer - String host = post.get("host", ""); // get host from input field - Seed seed = null; - if (host.length() != 0) { - if (host.length() == 12) { - // the host string is a peer hash - seed = sb.peers.getConnected(host); + // find host & peer + String host = post.get("host", ""); // get host from input field + Seed seed = null; + if ( host.length() != 0 ) { + if ( host.length() == 12 ) { + // the host string is a peer hash + seed = sb.peers.getConnected(host); + } else { + // the host string can be a host name + seed = sb.peers.lookupByName(host); + } } else { - // the host string can be a host name - seed = sb.peers.lookupByName(host); + host = post.get("hostHash", ""); // if input field is empty, get from select box + seed = 
sb.peers.getConnected(host); } - } else { - host = post.get("hostHash", ""); // if input field is empty, get from select box - seed = sb.peers.getConnected(host); - } - // prepare index - ReferenceContainer index; - final long starttime = System.currentTimeMillis(); - index = segment.termIndex().get(keyhash, null); - // built urlCache - final Iterator urlIter = index.entries(); - final TreeMap knownURLs = new TreeMap(Base64Order.enhancedCoder); - final HandleSet unknownURLEntries = new HandleSet(WordReferenceRow.urlEntryRow.primaryKeyLength, WordReferenceRow.urlEntryRow.objectOrder, index.size()); - Reference iEntry; - URIMetadataRow lurl; - while (urlIter.hasNext()) { - iEntry = urlIter.next(); - lurl = segment.urlMetadata().load(iEntry.urlhash()); - if (lurl == null) { - try { - unknownURLEntries.put(iEntry.urlhash()); - } catch (final RowSpaceExceededException e) { - Log.logException(e); + // prepare index + ReferenceContainer index; + final long starttime = System.currentTimeMillis(); + index = segment.termIndex().get(keyhash, null); + // built urlCache + final Iterator urlIter = index.entries(); + final TreeMap knownURLs = + new TreeMap(Base64Order.enhancedCoder); + final HandleSet unknownURLEntries = + new HandleSet( + WordReferenceRow.urlEntryRow.primaryKeyLength, + WordReferenceRow.urlEntryRow.objectOrder, + index.size()); + Reference iEntry; + URIMetadataRow lurl; + while ( urlIter.hasNext() ) { + iEntry = urlIter.next(); + lurl = segment.urlMetadata().load(iEntry.urlhash()); + if ( lurl == null ) { + try { + unknownURLEntries.put(iEntry.urlhash()); + } catch ( final RowSpaceExceededException e ) { + Log.logException(e); + } + urlIter.remove(); + } else { + knownURLs.put(iEntry.urlhash(), lurl); } - urlIter.remove(); - } else { - knownURLs.put(iEntry.urlhash(), lurl); } - } - // make an indexContainerCache - final ReferenceContainerCache icc = new ReferenceContainerCache(Segment.wordReferenceFactory, Segment.wordOrder, Word.commonHashLength); - try { - 
icc.add(index); - } catch (final RowSpaceExceededException e) { + // make an indexContainerCache + final ReferenceContainerCache icc = + new ReferenceContainerCache( + Segment.wordReferenceFactory, + Segment.wordOrder, + Word.commonHashLength); + try { + icc.add(index); + } catch ( final RowSpaceExceededException e ) { + Log.logException(e); + } + + // transport to other peer + final boolean gzipBody = sb.getConfigBool("indexControl.gzipBody", false); + final int timeout = (int) sb.getConfigLong("indexControl.timeout", 60000); + final String error = Protocol.transferIndex(seed, icc, knownURLs, gzipBody, timeout); + prop.put("result", (error == null) ? ("Successfully transferred " + + knownURLs.size() + + " words in " + + ((System.currentTimeMillis() - starttime) / 1000) + + " seconds, " + + unknownURLEntries.size() + " URL not found") : "error: " + error); + index = null; + } catch ( final IOException e ) { Log.logException(e); } - - // transport to other peer - final boolean gzipBody = sb.getConfigBool("indexControl.gzipBody", false); - final int timeout = (int) sb.getConfigLong("indexControl.timeout", 60000); - final String error = Protocol.transferIndex( - seed, - icc, - knownURLs, - gzipBody, - timeout); - prop.put("result", (error == null) ? 
("Successfully transferred " + knownURLs.size() + " words in " + ((System.currentTimeMillis() - starttime) / 1000) + " seconds, " + unknownURLEntries.size() + " URL not found") : "error: " + error); - index = null; - } catch (final IOException e) { - Log.logException(e); } // generate list - if (post.containsKey("keyhashsimilar")) try { - final Iterator> containerIt = segment.termIndex().referenceContainer(keyhash, true, 256, false).iterator(); + if ( post.containsKey("keyhashsimilar") ) { + try { + final Iterator> containerIt = + segment.termIndex().referenceContainer(keyhash, true, 256, false).iterator(); ReferenceContainer container; i = 0; int rows = 0, cols = 0; prop.put("keyhashsimilar", "1"); - while (containerIt.hasNext() && i < 256) { + while ( containerIt.hasNext() && i < 256 ) { container = containerIt.next(); - prop.put("keyhashsimilar_rows_"+rows+"_cols_"+cols+"_wordHash", container.getTermHash()); + prop.put( + "keyhashsimilar_rows_" + rows + "_cols_" + cols + "_wordHash", + container.getTermHash()); cols++; - if (cols==8) { - prop.put("keyhashsimilar_rows_"+rows+"_cols", cols); + if ( cols == 8 ) { + prop.put("keyhashsimilar_rows_" + rows + "_cols", cols); cols = 0; rows++; } i++; } - prop.put("keyhashsimilar_rows_"+rows+"_cols", cols); + prop.put("keyhashsimilar_rows_" + rows + "_cols", cols); prop.put("keyhashsimilar_rows", rows + 1); prop.put("result", ""); - } catch (final IOException e) { - Log.logException(e); + } catch ( final IOException e ) { + Log.logException(e); + } } - if (post.containsKey("blacklist")) { + if ( post.containsKey("blacklist") ) { final String blacklist = post.get("blacklist", ""); - final HandleSet urlHashes = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, urlb.size()); - if (post.containsKey("blacklisturls")) { + final HandleSet urlHashes = + new HandleSet( + URIMetadataRow.rowdef.primaryKeyLength, + URIMetadataRow.rowdef.objectOrder, + urlb.size()); + if ( 
post.containsKey("blacklisturls") ) { PrintWriter pw; try { - final String[] supportedBlacklistTypes = env.getConfig("BlackLists.types", "").split(","); - pw = new PrintWriter(new FileWriter(new File(ListManager.listsPath, blacklist), true)); + final String[] supportedBlacklistTypes = + env.getConfig("BlackLists.types", "").split(","); + pw = + new PrintWriter(new FileWriter(new File(ListManager.listsPath, blacklist), true)); DigestURI url; - for (final byte[] b: urlb) { + for ( final byte[] b : urlb ) { try { urlHashes.put(b); - } catch (final RowSpaceExceededException e) { + } catch ( final RowSpaceExceededException e ) { Log.logException(e); } final URIMetadataRow e = segment.urlMetadata().load(b); segment.urlMetadata().remove(b); - if (e != null) { + if ( e != null ) { url = e.metadata().url(); pw.println(url.getHost() + "/" + url.getFile()); - for (final String supportedBlacklistType : supportedBlacklistTypes) { - if (ListManager.listSetContains(supportedBlacklistType + ".BlackLists", blacklist)) { + for ( final String supportedBlacklistType : supportedBlacklistTypes ) { + if ( ListManager.listSetContains( + supportedBlacklistType + ".BlackLists", + blacklist) ) { Switchboard.urlBlacklist.add( - supportedBlacklistType, - url.getHost(), - url.getFile()); + supportedBlacklistType, + url.getHost(), + url.getFile()); } } SearchEventCache.cleanupEvents(true); } } pw.close(); - } catch (final IOException e) { + } catch ( final IOException e ) { } } - if (post.containsKey("blacklistdomains")) { + if ( post.containsKey("blacklistdomains") ) { PrintWriter pw; try { final String[] supportedBlacklistTypes = Blacklist.BLACKLIST_TYPES_STRING.split(","); - pw = new PrintWriter(new FileWriter(new File(ListManager.listsPath, blacklist), true)); + pw = + new PrintWriter(new FileWriter(new File(ListManager.listsPath, blacklist), true)); DigestURI url; - for (final byte[] b: urlb) { + for ( final byte[] b : urlb ) { try { urlHashes.put(b); - } catch (final 
RowSpaceExceededException e) { + } catch ( final RowSpaceExceededException e ) { Log.logException(e); } final URIMetadataRow e = segment.urlMetadata().load(b); segment.urlMetadata().remove(b); - if (e != null) { + if ( e != null ) { url = e.metadata().url(); pw.println(url.getHost() + "/.*"); - for (final String supportedBlacklistType : supportedBlacklistTypes) { - if (ListManager.listSetContains(supportedBlacklistType + ".BlackLists", blacklist)) { + for ( final String supportedBlacklistType : supportedBlacklistTypes ) { + if ( ListManager.listSetContains( + supportedBlacklistType + ".BlackLists", + blacklist) ) { Switchboard.urlBlacklist.add( - supportedBlacklistType, - url.getHost(), ".*"); + supportedBlacklistType, + url.getHost(), + ".*"); } } } } pw.close(); - } catch (final IOException e) { + } catch ( final IOException e ) { } } try { segment.termIndex().remove(keyhash, urlHashes); - } catch (final IOException e) { + } catch ( final IOException e ) { Log.logException(e); } } - if (prop.getInt("searchresult", 0) == 3) listHosts(prop, keyhash, sb); + if ( prop.getInt("searchresult", 0) == 3 ) { + listHosts(prop, keyhash, sb); + } } - // insert constants prop.putNum("wcount", segment.termIndex().sizesMax()); prop.put("cleanup_maxReferencesRadioChecked", ReferenceContainer.maxReferences > 0 ? 1 : 0); - prop.put("cleanup_maxReferences", ReferenceContainer.maxReferences > 0 ? ReferenceContainer.maxReferences : 100000); + prop.put("cleanup_maxReferences", ReferenceContainer.maxReferences > 0 + ? 
ReferenceContainer.maxReferences + : 100000); // return rewrite properties return prop; } - public static void genURLList(final serverObjects prop, final byte[] keyhash, final String keystring, final RWIProcess ranked, final Bitfield flags, final int maxlines) { + public static void genURLList( + final serverObjects prop, + final byte[] keyhash, + final String keystring, + final RWIProcess ranked, + final Bitfield flags, + final int maxlines) { // search for a word hash and generate a list of url links final String keyhashs = ASCII.String(keyhash); prop.put("genUrlList_keyHash", keyhashs); - if (ranked.filteredCount() == 0) { + if ( ranked.filteredCount() == 0 ) { prop.put("genUrlList", 1); prop.put("genUrlList_count", 0); prop.put("searchresult", 2); @@ -444,61 +529,107 @@ public class IndexControlRWIs_p { URIMetadataRow entry; String us; long rn = -1; - while (!ranked.isEmpty() && (entry = ranked.takeURL(false, 1000)) != null) { - if ((entry == null) || (entry.metadata() == null)) continue; + while ( !ranked.isEmpty() && (entry = ranked.takeURL(false, 1000)) != null ) { + if ( (entry == null) || (entry.metadata() == null) ) { + continue; + } url = entry.metadata().url(); - if (url == null) continue; + if ( url == null ) { + continue; + } us = url.toNormalform(false, false); - if (rn == -1) rn = entry.ranking(); - prop.put("genUrlList_urlList_"+i+"_urlExists", "1"); - prop.put("genUrlList_urlList_"+i+"_urlExists_urlhxCount", i); - prop.putHTML("genUrlList_urlList_"+i+"_urlExists_urlhxValue", entry.word().urlhash()); - prop.putHTML("genUrlList_urlList_"+i+"_urlExists_keyString", keystring); - prop.put("genUrlList_urlList_"+i+"_urlExists_keyHash", keyhashs); - prop.putHTML("genUrlList_urlList_"+i+"_urlExists_urlString", us); - prop.put("genUrlList_urlList_"+i+"_urlExists_urlStringShort", (us.length() > 40) ? (us.substring(0, 20) + "
" + us.substring(20, 40) + "...") : ((us.length() > 30) ? (us.substring(0, 20) + "
" + us.substring(20)) : us)); - prop.putNum("genUrlList_urlList_"+i+"_urlExists_ranking", (entry.ranking() - rn)); - prop.putNum("genUrlList_urlList_"+i+"_urlExists_domlength", DigestURI.domLengthEstimation(entry.hash())); - prop.putNum("genUrlList_urlList_"+i+"_urlExists_ybr", BlockRank.ranking(entry.hash())); - prop.putNum("genUrlList_urlList_"+i+"_urlExists_tf", 1000.0 * entry.word().termFrequency()); - prop.putNum("genUrlList_urlList_"+i+"_urlExists_authority", (ranked.getOrder() == null) ? -1 : ranked.getOrder().authority(ASCII.String(entry.hash(), 6, 6))); - prop.put("genUrlList_urlList_"+i+"_urlExists_date", GenericFormatter.SHORT_DAY_FORMATTER.format(new Date(entry.word().lastModified()))); - prop.putNum("genUrlList_urlList_"+i+"_urlExists_wordsintitle", entry.word().wordsintitle()); - prop.putNum("genUrlList_urlList_"+i+"_urlExists_wordsintext", entry.word().wordsintext()); - prop.putNum("genUrlList_urlList_"+i+"_urlExists_phrasesintext", entry.word().phrasesintext()); - prop.putNum("genUrlList_urlList_"+i+"_urlExists_llocal", entry.word().llocal()); - prop.putNum("genUrlList_urlList_"+i+"_urlExists_lother", entry.word().lother()); - prop.putNum("genUrlList_urlList_"+i+"_urlExists_hitcount", entry.word().hitcount()); - prop.putNum("genUrlList_urlList_"+i+"_urlExists_worddistance", 0); - prop.putNum("genUrlList_urlList_"+i+"_urlExists_ybr", BlockRank.ranking(entry.hash())); - prop.putNum("genUrlList_urlList_"+i+"_urlExists_pos", entry.word().minposition()); - prop.putNum("genUrlList_urlList_"+i+"_urlExists_phrase", entry.word().posofphrase()); - prop.putNum("genUrlList_urlList_"+i+"_urlExists_posinphrase", entry.word().posinphrase()); - prop.putNum("genUrlList_urlList_"+i+"_urlExists_urlcomps", entry.word().urlcomps()); - prop.putNum("genUrlList_urlList_"+i+"_urlExists_urllength", entry.word().urllength()); - prop.put("genUrlList_urlList_"+i+"_urlExists_props", - ((entry.word().flags().get(Condenser.flag_cat_indexof)) ? 
"appears on index page, " : "") + - ((entry.word().flags().get(Condenser.flag_cat_hasimage)) ? "contains images, " : "") + - ((entry.word().flags().get(Condenser.flag_cat_hasaudio)) ? "contains audio, " : "") + - ((entry.word().flags().get(Condenser.flag_cat_hasvideo)) ? "contains video, " : "") + - ((entry.word().flags().get(Condenser.flag_cat_hasapp)) ? "contains applications, " : "") + - ((entry.word().flags().get(WordReferenceRow.flag_app_dc_identifier)) ? "appears in url, " : "") + - ((entry.word().flags().get(WordReferenceRow.flag_app_dc_title)) ? "appears in title, " : "") + - ((entry.word().flags().get(WordReferenceRow.flag_app_dc_creator)) ? "appears in author, " : "") + - ((entry.word().flags().get(WordReferenceRow.flag_app_dc_subject)) ? "appears in subject, " : "") + - ((entry.word().flags().get(WordReferenceRow.flag_app_dc_description)) ? "appears in description, " : "") + - ((entry.word().flags().get(WordReferenceRow.flag_app_emphasized)) ? "appears emphasized, " : "") + - ((DigestURI.probablyRootURL(entry.word().urlhash())) ? "probably root url" : "") - ); - if (Switchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_DHT, url)) { - prop.put("genUrlList_urlList_"+i+"_urlExists_urlhxChecked", "1"); + if ( rn == -1 ) { + rn = entry.ranking(); + } + prop.put("genUrlList_urlList_" + i + "_urlExists", "1"); + prop.put("genUrlList_urlList_" + i + "_urlExists_urlhxCount", i); + prop.putHTML("genUrlList_urlList_" + i + "_urlExists_urlhxValue", entry.word().urlhash()); + prop.putHTML("genUrlList_urlList_" + i + "_urlExists_keyString", keystring); + prop.put("genUrlList_urlList_" + i + "_urlExists_keyHash", keyhashs); + prop.putHTML("genUrlList_urlList_" + i + "_urlExists_urlString", us); + prop.put( + "genUrlList_urlList_" + i + "_urlExists_urlStringShort", + (us.length() > 40) ? (us.substring(0, 20) + "
" + us.substring(20, 40) + "...") : ((us + .length() > 30) ? (us.substring(0, 20) + "
" + us.substring(20)) : us)); + prop.putNum("genUrlList_urlList_" + i + "_urlExists_ranking", (entry.ranking() - rn)); + prop.putNum( + "genUrlList_urlList_" + i + "_urlExists_domlength", + DigestURI.domLengthEstimation(entry.hash())); + prop.putNum("genUrlList_urlList_" + i + "_urlExists_ybr", BlockRank.ranking(entry.hash())); + prop.putNum("genUrlList_urlList_" + i + "_urlExists_tf", 1000.0 * entry + .word() + .termFrequency()); + prop.putNum("genUrlList_urlList_" + i + "_urlExists_authority", (ranked.getOrder() == null) + ? -1 + : ranked.getOrder().authority(ASCII.String(entry.hash(), 6, 6))); + prop.put( + "genUrlList_urlList_" + i + "_urlExists_date", + GenericFormatter.SHORT_DAY_FORMATTER.format(new Date(entry.word().lastModified()))); + prop.putNum("genUrlList_urlList_" + i + "_urlExists_wordsintitle", entry + .word() + .wordsintitle()); + prop.putNum("genUrlList_urlList_" + i + "_urlExists_wordsintext", entry.word().wordsintext()); + prop.putNum("genUrlList_urlList_" + i + "_urlExists_phrasesintext", entry + .word() + .phrasesintext()); + prop.putNum("genUrlList_urlList_" + i + "_urlExists_llocal", entry.word().llocal()); + prop.putNum("genUrlList_urlList_" + i + "_urlExists_lother", entry.word().lother()); + prop.putNum("genUrlList_urlList_" + i + "_urlExists_hitcount", entry.word().hitcount()); + prop.putNum("genUrlList_urlList_" + i + "_urlExists_worddistance", 0); + prop.putNum("genUrlList_urlList_" + i + "_urlExists_ybr", BlockRank.ranking(entry.hash())); + prop.putNum("genUrlList_urlList_" + i + "_urlExists_pos", entry.word().minposition()); + prop.putNum("genUrlList_urlList_" + i + "_urlExists_phrase", entry.word().posofphrase()); + prop.putNum("genUrlList_urlList_" + i + "_urlExists_posinphrase", entry.word().posinphrase()); + prop.putNum("genUrlList_urlList_" + i + "_urlExists_urlcomps", entry.word().urlcomps()); + prop.putNum("genUrlList_urlList_" + i + "_urlExists_urllength", entry.word().urllength()); + prop + .put( + "genUrlList_urlList_" + i + 
"_urlExists_props", + ((entry.word().flags().get(Condenser.flag_cat_indexof)) + ? "appears on index page, " + : "") + + ((entry.word().flags().get(Condenser.flag_cat_hasimage)) + ? "contains images, " + : "") + + ((entry.word().flags().get(Condenser.flag_cat_hasaudio)) + ? "contains audio, " + : "") + + ((entry.word().flags().get(Condenser.flag_cat_hasvideo)) + ? "contains video, " + : "") + + ((entry.word().flags().get(Condenser.flag_cat_hasapp)) + ? "contains applications, " + : "") + + ((entry.word().flags().get(WordReferenceRow.flag_app_dc_identifier)) + ? "appears in url, " + : "") + + ((entry.word().flags().get(WordReferenceRow.flag_app_dc_title)) + ? "appears in title, " + : "") + + ((entry.word().flags().get(WordReferenceRow.flag_app_dc_creator)) + ? "appears in author, " + : "") + + ((entry.word().flags().get(WordReferenceRow.flag_app_dc_subject)) + ? "appears in subject, " + : "") + + ((entry.word().flags().get(WordReferenceRow.flag_app_dc_description)) + ? "appears in description, " + : "") + + ((entry.word().flags().get(WordReferenceRow.flag_app_emphasized)) + ? "appears emphasized, " + : "") + + ((DigestURI.probablyRootURL(entry.word().urlhash())) ? 
"probably root url" : "")); + if ( Switchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_DHT, url) ) { + prop.put("genUrlList_urlList_" + i + "_urlExists_urlhxChecked", "1"); } i++; - if ((maxlines >= 0) && (i >= maxlines)) break; + if ( (maxlines >= 0) && (i >= maxlines) ) { + break; + } } final Iterator iter = ranked.miss(); // iterates url hash strings byte[] b; - while (iter.hasNext()) { + while ( iter.hasNext() ) { b = iter.next(); prop.put("genUrlList_urlList_" + i + "_urlExists", "0"); prop.put("genUrlList_urlList_" + i + "_urlExists_urlhxCount", i); @@ -515,28 +646,55 @@ public class IndexControlRWIs_p { public static void putBlacklists(final serverObjects prop, final List lists) { prop.put("genUrlList_blacklists", lists.size()); int i = 0; - for (final String list : lists) + for ( final String list : lists ) { prop.put("genUrlList_blacklists_" + i++ + "_name", list); + } } public static Bitfield compileFlags(final serverObjects post) { final Bitfield b = new Bitfield(4); - if (post.get("allurl", "").equals("on")) return null; - if (post.get("flags") != null) { - if (post.get("flags","").length() == 0) return null; + if ( post.get("allurl", "").equals("on") ) { + return null; + } + if ( post.get("flags") != null ) { + if ( post.get("flags", "").length() == 0 ) { + return null; + } return new Bitfield(4, post.get("flags")); } - if (post.get("description", "").equals("on")) b.set(WordReferenceRow.flag_app_dc_description, true); - if (post.get("title", "").equals("on")) b.set(WordReferenceRow.flag_app_dc_title, true); - if (post.get("creator", "").equals("on")) b.set(WordReferenceRow.flag_app_dc_creator, true); - if (post.get("subject", "").equals("on")) b.set(WordReferenceRow.flag_app_dc_subject, true); - if (post.get("url", "").equals("on")) b.set(WordReferenceRow.flag_app_dc_identifier, true); - if (post.get("emphasized", "").equals("on")) b.set(WordReferenceRow.flag_app_emphasized, true); - if (post.get("image", "").equals("on")) 
b.set(Condenser.flag_cat_hasimage, true); - if (post.get("audio", "").equals("on")) b.set(Condenser.flag_cat_hasaudio, true); - if (post.get("video", "").equals("on")) b.set(Condenser.flag_cat_hasvideo, true); - if (post.get("app", "").equals("on")) b.set(Condenser.flag_cat_hasapp, true); - if (post.get("indexof", "").equals("on")) b.set(Condenser.flag_cat_indexof, true); + if ( post.get("description", "").equals("on") ) { + b.set(WordReferenceRow.flag_app_dc_description, true); + } + if ( post.get("title", "").equals("on") ) { + b.set(WordReferenceRow.flag_app_dc_title, true); + } + if ( post.get("creator", "").equals("on") ) { + b.set(WordReferenceRow.flag_app_dc_creator, true); + } + if ( post.get("subject", "").equals("on") ) { + b.set(WordReferenceRow.flag_app_dc_subject, true); + } + if ( post.get("url", "").equals("on") ) { + b.set(WordReferenceRow.flag_app_dc_identifier, true); + } + if ( post.get("emphasized", "").equals("on") ) { + b.set(WordReferenceRow.flag_app_emphasized, true); + } + if ( post.get("image", "").equals("on") ) { + b.set(Condenser.flag_cat_hasimage, true); + } + if ( post.get("audio", "").equals("on") ) { + b.set(Condenser.flag_cat_hasaudio, true); + } + if ( post.get("video", "").equals("on") ) { + b.set(Condenser.flag_cat_hasvideo, true); + } + if ( post.get("app", "").equals("on") ) { + b.set(Condenser.flag_cat_hasapp, true); + } + if ( post.get("indexof", "").equals("on") ) { + b.set(Condenser.flag_cat_indexof, true); + } return b; } @@ -545,31 +703,41 @@ public class IndexControlRWIs_p { Seed seed; int hc = 0; prop.put("searchresult_keyhash", startHash); - final Iterator e = PeerSelection.getAcceptRemoteIndexSeeds(sb.peers, startHash, sb.peers.sizeConnected(), true); - while (e.hasNext()) { + final Iterator e = + PeerSelection.getAcceptRemoteIndexSeeds(sb.peers, startHash, sb.peers.sizeConnected(), true); + while ( e.hasNext() ) { seed = e.next(); - if (seed != null) { + if ( seed != null ) { prop.put("searchresult_hosts_" + hc + 
"_hosthash", seed.hash); - prop.putHTML("searchresult_hosts_" + hc + "_hostname", seed.hash + " " + seed.get(Seed.NAME, "nameless")); + prop.putHTML( + "searchresult_hosts_" + hc + "_hostname", + seed.hash + " " + seed.get(Seed.NAME, "nameless")); hc++; } } prop.put("searchresult_hosts", hc); } - public static RWIProcess genSearchresult(final serverObjects prop, final Switchboard sb, final Segment segment, final byte[] keyhash, final Bitfield filter) { - final QueryParams query = new QueryParams(ASCII.String(keyhash), -1, filter, segment, sb.getRanking(), "IndexControlRWIs_p"); + public static RWIProcess genSearchresult( + final serverObjects prop, + final Switchboard sb, + final Segment segment, + final byte[] keyhash, + final Bitfield filter) { + final QueryParams query = + new QueryParams(ASCII.String(keyhash), -1, filter, segment, sb.getRanking(), "IndexControlRWIs_p"); final ReferenceOrder order = new ReferenceOrder(query.ranking, UTF8.getBytes(query.targetlang)); final RWIProcess ranked = new RWIProcess(query, order, Integer.MAX_VALUE); ranked.run(); - if (ranked.filteredCount() == 0) { + if ( ranked.filteredCount() == 0 ) { prop.put("searchresult", 2); prop.put("searchresult_wordhash", keyhash); } else { prop.put("searchresult", 3); prop.put("searchresult_allurl", ranked.filteredCount()); - prop.put("searchresult_description", ranked.flagCount()[WordReferenceRow.flag_app_dc_description]); + prop + .put("searchresult_description", ranked.flagCount()[WordReferenceRow.flag_app_dc_description]); prop.put("searchresult_title", ranked.flagCount()[WordReferenceRow.flag_app_dc_title]); prop.put("searchresult_creator", ranked.flagCount()[WordReferenceRow.flag_app_dc_creator]); prop.put("searchresult_subject", ranked.flagCount()[WordReferenceRow.flag_app_dc_subject]); diff --git a/source/net/yacy/peers/Protocol.java b/source/net/yacy/peers/Protocol.java index 3c83bdb48..b545238c7 100644 --- a/source/net/yacy/peers/Protocol.java +++ 
b/source/net/yacy/peers/Protocol.java @@ -115,49 +115,66 @@ import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; import de.anomic.tools.crypt; -public final class Protocol { - - - private static byte[] postToFile(final Seed target, final String filename, final Map parts, final int timeout) throws IOException { +public final class Protocol +{ + + private static byte[] postToFile( + final Seed target, + final String filename, + final Map parts, + final int timeout) throws IOException { return postToFile(target.getClusterAddress(), target.hash, filename, parts, timeout); } - private static byte[] postToFile(final SeedDB seedDB, final String targetHash, final String filename, final Map parts, final int timeout) throws IOException { - return postToFile(seedDB.targetAddress(targetHash), targetHash, filename, parts, timeout); + + private static byte[] postToFile( + final SeedDB seedDB, + final String targetHash, + final String filename, + final Map parts, + final int timeout) throws IOException { + return postToFile(seedDB.targetAddress(targetHash), targetHash, filename, parts, timeout); } - private static byte[] postToFile(final String targetAddress, final String targetPeerHash, final String filename, final Map parts, final int timeout) throws IOException { + + private static byte[] postToFile( + final String targetAddress, + final String targetPeerHash, + final String filename, + final Map parts, + final int timeout) throws IOException { final HTTPClient httpClient = new HTTPClient(ClientIdentification.getUserAgent(), timeout); - return httpClient.POSTbytes(new MultiProtocolURI("http://" + targetAddress + "/yacy/" + filename), Seed.b64Hash2hexHash(targetPeerHash) + ".yacyh", parts, false); + return httpClient.POSTbytes( + new MultiProtocolURI("http://" + targetAddress + "/yacy/" + filename), + Seed.b64Hash2hexHash(targetPeerHash) + ".yacyh", + parts, + false); } /** - * this is called to enrich the seed information by - * - own address (if peer 
is behind a nat/router) - * - check peer type (virgin/junior/senior/principal) - * - * to do this, we send a 'Hello' to another peer - * this carries the following information: - * 'iam' - own hash - * 'youare' - remote hash, to verify that we are correct - * 'key' - a session key that the remote peer may use to answer - * and the own seed string - * we expect the following information to be send back: - * - 'yourip' the ip of the connection peer (we) - * - 'yourtype' the type of this peer that the other peer checked by asking for a specific word - * and the remote seed string - * - * one exceptional failure case is when we know the other's peers hash, the other peers responds correctly - * but they appear to be another peer by comparisment of the other peer's hash - * this works of course only if we know the other peer's hash. - * + * this is called to enrich the seed information by - own address (if peer is behind a nat/router) - check + * peer type (virgin/junior/senior/principal) to do this, we send a 'Hello' to another peer this carries + * the following information: 'iam' - own hash 'youare' - remote hash, to verify that we are correct 'key' + * - a session key that the remote peer may use to answer and the own seed string we expect the following + * information to be sent back: - 'yourip' the ip of the connection peer (we) - 'yourtype' the type of + * this peer that the other peer checked by asking for a specific word and the remote seed string one + * exceptional failure case is when we know the other peer's hash, the other peer responds correctly but + * they appear to be another peer by comparison of the other peer's hash this works of course only if we + * know the other peer's hash. 
+ * * @return the number of new seeds */ - public static int hello(final Seed mySeed, final PeerActions peerActions, final String address, final String otherHash, final String otherName) { + public static int hello( + final Seed mySeed, + final PeerActions peerActions, + final String address, + final String otherHash, + final String otherName) { Map result = null; final String salt = crypt.randomSalt(); try { // generate request - final Map parts = basicRequestParts(Switchboard.getSwitchboard(), null, salt); + final Map parts = + basicRequestParts(Switchboard.getSwitchboard(), null, salt); parts.put("count", UTF8.StringBody("20")); parts.put("magic", UTF8.StringBody(Long.toString(Network.magic))); parts.put("seed", UTF8.StringBody(mySeed.genSeedStr(salt))); @@ -165,45 +182,75 @@ public final class Protocol { final long start = System.currentTimeMillis(); // final byte[] content = HTTPConnector.getConnector(MultiProtocolURI.yacybotUserAgent).post(new MultiProtocolURI("http://" + address + "/yacy/hello.html"), 30000, yacySeed.b64Hash2hexHash(otherHash) + ".yacyh", parts); final HTTPClient httpClient = new HTTPClient(ClientIdentification.getUserAgent(), 30000); - final byte[] content = httpClient.POSTbytes(new MultiProtocolURI("http://" + address + "/yacy/hello.html"), Seed.b64Hash2hexHash(otherHash) + ".yacyh", parts, false); - Network.log.logInfo("yacyClient.hello thread '" + Thread.currentThread().getName() + "' contacted peer at " + address + ", received " + ((content == null) ? "null" : content.length) + " bytes, time = " + (System.currentTimeMillis() - start) + " milliseconds"); + final byte[] content = + httpClient.POSTbytes( + new MultiProtocolURI("http://" + address + "/yacy/hello.html"), + Seed.b64Hash2hexHash(otherHash) + ".yacyh", + parts, + false); + Network.log.logInfo("yacyClient.hello thread '" + + Thread.currentThread().getName() + + "' contacted peer at " + + address + + ", received " + + ((content == null) ? 
"null" : content.length) + + " bytes, time = " + + (System.currentTimeMillis() - start) + + " milliseconds"); result = FileUtils.table(content); - } catch (final Exception e) { - if (Thread.currentThread().isInterrupted()) { - Network.log.logInfo("yacyClient.hello thread '" + Thread.currentThread().getName() + "' interrupted."); + } catch ( final Exception e ) { + if ( Thread.currentThread().isInterrupted() ) { + Network.log.logInfo("yacyClient.hello thread '" + + Thread.currentThread().getName() + + "' interrupted."); return -1; } - Network.log.logInfo("yacyClient.hello thread '" + Thread.currentThread().getName() + "', peer " + address + "; exception: " + e.getMessage()); + Network.log.logInfo("yacyClient.hello thread '" + + Thread.currentThread().getName() + + "', peer " + + address + + "; exception: " + + e.getMessage()); // try again (go into loop) result = null; } - if (result == null) { - Network.log.logInfo("yacyClient.hello result error: " + - ((result == null) ? "result null" : ("result=" + result.toString()))); + if ( result == null ) { + Network.log.logInfo("yacyClient.hello result error: " + + ((result == null) ? 
"result null" : ("result=" + result.toString()))); return -1; } // check consistency with expectation Seed otherPeer = null; String seed; - if ((otherHash != null) && - (otherHash.length() > 0) && - ((seed = result.get("seed0")) != null)) { - if (seed.length() > Seed.maxsize) { - Network.log.logInfo("hello/client 0: rejected contacting seed; too large (" + seed.length() + " > " + Seed.maxsize + ")"); + if ( (otherHash != null) && (otherHash.length() > 0) && ((seed = result.get("seed0")) != null) ) { + if ( seed.length() > Seed.maxsize ) { + Network.log.logInfo("hello/client 0: rejected contacting seed; too large (" + + seed.length() + + " > " + + Seed.maxsize + + ")"); } else { - try { - final int p = address.indexOf(':'); - if (p < 0) return -1; - final String host = Domains.dnsResolve(address.substring(0, p)).getHostAddress(); + try { + final int p = address.indexOf(':'); + if ( p < 0 ) { + return -1; + } + final String host = Domains.dnsResolve(address.substring(0, p)).getHostAddress(); otherPeer = Seed.genRemoteSeed(seed, salt, false, host); - if (!otherPeer.hash.equals(otherHash)) { - Network.log.logInfo("yacyClient.hello: consistency error: otherPeer.hash = " + otherPeer.hash + ", otherHash = " + otherHash); + if ( !otherPeer.hash.equals(otherHash) ) { + Network.log.logInfo("yacyClient.hello: consistency error: otherPeer.hash = " + + otherPeer.hash + + ", otherHash = " + + otherHash); return -1; // no success } - } catch (final IOException e) { - Network.log.logInfo("yacyClient.hello: consistency error: other seed bad:" + e.getMessage() + ", seed=" + seed); + } catch ( final IOException e ) { + Network.log.logInfo("yacyClient.hello: consistency error: other seed bad:" + + e.getMessage() + + ", seed=" + + seed); return -1; // no success } } @@ -211,21 +258,25 @@ public final class Protocol { // set my own seed according to new information // we overwrite our own IP number only - if (serverCore.useStaticIP) { + if ( serverCore.useStaticIP ) { 
mySeed.setIP(Switchboard.getSwitchboard().myPublicIP()); } else { final String myIP = result.get("yourip"); final String properIP = Seed.isProperIP(myIP); - if (properIP == null) mySeed.setIP(myIP); + if ( properIP == null ) { + mySeed.setIP(myIP); + } } // change our seed-type String mytype = result.get(Seed.YOURTYPE); - if (mytype == null) { mytype = ""; } + if ( mytype == null ) { + mytype = ""; + } final Accessible accessible = new Accessible(); - if (mytype.equals(Seed.PEERTYPE_SENIOR)||mytype.equals(Seed.PEERTYPE_PRINCIPAL)) { + if ( mytype.equals(Seed.PEERTYPE_SENIOR) || mytype.equals(Seed.PEERTYPE_PRINCIPAL) ) { accessible.IWasAccessed = true; - if (mySeed.isPrincipal()) { + if ( mySeed.isPrincipal() ) { mytype = Seed.PEERTYPE_PRINCIPAL; } } else { @@ -238,21 +289,36 @@ public final class Protocol { * If we were reported as junior we have to check if your port forwarding channel is broken * If this is true we try to reconnect the sch channel to the remote server now. */ - if (mytype.equalsIgnoreCase(Seed.PEERTYPE_JUNIOR)) { - Network.log.logInfo("yacyClient.hello: Peer '" + ((otherPeer==null)?"unknown":otherPeer.getName()) + "' reported us as junior."); - } else if ((mytype.equalsIgnoreCase(Seed.PEERTYPE_SENIOR)) || - (mytype.equalsIgnoreCase(Seed.PEERTYPE_PRINCIPAL))) { - if (Network.log.isFine()) Network.log.logFine("yacyClient.hello: Peer '" + ((otherPeer==null)?"unknown":otherPeer.getName()) + "' reported us as " + mytype + ", accepted other peer."); + if ( mytype.equalsIgnoreCase(Seed.PEERTYPE_JUNIOR) ) { + Network.log.logInfo("yacyClient.hello: Peer '" + + ((otherPeer == null) ? "unknown" : otherPeer.getName()) + + "' reported us as junior."); + } else if ( (mytype.equalsIgnoreCase(Seed.PEERTYPE_SENIOR)) + || (mytype.equalsIgnoreCase(Seed.PEERTYPE_PRINCIPAL)) ) { + if ( Network.log.isFine() ) { + Network.log.logFine("yacyClient.hello: Peer '" + + ((otherPeer == null) ? 
"unknown" : otherPeer.getName()) + + "' reported us as " + + mytype + + ", accepted other peer."); + } } else { // wrong type report - if (Network.log.isFine()) Network.log.logFine("yacyClient.hello: Peer '" + ((otherPeer==null)?"unknown":otherPeer.getName()) + "' reported us as " + mytype + ", rejecting other peer."); + if ( Network.log.isFine() ) { + Network.log.logFine("yacyClient.hello: Peer '" + + ((otherPeer == null) ? "unknown" : otherPeer.getName()) + + "' reported us as " + + mytype + + ", rejecting other peer."); + } return -1; } - if (mySeed.orVirgin().equals(Seed.PEERTYPE_VIRGIN)) + if ( mySeed.orVirgin().equals(Seed.PEERTYPE_VIRGIN) ) { mySeed.put(Seed.PEERTYPE, mytype); + } final String error = mySeed.isProper(true); - if (error != null) { + if ( error != null ) { Network.log.logWarning("yacyClient.hello mySeed error - not proper: " + error); return -1; } @@ -265,31 +331,45 @@ public final class Protocol { String seedStr; Seed s; final int connectedBefore = peerActions.sizeConnected(); - while ((seedStr = result.get("seed" + i++)) != null) { + while ( (seedStr = result.get("seed" + i++)) != null ) { // integrate new seed into own database // the first seed, "seed0" is the seed of the responding peer - if (seedStr.length() > Seed.maxsize) { - Network.log.logInfo("hello/client: rejected contacting seed; too large (" + seedStr.length() + " > " + Seed.maxsize + ")"); + if ( seedStr.length() > Seed.maxsize ) { + Network.log.logInfo("hello/client: rejected contacting seed; too large (" + + seedStr.length() + + " > " + + Seed.maxsize + + ")"); } else { try { - if (i == 1) { + if ( i == 1 ) { final int p = address.indexOf(':'); - if (p < 0) return -1; + if ( p < 0 ) { + return -1; + } final String host = Domains.dnsResolve(address.substring(0, p)).getHostAddress(); s = Seed.genRemoteSeed(seedStr, salt, false, host); } else { s = Seed.genRemoteSeed(seedStr, salt, false, null); } - if (peerActions.peerArrival(s, (i == 1))) count++; - } catch (final IOException 
e) { - Network.log.logInfo("hello/client: rejected contacting seed; bad (" + e.getMessage() + ")"); + if ( peerActions.peerArrival(s, (i == 1)) ) { + count++; + } + } catch ( final IOException e ) { + Network.log.logInfo("hello/client: rejected contacting seed; bad (" + + e.getMessage() + + ")"); } } } final int connectedAfter = peerActions.sizeConnected(); // update event tracker - EventTracker.update(EventTracker.EClass.PEERPING, new ProfilingGraph.EventPing(mySeed.getName(), otherName, true, connectedAfter - connectedBefore), false); + EventTracker.update(EventTracker.EClass.PEERPING, new ProfilingGraph.EventPing( + mySeed.getName(), + otherName, + true, + connectedAfter - connectedBefore), false); return count; } @@ -300,16 +380,19 @@ public final class Protocol { // send request try { - final Map parts = basicRequestParts(Switchboard.getSwitchboard(), target.hash, salt); + final Map parts = + basicRequestParts(Switchboard.getSwitchboard(), target.hash, salt); parts.put("object", UTF8.StringBody("seed")); parts.put("env", UTF8.StringBody(seedHash)); final byte[] content = postToFile(target, "query.html", parts, 10000); final Map result = FileUtils.table(content); - if (result == null || result.isEmpty()) { return null; } + if ( result == null || result.isEmpty() ) { + return null; + } //final Date remoteTime = yacyCore.parseUniversalDate((String) result.get(yacySeed.MYTIME)); // read remote time return Seed.genRemoteSeed(result.get("response"), salt, false, target.getIP()); - } catch (final Exception e) { + } catch ( final Exception e ) { Network.log.logWarning("yacyClient.querySeed error:" + e.getMessage()); return null; } @@ -321,16 +404,19 @@ public final class Protocol { // send request try { - final Map parts = basicRequestParts(Switchboard.getSwitchboard(), target.hash, salt); + final Map parts = + basicRequestParts(Switchboard.getSwitchboard(), target.hash, salt); parts.put("object", UTF8.StringBody("rwicount")); parts.put("ttl", UTF8.StringBody("0")); 
parts.put("env", UTF8.StringBody(wordHash)); final byte[] content = postToFile(target, "query.html", parts, 5000); final Map result = FileUtils.table(content); - if (result == null || result.isEmpty()) { return -1; } + if ( result == null || result.isEmpty() ) { + return -1; + } return Integer.parseInt(result.get("response")); - } catch (final Exception e) { + } catch ( final Exception e ) { Network.log.logWarning("yacyClient.queryRWICount error:" + e.getMessage()); return -1; } @@ -338,45 +424,81 @@ public final class Protocol { /** * check the status of a remote peer + * * @param target * @return an array of two long: [0] is the count of urls, [1] is a magic */ public static long[] queryUrlCount(final Seed target) { - if (target == null) return new long[]{-1, -1}; + if ( target == null ) { + return new long[] { + -1, -1 + }; + } // prepare request final String salt = crypt.randomSalt(); // send request try { - final Map parts = basicRequestParts(Switchboard.getSwitchboard(), target.hash, salt); + final Map parts = + basicRequestParts(Switchboard.getSwitchboard(), target.hash, salt); parts.put("object", UTF8.StringBody("lurlcount")); parts.put("ttl", UTF8.StringBody("0")); parts.put("env", UTF8.StringBody("")); final byte[] content = postToFile(target, "query.html", parts, 5000); final Map result = FileUtils.table(content); - if (result == null || result.isEmpty()) return new long[]{-1, -1}; + if ( result == null || result.isEmpty() ) { + return new long[] { + -1, -1 + }; + } final String resp = result.get("response"); - if (resp == null) return new long[]{-1, -1}; - String magic = result.get("magic"); if (magic == null) magic = "0"; + if ( resp == null ) { + return new long[] { + -1, -1 + }; + } + String magic = result.get("magic"); + if ( magic == null ) { + magic = "0"; + } try { - return new long[]{Long.parseLong(resp), Long.parseLong(magic)}; - } catch (final NumberFormatException e) { - return new long[]{-1, -1}; + return new long[] { + Long.parseLong(resp), 
Long.parseLong(magic) + }; + } catch ( final NumberFormatException e ) { + return new long[] { + -1, -1 + }; } - } catch (final IOException e) { - if (Network.log.isFine()) Network.log.logFine("yacyClient.queryUrlCount error asking peer '" + target.getName() + "':" + e.toString()); - return new long[]{-1, -1}; + } catch ( final IOException e ) { + if ( Network.log.isFine() ) { + Network.log.logFine("yacyClient.queryUrlCount error asking peer '" + + target.getName() + + "':" + + e.toString()); + } + return new long[] { + -1, -1 + }; } } - public static RSSFeed queryRemoteCrawlURLs(final SeedDB seedDB, final Seed target, final int maxCount, final long maxTime) { + public static RSSFeed queryRemoteCrawlURLs( + final SeedDB seedDB, + final Seed target, + final int maxCount, + final long maxTime) { // returns a list of - if (target == null) { return null; } + if ( target == null ) { + return null; + } final int targetCount = Integer.parseInt(target.get(Seed.RCOUNT, "0")); - if (targetCount <= 0) { - Network.log.logWarning("yacyClient.queryRemoteCrawlURLs wrong peer '" + target.getName() + "' selected: not enough links available"); + if ( targetCount <= 0 ) { + Network.log.logWarning("yacyClient.queryRemoteCrawlURLs wrong peer '" + + target.getName() + + "' selected: not enough links available"); return null; } // prepare request @@ -385,25 +507,33 @@ public final class Protocol { // send request try { /* a long time-out is needed */ - final Map parts = basicRequestParts(Switchboard.getSwitchboard(), target.hash, salt); + final Map parts = + basicRequestParts(Switchboard.getSwitchboard(), target.hash, salt); parts.put("call", UTF8.StringBody("remotecrawl")); parts.put("count", UTF8.StringBody(Integer.toString(maxCount))); parts.put("time", UTF8.StringBody(Long.toString(maxTime))); // final byte[] result = HTTPConnector.getConnector(MultiProtocolURI.yacybotUserAgent).post(new MultiProtocolURI("http://" + target.getClusterAddress() + "/yacy/urls.xml"), (int) maxTime, 
target.getHexHash() + ".yacyh", parts); final HTTPClient httpClient = new HTTPClient(ClientIdentification.getUserAgent(), (int) maxTime); - final byte[] result = httpClient.POSTbytes(new MultiProtocolURI("http://" + target.getClusterAddress() + "/yacy/urls.xml"), target.getHexHash() + ".yacyh", parts, false); + final byte[] result = + httpClient.POSTbytes(new MultiProtocolURI("http://" + + target.getClusterAddress() + + "/yacy/urls.xml"), target.getHexHash() + ".yacyh", parts, false); final RSSReader reader = RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, result); - if (reader == null) { - Network.log.logWarning("yacyClient.queryRemoteCrawlURLs failed asking peer '" + target.getName() + "': probably bad response from remote peer (1), reader == null"); + if ( reader == null ) { + Network.log.logWarning("yacyClient.queryRemoteCrawlURLs failed asking peer '" + + target.getName() + + "': probably bad response from remote peer (1), reader == null"); target.put(Seed.RCOUNT, "0"); seedDB.update(target.hash, target); // overwrite number of remote-available number to avoid that this peer is called again (until update is done by peer ping) //Log.logException(e); return null; } final RSSFeed feed = reader.getFeed(); - if (feed == null) { + if ( feed == null ) { // case where the rss reader does not understand the content - Network.log.logWarning("yacyClient.queryRemoteCrawlURLs failed asking peer '" + target.getName() + "': probably bad response from remote peer (2)"); + Network.log.logWarning("yacyClient.queryRemoteCrawlURLs failed asking peer '" + + target.getName() + + "': probably bad response from remote peer (2)"); //System.out.println("***DEBUG*** rss input = " + UTF8.String(result)); target.put(Seed.RCOUNT, "0"); seedDB.update(target.hash, target); // overwrite number of remote-available number to avoid that this peer is called again (until update is done by peer ping) @@ -414,44 +544,63 @@ public final class Protocol { target.put(Seed.RCOUNT, Integer.toString(Math.max(0, 
targetCount - feed.size()))); seedDB.update(target.hash, target); return feed; - } catch (final IOException e) { - Network.log.logWarning("yacyClient.queryRemoteCrawlURLs error asking peer '" + target.getName() + "':" + e.toString()); + } catch ( final IOException e ) { + Network.log.logWarning("yacyClient.queryRemoteCrawlURLs error asking peer '" + + target.getName() + + "':" + + e.toString()); return null; } } - public static RSSFeed search(final Seed targetSeed, final String query, final CacheStrategy verify, final boolean global, final long timeout, final int startRecord, final int maximumRecords) throws IOException { - final String address = (targetSeed == null || targetSeed == Switchboard.getSwitchboard().peers.mySeed()) ? "localhost:" + Switchboard.getSwitchboard().getConfig("port", "8090") : targetSeed.getClusterAddress(); + public static RSSFeed search( + final Seed targetSeed, + final String query, + final CacheStrategy verify, + final boolean global, + final long timeout, + final int startRecord, + final int maximumRecords) throws IOException { + final String address = + (targetSeed == null || targetSeed == Switchboard.getSwitchboard().peers.mySeed()) ? 
"localhost:" + + Switchboard.getSwitchboard().getConfig("port", "8090") : targetSeed.getClusterAddress(); final String urlBase = "http://" + address + "/yacysearch.rss"; - return SRURSSConnector.loadSRURSS(urlBase, query, timeout, startRecord, maximumRecords, verify, global, null); + return SRURSSConnector.loadSRURSS( + urlBase, + query, + timeout, + startRecord, + maximumRecords, + verify, + global, + null); } @SuppressWarnings("unchecked") public static int search( - final Seed mySeed, - final String wordhashes, - final String excludehashes, - final String urlhashes, - final Pattern prefer, - final Pattern filter, - final Pattern snippet, - final String modifier, - final String language, - final String sitehash, - final String authorhash, - final int count, - final long time, - final int maxDistance, - final boolean global, - final int partitions, - final Seed target, - final Segment indexSegment, - final RWIProcess containerCache, - final SearchEvent.SecondarySearchSuperviser secondarySearchSuperviser, - final Blacklist blacklist, - final RankingProfile rankingProfile, - final Bitfield constraint - ) { + final Seed mySeed, + final String wordhashes, + final String excludehashes, + final String urlhashes, + final Pattern prefer, + final Pattern filter, + final Pattern snippet, + final String modifier, + final String language, + final String sitehash, + final String authorhash, + final int count, + final long time, + final int maxDistance, + final boolean global, + final int partitions, + final Seed target, + final Segment indexSegment, + final RWIProcess containerCache, + final SearchEvent.SecondarySearchSuperviser secondarySearchSuperviser, + final Blacklist blacklist, + final RankingProfile rankingProfile, + final Bitfield constraint) { // send a search request to peer with remote Hash // INPUT: @@ -470,13 +619,38 @@ public final class Protocol { final long timestamp = System.currentTimeMillis(); SearchResult result; try { - result = new SearchResult( - 
basicRequestParts(Switchboard.getSwitchboard(), target.hash, crypt.randomSalt()), - mySeed, wordhashes, excludehashes, urlhashes, prefer, filter, snippet, modifier, language, - sitehash, authorhash, count, time, maxDistance, global, partitions, target.getHexHash() + ".yacyh", target.getClusterAddress(), - secondarySearchSuperviser, rankingProfile, constraint); - } catch (final IOException e) { - Network.log.logInfo("SEARCH failed, Peer: " + target.hash + ":" + target.getName() + " (" + e.getMessage() + ")"); + result = + new SearchResult( + basicRequestParts(Switchboard.getSwitchboard(), target.hash, crypt.randomSalt()), + mySeed, + wordhashes, + excludehashes, + urlhashes, + prefer, + filter, + snippet, + modifier, + language, + sitehash, + authorhash, + count, + time, + maxDistance, + global, + partitions, + target.getHexHash() + ".yacyh", + target.getClusterAddress(), + secondarySearchSuperviser, + rankingProfile, + constraint); + } catch ( final IOException e ) { + Network.log.logInfo("SEARCH failed, Peer: " + + target.hash + + ":" + + target.getName() + + " (" + + e.getMessage() + + ")"); //yacyCore.peerActions.peerDeparture(target, "search request to peer created io exception: " + e.getMessage()); return -1; } @@ -490,10 +664,15 @@ public final class Protocol { final int words = wordhashes.length() / Word.commonHashLength; assert words > 0 : "wordhashes = " + wordhashes; final ReferenceContainer[] container = new ReferenceContainer[words]; - for (int i = 0; i < words; i++) { + for ( int i = 0; i < words; i++ ) { try { - container[i] = ReferenceContainer.emptyContainer(Segment.wordReferenceFactory, ASCII.getBytes(wordhashes.substring(i * Word.commonHashLength, (i + 1) * Word.commonHashLength)), count); - } catch (final RowSpaceExceededException e) { + container[i] = + ReferenceContainer.emptyContainer( + Segment.wordReferenceFactory, + ASCII.getBytes(wordhashes.substring(i * Word.commonHashLength, (i + 1) + * Word.commonHashLength)), + count); + } catch ( 
final RowSpaceExceededException e ) { Log.logException(e); return -1; } @@ -501,48 +680,87 @@ public final class Protocol { // insert results to containers int term = count; - for (final URIMetadataRow urlEntry: result.links) { - if (term-- <= 0) break; // do not process more that requested (in case that evil peers fill us up with rubbish) + for ( final URIMetadataRow urlEntry : result.links ) { + if ( term-- <= 0 ) { + break; // do not process more that requested (in case that evil peers fill us up with rubbish) + } // get one single search result - if (urlEntry == null) continue; + if ( urlEntry == null ) { + continue; + } assert (urlEntry.hash().length == 12) : "urlEntry.hash() = " + ASCII.String(urlEntry.hash()); - if (urlEntry.hash().length != 12) continue; // bad url hash + if ( urlEntry.hash().length != 12 ) { + continue; // bad url hash + } final URIMetadataRow.Components metadata = urlEntry.metadata(); - if (metadata == null) continue; - if (blacklist.isListed(Blacklist.BLACKLIST_SEARCH, metadata.url())) { - if (Network.log.isInfo()) Network.log.logInfo("remote search: filtered blacklisted url " + metadata.url() + " from peer " + target.getName()); + if ( metadata == null ) { + continue; + } + if ( blacklist.isListed(Blacklist.BLACKLIST_SEARCH, metadata.url()) ) { + if ( Network.log.isInfo() ) { + Network.log.logInfo("remote search: filtered blacklisted url " + + metadata.url() + + " from peer " + + target.getName()); + } continue; // block with backlist } - final String urlRejectReason = Switchboard.getSwitchboard().crawlStacker.urlInAcceptedDomain(metadata.url()); - if (urlRejectReason != null) { - if (Network.log.isInfo()) Network.log.logInfo("remote search: rejected url '" + metadata.url() + "' (" + urlRejectReason + ") from peer " + target.getName()); + final String urlRejectReason = + Switchboard.getSwitchboard().crawlStacker.urlInAcceptedDomain(metadata.url()); + if ( urlRejectReason != null ) { + if ( Network.log.isInfo() ) { + 
Network.log.logInfo("remote search: rejected url '" + + metadata.url() + + "' (" + + urlRejectReason + + ") from peer " + + target.getName()); + } continue; // reject url outside of our domain } // save the url entry final Reference entry = urlEntry.word(); - if (entry == null) { - if (Network.log.isWarning()) Network.log.logWarning("remote search: no word attached from peer " + target.getName() + ", version " + target.getVersion()); + if ( entry == null ) { + if ( Network.log.isWarning() ) { + Network.log.logWarning("remote search: no word attached from peer " + + target.getName() + + ", version " + + target.getVersion()); + } continue; // no word attached } // the search-result-url transports all the attributes of word indexes - if (!Base64Order.enhancedCoder.equal(entry.urlhash(), urlEntry.hash())) { - Network.log.logInfo("remote search: url-hash " + ASCII.String(urlEntry.hash()) + " does not belong to word-attached-hash " + ASCII.String(entry.urlhash()) + "; url = " + metadata.url() + " from peer " + target.getName()); + if ( !Base64Order.enhancedCoder.equal(entry.urlhash(), urlEntry.hash()) ) { + Network.log.logInfo("remote search: url-hash " + + ASCII.String(urlEntry.hash()) + + " does not belong to word-attached-hash " + + ASCII.String(entry.urlhash()) + + "; url = " + + metadata.url() + + " from peer " + + target.getName()); continue; // spammed } // passed all checks, store url try { indexSegment.urlMetadata().store(urlEntry); - ResultURLs.stack(urlEntry, mySeed.hash.getBytes(), UTF8.getBytes(target.hash), EventOrigin.QUERIES); - } catch (final IOException e) { + ResultURLs.stack( + urlEntry, + mySeed.hash.getBytes(), + UTF8.getBytes(target.hash), + EventOrigin.QUERIES); + } catch ( final IOException e ) { Network.log.logWarning("could not store search result", e); continue; // db-error } - if (urlEntry.snippet() != null && urlEntry.snippet().length() > 0 && !urlEntry.snippet().equals("null")) { + if ( urlEntry.snippet() != null + && 
urlEntry.snippet().length() > 0 + && !urlEntry.snippet().equals("null") ) { // we don't store the snippets along the url entry, // because they are search-specific. // instead, they are placed in a snipped-search cache. @@ -551,10 +769,10 @@ public final class Protocol { } // add the url entry to the word indexes - for (int m = 0; m < words; m++) { + for ( int m = 0; m < words; m++ ) { try { container[m].add(entry); - } catch (final RowSpaceExceededException e) { + } catch ( final RowSpaceExceededException e ) { Log.logException(e); break; } @@ -565,62 +783,92 @@ public final class Protocol { // insert one container into the search result buffer // one is enough, only the references are used, not the word containerCache.add(container[0], false, target.getName() + "/" + target.hash, result.joincount, true); + containerCache.decExpectedRemoteReferences(count - container[0].size()); // insert the containers to the index - for (final ReferenceContainer c: container) try { - indexSegment.termIndex().add(c); - } catch (final Exception e) { - Log.logException(e); + for ( final ReferenceContainer c : container ) { + try { + indexSegment.termIndex().add(c); + } catch ( final Exception e ) { + Log.logException(e); + } } - Network.log.logInfo("remote search: peer " + target.getName() + " sent " + container[0].size() + "/" + result.joincount + " references for " + (thisIsASecondarySearch ? "a secondary search" : "joined word queries")); + Network.log.logInfo("remote search: peer " + + target.getName() + + " sent " + + container[0].size() + + "/" + + result.joincount + + " references for " + + (thisIsASecondarySearch ? 
"a secondary search" : "joined word queries")); // integrate remote top-words/topics - if (result.references != null && result.references.length > 0) { - Network.log.logInfo("remote search: peer " + target.getName() + " sent " + result.references.length + " topics"); + if ( result.references != null && result.references.length > 0 ) { + Network.log.logInfo("remote search: peer " + + target.getName() + + " sent " + + result.references.length + + " topics"); // add references twice, so they can be counted (must have at least 2 entries) - synchronized (containerCache) { + synchronized ( containerCache ) { containerCache.addTopic(result.references); containerCache.addTopic(result.references); } } // read index abstract - if (secondarySearchSuperviser != null) { + if ( secondarySearchSuperviser != null ) { String wordhash; String whacc = ""; ByteBuffer ci; int ac = 0; - for (final Map.Entry abstractEntry: result.indexabstract.entrySet()) { + for ( final Map.Entry abstractEntry : result.indexabstract.entrySet() ) { try { ci = new ByteBuffer(abstractEntry.getValue()); wordhash = ASCII.String(abstractEntry.getKey()); - } catch (final OutOfMemoryError e) { + } catch ( final OutOfMemoryError e ) { Log.logException(e); continue; } whacc += wordhash; - secondarySearchSuperviser.addAbstract(wordhash, WordReferenceFactory.decompressIndex(ci, target.hash)); + secondarySearchSuperviser.addAbstract( + wordhash, + WordReferenceFactory.decompressIndex(ci, target.hash)); ac++; } - if (ac > 0) { + if ( ac > 0 ) { secondarySearchSuperviser.commitAbstract(); - Network.log.logInfo("remote search: peer " + target.getName() + " sent " + ac + " index abstracts for words "+ whacc); + Network.log.logInfo("remote search: peer " + + target.getName() + + " sent " + + ac + + " index abstracts for words " + + whacc); } } // generate statistics - if (Network.log.isFine()) Network.log.logFine( - "SEARCH " + result.urlcount + - " URLS FROM " + target.hash + ":" + target.getName() + - ", searchtime=" + 
result.searchtime + - ", netdelay=" + (totalrequesttime - result.searchtime) + - ", references=" + result.references); + if ( Network.log.isFine() ) { + Network.log.logFine("SEARCH " + + result.urlcount + + " URLS FROM " + + target.hash + + ":" + + target.getName() + + ", searchtime=" + + result.searchtime + + ", netdelay=" + + (totalrequesttime - result.searchtime) + + ", references=" + + result.references); + } return result.urlcount; } - public static class SearchResult { + public static class SearchResult + { public String version; // version : application version of responder public String uptime; // uptime : uptime in seconds of responder @@ -636,28 +884,28 @@ public final class Protocol { public Map indexabstract; // index abstracts, a collection of url-hashes per word public SearchResult( - final Map parts, - final Seed mySeed, - final String wordhashes, - final String excludehashes, - final String urlhashes, - final Pattern prefer, - final Pattern filter, - final Pattern snippet, - final String modifier, - final String language, - final String sitehash, - final String authorhash, - final int count, - final long time, - final int maxDistance, - final boolean global, - final int partitions, - final String hostname, - final String hostaddress, - final SearchEvent.SecondarySearchSuperviser secondarySearchSuperviser, - final RankingProfile rankingProfile, - final Bitfield constraint) throws IOException { + final Map parts, + final Seed mySeed, + final String wordhashes, + final String excludehashes, + final String urlhashes, + final Pattern prefer, + final Pattern filter, + final Pattern snippet, + final String modifier, + final String language, + final String sitehash, + final String authorhash, + final int count, + final long time, + final int maxDistance, + final boolean global, + final int partitions, + final String hostname, + final String hostaddress, + final SearchEvent.SecondarySearchSuperviser secondarySearchSuperviser, + final RankingProfile 
rankingProfile, + final Bitfield constraint) throws IOException { // send a search request to peer with remote Hash //if (hostaddress.equals(mySeed.getClusterAddress())) hostaddress = "127.0.0.1:" + mySeed.getPort(); // for debugging @@ -679,7 +927,7 @@ public final class Protocol { Map resultMap = null; String key = ""; final ContentBody keyBody = parts.get("key"); - if (keyBody != null) { + if ( keyBody != null ) { final ByteArrayOutputStream baos = new ByteArrayOutputStream(20); keyBody.writeTo(baos); key = baos.toString(); @@ -704,28 +952,38 @@ public final class Protocol { parts.put("maxdist", UTF8.StringBody(Integer.toString(maxDistance))); parts.put("profile", UTF8.StringBody(crypt.simpleEncode(rankingProfile.toExternalString()))); parts.put("constraint", UTF8.StringBody((constraint == null) ? "" : constraint.exportB64())); - if (secondarySearchSuperviser != null) parts.put("abstracts", UTF8.StringBody("auto")); - // resultMap = FileUtils.table(HTTPConnector.getConnector(MultiProtocolURI.yacybotUserAgent).post(new MultiProtocolURI("http://" + hostaddress + "/yacy/search.html"), 60000, hostname, parts)); - //resultMap = FileUtils.table(HTTPConnector.getConnector(MultiProtocolURI.crawlerUserAgent).post(new MultiProtocolURI("http://" + target.getClusterAddress() + "/yacy/search.html"), 60000, target.getHexHash() + ".yacyh", parts)); + if ( secondarySearchSuperviser != null ) { + parts.put("abstracts", UTF8.StringBody("auto")); + // resultMap = FileUtils.table(HTTPConnector.getConnector(MultiProtocolURI.yacybotUserAgent).post(new MultiProtocolURI("http://" + hostaddress + "/yacy/search.html"), 60000, hostname, parts)); + //resultMap = FileUtils.table(HTTPConnector.getConnector(MultiProtocolURI.crawlerUserAgent).post(new MultiProtocolURI("http://" + target.getClusterAddress() + "/yacy/search.html"), 60000, target.getHexHash() + ".yacyh", parts)); + } final HTTPClient httpClient = new HTTPClient(ClientIdentification.getUserAgent(), 8000); - resultMap = 
FileUtils.table(httpClient.POSTbytes(new MultiProtocolURI("http://" + hostaddress + "/yacy/search.html"), hostname, parts, false)); + resultMap = + FileUtils.table(httpClient.POSTbytes(new MultiProtocolURI("http://" + + hostaddress + + "/yacy/search.html"), hostname, parts, false)); // evaluate request result - if (resultMap == null || resultMap.isEmpty()) throw new IOException("resultMap is NULL"); + if ( resultMap == null || resultMap.isEmpty() ) { + throw new IOException("resultMap is NULL"); + } try { this.searchtime = Integer.parseInt(resultMap.get("searchtime")); - } catch (final NumberFormatException e) { - throw new IOException("wrong output format for searchtime: " + e.getMessage() + ", map = " + resultMap.toString()); + } catch ( final NumberFormatException e ) { + throw new IOException("wrong output format for searchtime: " + + e.getMessage() + + ", map = " + + resultMap.toString()); } try { this.joincount = Integer.parseInt(resultMap.get("joincount")); // the complete number of hits at remote site - } catch (final NumberFormatException e) { + } catch ( final NumberFormatException e ) { throw new IOException("wrong output format for joincount: " + e.getMessage()); } try { - this.urlcount = Integer.parseInt(resultMap.get("count")); // the number of hits that are returned in the result list - } catch (final NumberFormatException e) { + this.urlcount = Integer.parseInt(resultMap.get("count")); // the number of hits that are returned in the result list + } catch ( final NumberFormatException e ) { throw new IOException("wrong output format for count: " + e.getMessage()); } this.fwhop = resultMap.get("fwhop"); @@ -734,22 +992,28 @@ public final class Protocol { // scan the result map for entries with special prefix this.indexcount = new TreeMap(Base64Order.enhancedCoder); this.indexabstract = new TreeMap(Base64Order.enhancedCoder); - for (final Map.Entry entry: resultMap.entrySet()) { - if (entry.getKey().startsWith("indexcount.")) { - 
this.indexcount.put(UTF8.getBytes(entry.getKey().substring(11)), Integer.parseInt(entry.getValue())); + for ( final Map.Entry entry : resultMap.entrySet() ) { + if ( entry.getKey().startsWith("indexcount.") ) { + this.indexcount.put( + UTF8.getBytes(entry.getKey().substring(11)), + Integer.parseInt(entry.getValue())); } - if (entry.getKey().startsWith("indexabstract.")) { + if ( entry.getKey().startsWith("indexabstract.") ) { this.indexabstract.put(UTF8.getBytes(entry.getKey().substring(14)), entry.getValue()); } } this.references = resultMap.get("references").split(","); this.links = new ArrayList(this.urlcount); - for (int n = 0; n < this.urlcount; n++) { + for ( int n = 0; n < this.urlcount; n++ ) { // get one single search result final String resultLine = resultMap.get("resource" + n); - if (resultLine == null) continue; + if ( resultLine == null ) { + continue; + } final URIMetadataRow urlEntry = URIMetadataRow.importEntry(resultLine); - if (urlEntry == null) continue; + if ( urlEntry == null ) { + continue; + } this.links.add(urlEntry); } } @@ -764,19 +1028,24 @@ public final class Protocol { // send request try { - final Map parts = basicRequestParts(Switchboard.getSwitchboard(), targetHash, salt); + final Map parts = + basicRequestParts(Switchboard.getSwitchboard(), targetHash, salt); parts.put("process", UTF8.StringBody("permission")); final byte[] content = postToFile(seedDB, targetHash, "message.html", parts, 5000); final Map result = FileUtils.table(content); return result; - } catch (final Exception e) { + } catch ( final Exception e ) { // most probably a network time-out exception Network.log.logWarning("yacyClient.permissionMessage error:" + e.getMessage()); return null; } } - public static Map postMessage(final SeedDB seedDB, final String targetHash, final String subject, final byte[] message) { + public static Map postMessage( + final SeedDB seedDB, + final String targetHash, + final String subject, + final byte[] message) { // this post a message 
to the remote message board // prepare request @@ -784,7 +1053,8 @@ public final class Protocol { // send request try { - final Map parts = basicRequestParts(Switchboard.getSwitchboard(), targetHash, salt); + final Map parts = + basicRequestParts(Switchboard.getSwitchboard(), targetHash, salt); parts.put("process", UTF8.StringBody("post")); parts.put("myseed", UTF8.StringBody(seedDB.mySeed().genSeedStr(salt))); parts.put("subject", UTF8.StringBody(subject)); @@ -792,13 +1062,20 @@ public final class Protocol { final byte[] content = postToFile(seedDB, targetHash, "message.html", parts, 20000); final Map result = FileUtils.table(content); return result; - } catch (final Exception e) { + } catch ( final Exception e ) { Network.log.logWarning("yacyClient.postMessage error:" + e.getMessage()); return null; } } - public static Map crawlReceipt(final Seed mySeed, final Seed target, final String process, final String result, final String reason, final URIMetadataRow entry, final String wordhashes) { + public static Map crawlReceipt( + final Seed mySeed, + final Seed target, + final String process, + final String result, + final String reason, + final URIMetadataRow entry, + final String wordhashes) { assert (target != null); assert (mySeed != null); assert (mySeed != target); @@ -828,24 +1105,34 @@ public final class Protocol { // determining target address final String address = target.getClusterAddress(); - if (address == null) { return null; } + if ( address == null ) { + return null; + } // send request try { // prepare request - final Map parts = basicRequestParts(Switchboard.getSwitchboard(), target.hash, salt); + final Map parts = + basicRequestParts(Switchboard.getSwitchboard(), target.hash, salt); parts.put("process", UTF8.StringBody(process)); parts.put("urlhash", UTF8.StringBody(((entry == null) ? 
"" : ASCII.String(entry.hash())))); parts.put("result", UTF8.StringBody(result)); parts.put("reason", UTF8.StringBody(reason)); parts.put("wordh", UTF8.StringBody(wordhashes)); - parts.put("lurlEntry", UTF8.StringBody(((entry == null) ? "" : crypt.simpleEncode(entry.toString(), salt)))); + parts.put( + "lurlEntry", + UTF8.StringBody(((entry == null) ? "" : crypt.simpleEncode(entry.toString(), salt)))); // send request // final byte[] content = HTTPConnector.getConnector(MultiProtocolURI.yacybotUserAgent).post(new MultiProtocolURI("http://" + address + "/yacy/crawlReceipt.html"), 10000, target.getHexHash() + ".yacyh", parts); final HTTPClient httpClient = new HTTPClient(ClientIdentification.getUserAgent(), 10000); - final byte[] content = httpClient.POSTbytes(new MultiProtocolURI("http://" + address + "/yacy/crawlReceipt.html"), target.getHexHash() + ".yacyh", parts, false); + final byte[] content = + httpClient.POSTbytes( + new MultiProtocolURI("http://" + address + "/yacy/crawlReceipt.html"), + target.getHexHash() + ".yacyh", + parts, + false); return FileUtils.table(content); - } catch (final Exception e) { + } catch ( final Exception e ) { // most probably a network time-out exception Network.log.logWarning("yacyClient.crawlReceipt error:" + e.getMessage()); return null; @@ -853,8 +1140,9 @@ public final class Protocol { } /** - * transfer the index. If the transmission fails, return a string describing the cause. - * If everything is ok, return null. + * transfer the index. If the transmission fails, return a string describing the cause. If everything is + * ok, return null. 
+ * * @param targetSeed * @param indexes * @param urlCache @@ -863,21 +1151,25 @@ public final class Protocol { * @return */ public static String transferIndex( - final Seed targetSeed, - final ReferenceContainerCache indexes, - final SortedMap urlCache, - final boolean gzipBody, - final int timeout) { + final Seed targetSeed, + final ReferenceContainerCache indexes, + final SortedMap urlCache, + final boolean gzipBody, + final int timeout) { // check if we got all necessary urls in the urlCache (only for debugging) Iterator eenum; Reference entry; - for (final ReferenceContainer ic: indexes) { + for ( final ReferenceContainer ic : indexes ) { eenum = ic.entries(); - while (eenum.hasNext()) { + while ( eenum.hasNext() ) { entry = eenum.next(); - if (urlCache.get(entry.urlhash()) == null) { - if (Network.log.isFine()) Network.log.logFine("DEBUG transferIndex: to-send url hash '" + ASCII.String(entry.urlhash()) + "' is not contained in urlCache"); + if ( urlCache.get(entry.urlhash()) == null ) { + if ( Network.log.isFine() ) { + Network.log.logFine("DEBUG transferIndex: to-send url hash '" + + ASCII.String(entry.urlhash()) + + "' is not contained in urlCache"); + } } } } @@ -885,73 +1177,96 @@ public final class Protocol { // transfer the RWI without the URLs Map in = transferRWI(targetSeed, indexes, gzipBody, timeout); - if (in == null) { + if ( in == null ) { return "no connection from transferRWI"; } String result = in.get("result"); - if (result == null) { + if ( result == null ) { return "no result from transferRWI"; } - if (!(result.equals("ok"))) { + if ( !(result.equals("ok")) ) { return result; } // in now contains a list of unknown hashes String uhss = in.get("unknownURL"); - if (uhss == null) { + if ( uhss == null ) { return "no unknownURL tag in response"; } - EventChannel.channels(EventChannel.DHTSEND).addMessage(new RSSMessage("Sent " + indexes.size() + " RWIs to " + targetSeed.getName(), "", targetSeed.hash)); + EventChannel + 
.channels(EventChannel.DHTSEND) + .addMessage( + new RSSMessage( + "Sent " + indexes.size() + " RWIs to " + targetSeed.getName(), + "", + targetSeed.hash)); uhss = uhss.trim(); - if (uhss.length() == 0 || uhss.equals(",")) { return null; } // all url's known, we are ready here + if ( uhss.length() == 0 || uhss.equals(",") ) { + return null; + } // all url's known, we are ready here final String[] uhs = uhss.split(","); - if (uhs.length == 0) { return null; } // all url's known + if ( uhs.length == 0 ) { + return null; + } // all url's known // extract the urlCache from the result final URIMetadataRow[] urls = new URIMetadataRow[uhs.length]; - for (int i = 0; i < uhs.length; i++) { + for ( int i = 0; i < uhs.length; i++ ) { urls[i] = urlCache.get(ASCII.getBytes(uhs[i])); - if (urls[i] == null) { - if (Network.log.isFine()) Network.log.logFine("DEBUG transferIndex: requested url hash '" + uhs[i] + "', unknownURL='" + uhss + "'"); + if ( urls[i] == null ) { + if ( Network.log.isFine() ) { + Network.log.logFine("DEBUG transferIndex: requested url hash '" + + uhs[i] + + "', unknownURL='" + + uhss + + "'"); + } } } in = transferURL(targetSeed, urls, gzipBody, timeout); - if (in == null) { + if ( in == null ) { return "no connection from transferURL"; } result = in.get("result"); - if (result == null) { + if ( result == null ) { return "no result from transferURL"; } - if (!result.equals("ok")) { + if ( !result.equals("ok") ) { return result; } - EventChannel.channels(EventChannel.DHTSEND).addMessage(new RSSMessage("Sent " + uhs.length + " URLs to peer " + targetSeed.getName(), "", targetSeed.hash)); + EventChannel.channels(EventChannel.DHTSEND).addMessage( + new RSSMessage( + "Sent " + uhs.length + " URLs to peer " + targetSeed.getName(), + "", + targetSeed.hash)); return null; } private static Map transferRWI( - final Seed targetSeed, - final ReferenceContainerCache indexes, - boolean gzipBody, - final int timeout) { + final Seed targetSeed, + final 
ReferenceContainerCache indexes, + boolean gzipBody, + final int timeout) { final String address = targetSeed.getPublicAddress(); - if (address == null) { Network.log.logWarning("no address for transferRWI"); return null; } + if ( address == null ) { + Network.log.logWarning("no address for transferRWI"); + return null; + } // prepare post values final String salt = crypt.randomSalt(); // enabling gzip compression for post request body - if (gzipBody && (targetSeed.getVersion() < yacyVersion.YACY_SUPPORTS_GZIP_POST_REQUESTS_CHUNKED)) { + if ( gzipBody && (targetSeed.getVersion() < yacyVersion.YACY_SUPPORTS_GZIP_POST_REQUESTS_CHUNKED) ) { gzipBody = false; } @@ -959,18 +1274,19 @@ public final class Protocol { final StringBuilder entrypost = new StringBuilder(indexes.size() * 73); Iterator eenum; Reference entry; - for (final ReferenceContainer ic: indexes) { + for ( final ReferenceContainer ic : indexes ) { eenum = ic.entries(); - while (eenum.hasNext()) { + while ( eenum.hasNext() ) { entry = eenum.next(); - entrypost.append(ASCII.String(ic.getTermHash())) - .append(entry.toPropertyForm()) - .append(serverCore.CRLF_STRING); + entrypost + .append(ASCII.String(ic.getTermHash())) + .append(entry.toPropertyForm()) + .append(serverCore.CRLF_STRING); indexcount++; } } - if (indexcount == 0) { + if ( indexcount == 0 ) { // nothing to do but everything ok final Map result = new HashMap(2); result.put("result", "ok"); @@ -978,13 +1294,19 @@ public final class Protocol { return result; } try { - final Map parts = basicRequestParts(Switchboard.getSwitchboard(), targetSeed.hash, salt); + final Map parts = + basicRequestParts(Switchboard.getSwitchboard(), targetSeed.hash, salt); parts.put("wordc", UTF8.StringBody(Integer.toString(indexes.size()))); parts.put("entryc", UTF8.StringBody(Integer.toString(indexcount))); parts.put("indexes", UTF8.StringBody(entrypost.toString())); // final byte[] content = HTTPConnector.getConnector(MultiProtocolURI.yacybotUserAgent).post(new 
MultiProtocolURI("http://" + address + "/yacy/transferRWI.html"), timeout, targetSeed.getHexHash() + ".yacyh", parts, gzipBody); final HTTPClient httpClient = new HTTPClient(ClientIdentification.getUserAgent(), timeout); - final byte[] content = httpClient.POSTbytes(new MultiProtocolURI("http://" + address + "/yacy/transferRWI.html"), targetSeed.getHexHash() + ".yacyh", parts, gzipBody); + final byte[] content = + httpClient.POSTbytes( + new MultiProtocolURI("http://" + address + "/yacy/transferRWI.html"), + targetSeed.getHexHash() + ".yacyh", + parts, + gzipBody); final Iterator v = FileUtils.strings(content); // this should return a list of urlhashes that are unknown @@ -992,34 +1314,41 @@ public final class Protocol { // return the transfered index data in bytes (for debugging only) result.put("indexPayloadSize", Integer.toString(entrypost.length())); return result; - } catch (final Exception e) { + } catch ( final Exception e ) { Network.log.logInfo("yacyClient.transferRWI to " + address + " error: " + e.getMessage()); return null; } } - private static Map transferURL(final Seed targetSeed, final URIMetadataRow[] urls, boolean gzipBody, final int timeout) { + private static Map transferURL( + final Seed targetSeed, + final URIMetadataRow[] urls, + boolean gzipBody, + final int timeout) { // this post a message to the remote message board final String address = targetSeed.getPublicAddress(); - if (address == null) { return null; } + if ( address == null ) { + return null; + } // prepare post values final String salt = crypt.randomSalt(); - final Map parts = basicRequestParts(Switchboard.getSwitchboard(), targetSeed.hash, salt); + final Map parts = + basicRequestParts(Switchboard.getSwitchboard(), targetSeed.hash, salt); // enabling gzip compression for post request body - if (gzipBody && (targetSeed.getVersion() < yacyVersion.YACY_SUPPORTS_GZIP_POST_REQUESTS_CHUNKED)) { + if ( gzipBody && (targetSeed.getVersion() < 
yacyVersion.YACY_SUPPORTS_GZIP_POST_REQUESTS_CHUNKED) ) { gzipBody = false; } String resource; int urlc = 0; int urlPayloadSize = 0; - for (final URIMetadataRow url : urls) { - if (url != null) { + for ( final URIMetadataRow url : urls ) { + if ( url != null ) { resource = url.toString(); //System.out.println("*** DEBUG resource = " + resource); - if (resource != null && resource.indexOf(0) == -1) { + if ( resource != null && resource.indexOf(0) == -1 ) { parts.put("url" + urlc, UTF8.StringBody(resource)); urlPayloadSize += resource.length(); urlc++; @@ -1030,14 +1359,19 @@ public final class Protocol { parts.put("urlc", UTF8.StringBody(Integer.toString(urlc))); // final byte[] content = HTTPConnector.getConnector(MultiProtocolURI.yacybotUserAgent).post(new MultiProtocolURI("http://" + address + "/yacy/transferURL.html"), timeout, targetSeed.getHexHash() + ".yacyh", parts, gzipBody); final HTTPClient httpClient = new HTTPClient(ClientIdentification.getUserAgent(), timeout); - final byte[] content = httpClient.POSTbytes(new MultiProtocolURI("http://" + address + "/yacy/transferURL.html"), targetSeed.getHexHash() + ".yacyh", parts, gzipBody); + final byte[] content = + httpClient.POSTbytes( + new MultiProtocolURI("http://" + address + "/yacy/transferURL.html"), + targetSeed.getHexHash() + ".yacyh", + parts, + gzipBody); final Iterator v = FileUtils.strings(content); final Map result = FileUtils.table(v); // return the transfered url data in bytes (for debugging only) result.put("urlPayloadSize", Integer.toString(urlPayloadSize)); return result; - } catch (final Exception e) { + } catch ( final Exception e ) { Network.log.logWarning("yacyClient.transferURL to " + address + " error: " + e.getMessage()); return null; } @@ -1050,23 +1384,35 @@ public final class Protocol { final String salt = crypt.randomSalt(); String address = targetSeed.getClusterAddress(); - if (address == null) { address = "localhost:8090"; } + if ( address == null ) { + address = "localhost:8090"; 
+ } try { - final Map parts = basicRequestParts(Switchboard.getSwitchboard(), targetSeed.hash, salt); + final Map parts = + basicRequestParts(Switchboard.getSwitchboard(), targetSeed.hash, salt); // final byte[] content = HTTPConnector.getConnector(MultiProtocolURI.yacybotUserAgent).post(new MultiProtocolURI("http://" + address + "/yacy/profile.html"), 5000, targetSeed.getHexHash() + ".yacyh", parts); final HTTPClient httpclient = new HTTPClient(ClientIdentification.getUserAgent(), 5000); - final byte[] content = httpclient.POSTbytes(new MultiProtocolURI("http://" + address + "/yacy/profile.html"), targetSeed.getHexHash() + ".yacyh", parts, false); + final byte[] content = + httpclient.POSTbytes( + new MultiProtocolURI("http://" + address + "/yacy/profile.html"), + targetSeed.getHexHash() + ".yacyh", + parts, + false); return FileUtils.table(content); - } catch (final Exception e) { + } catch ( final Exception e ) { Network.log.logWarning("yacyClient.getProfile error:" + e.getMessage()); return null; } } public static ReferenceContainerCache loadIDXHosts(final Seed target) { - final ReferenceContainerCache index = new ReferenceContainerCache(WebStructureGraph.hostReferenceFactory, Base64Order.enhancedCoder, 6); + final ReferenceContainerCache index = + new ReferenceContainerCache( + WebStructureGraph.hostReferenceFactory, + Base64Order.enhancedCoder, + 6); // check if the host supports this protocol - if (target.getRevision() < migration.IDX_HOST) { + if ( target.getRevision() < migration.IDX_HOST ) { // if the protocol is not supported then we just return an empty host reference container return index; } @@ -1076,14 +1422,16 @@ public final class Protocol { // send request try { - final Map parts = basicRequestParts(Switchboard.getSwitchboard(), target.hash, salt); + final Map parts = + basicRequestParts(Switchboard.getSwitchboard(), target.hash, salt); parts.put("object", UTF8.StringBody("host")); final byte[] content = postToFile(target, "idx.json", parts, 
30000); - if (content == null || content.length == 0) { + if ( content == null || content.length == 0 ) { Network.log.logWarning("yacyClient.loadIDXHosts error: empty result"); return null; } - final JSONObject json = new JSONObject(new JSONTokener(new InputStreamReader(new ByteArrayInputStream(content)))); + final JSONObject json = + new JSONObject(new JSONTokener(new InputStreamReader(new ByteArrayInputStream(content)))); /* the json has the following form: { "version":"#[version]#", @@ -1100,144 +1448,155 @@ public final class Protocol { // iterate over all references final Iterator termIterator = idx.keys(); String term; - while (termIterator.hasNext()) { + while ( termIterator.hasNext() ) { term = termIterator.next(); final JSONArray references = idx.getJSONArray(term); // iterate until we get an exception or null int c = 0; String reference; - final ReferenceContainer referenceContainer = new ReferenceContainer(WebStructureGraph.hostReferenceFactory, UTF8.getBytes(term)); + final ReferenceContainer referenceContainer = + new ReferenceContainer( + WebStructureGraph.hostReferenceFactory, + UTF8.getBytes(term)); try { - while ((reference = references.getString(c++)) != null) { + while ( (reference = references.getString(c++)) != null ) { //System.out.println("REFERENCE: " + reference); referenceContainer.add(new HostReference(reference)); } - } catch (final JSONException e) {} // this finishes the iteration + } catch ( final JSONException e ) { + } // this finishes the iteration index.add(referenceContainer); } return index; - } catch (final Exception e) { + } catch ( final Exception e ) { Network.log.logWarning("yacyClient.loadIDXHosts error:" + e.getMessage()); return index; } } - - public static void main(final String[] args) { - if (args.length > 2) { + if ( args.length > 2 ) { // search a remote peer. 
arguments: // first arg: path to application home // second arg: address of target peer // third arg: search word or file name with list of search words // i.e. /Data/workspace1/yacy/ localhost:8090 /Data/workspace1/yacy/test/words/searchtest.words System.out.println("yacyClient Test"); - final File searchwordfile = new File(args[2]); - final List searchlines = new ArrayList(); - if (searchwordfile.exists()) { - Iterator i; - try { - i = FileUtils.strings(FileUtils.read(searchwordfile)); - while (i.hasNext()) searchlines.add(i.next()); - } catch (final IOException e) { - e.printStackTrace(); - System.exit(-1); + final File searchwordfile = new File(args[2]); + final List searchlines = new ArrayList(); + if ( searchwordfile.exists() ) { + Iterator i; + try { + i = FileUtils.strings(FileUtils.read(searchwordfile)); + while ( i.hasNext() ) { + searchlines.add(i.next()); } - } else { - searchlines.add(args[2]); + } catch ( final IOException e ) { + e.printStackTrace(); + System.exit(-1); } - for (final String line: searchlines) { - final byte[] wordhashe = ASCII.getBytes(QueryParams.hashSet2hashString(Word.words2hashesHandles(QueryParams.cleanQuery(line)[0]))); - final long time = System.currentTimeMillis(); - SearchResult result; - try { - result = new SearchResult( - basicRequestParts((String) null, (String) null, "freeworld"), - null, // sb.peers.mySeed(), - ASCII.String(wordhashe), - "", // excludehashes, - "", // urlhashes, - QueryParams.matchnothing_pattern, // prefer, - QueryParams.catchall_pattern, // filter, - QueryParams.catchall_pattern, // snippet, - "", // modifier - "", // language, - "", // sitehash, - "", // authorhash, - 10, // count, - 3000, // time, - 1000, // maxDistance, - true, //global, - 16, // partitions, - "", args[1], - null, //secondarySearchSuperviser, - new RankingProfile(ContentDomain.TEXT), // rankingProfile, - null // constraint); + } else { + searchlines.add(args[2]); + } + for ( final String line : searchlines ) { + final byte[] 
wordhashe = + ASCII.getBytes(QueryParams.hashSet2hashString(Word.words2hashesHandles(QueryParams + .cleanQuery(line)[0]))); + final long time = System.currentTimeMillis(); + SearchResult result; + try { + result = + new SearchResult(basicRequestParts((String) null, (String) null, "freeworld"), null, // sb.peers.mySeed(), + ASCII.String(wordhashe), + "", // excludehashes, + "", // urlhashes, + QueryParams.matchnothing_pattern, // prefer, + QueryParams.catchall_pattern, // filter, + QueryParams.catchall_pattern, // snippet, + "", // modifier + "", // language, + "", // sitehash, + "", // authorhash, + 10, // count, + 3000, // time, + 1000, // maxDistance, + true, //global, + 16, // partitions, + "", + args[1], + null, //secondarySearchSuperviser, + new RankingProfile(ContentDomain.TEXT), // rankingProfile, + null // constraint); ); - for (final URIMetadataRow link: result.links) { - System.out.println(link.metadata().url().toNormalform(true, false)); - System.out.println(link.snippet()); - } - } catch (final IOException e) { - // TODO Auto-generated catch block - e.printStackTrace(); + for ( final URIMetadataRow link : result.links ) { + System.out.println(link.metadata().url().toNormalform(true, false)); + System.out.println(link.snippet()); } - System.out.println("Search Time: " + (System.currentTimeMillis() - time)); + } catch ( final IOException e ) { + // TODO Auto-generated catch block + e.printStackTrace(); } + System.out.println("Search Time: " + (System.currentTimeMillis() - time)); + } System.exit(0); - } else if(args.length == 1) { + } else if ( args.length == 1 ) { System.out.println("wput Test"); // connection params MultiProtocolURI url = null; try { url = new MultiProtocolURI(args[0]); - } catch (final MalformedURLException e) { + } catch ( final MalformedURLException e ) { Log.logException(e); } - if (url == null) { + if ( url == null ) { System.exit(1); return; } final String vhost = url.getHost(); final int timeout = 10000; // new data - final Map 
newpost = new LinkedHashMap(); + final Map newpost = new LinkedHashMap(); newpost.put("process", UTF8.StringBody("permission")); newpost.put("purpose", UTF8.StringBody("crcon")); - byte[] res; - try { - // res = HTTPConnector.getConnector(MultiProtocolURI.yacybotUserAgent).post(url, timeout, vhost, newpost, true); - final HTTPClient httpClient = new HTTPClient(ClientIdentification.getUserAgent(), timeout); - res = httpClient.POSTbytes(url, vhost, newpost, true); - System.out.println(UTF8.String(res)); - } catch (final IOException e1) { - Log.logException(e1); - } + byte[] res; + try { + // res = HTTPConnector.getConnector(MultiProtocolURI.yacybotUserAgent).post(url, timeout, vhost, newpost, true); + final HTTPClient httpClient = new HTTPClient(ClientIdentification.getUserAgent(), timeout); + res = httpClient.POSTbytes(url, vhost, newpost, true); + System.out.println(UTF8.String(res)); + } catch ( final IOException e1 ) { + Log.logException(e1); + } + } + try { + net.yacy.cora.protocol.http.HTTPClient.closeConnectionManager(); + } catch ( final InterruptedException e ) { + Log.logException(e); } - try { - net.yacy.cora.protocol.http.HTTPClient.closeConnectionManager(); - } catch (final InterruptedException e) { - Log.logException(e); - } } public static final boolean authentifyRequest(final serverObjects post, final serverSwitch env) { - if (post == null || env == null) return false; + if ( post == null || env == null ) { + return false; + } // identify network final String unitName = post.get(SwitchboardConstants.NETWORK_NAME, Seed.DFLT_NETWORK_UNIT); // the network unit - if (!unitName.equals(env.getConfig(SwitchboardConstants.NETWORK_NAME, Seed.DFLT_NETWORK_UNIT))) { + if ( !unitName.equals(env.getConfig(SwitchboardConstants.NETWORK_NAME, Seed.DFLT_NETWORK_UNIT)) ) { return false; } // check authentication method final String authenticationControl = env.getConfig("network.unit.protocol.control", "uncontrolled"); - if (authenticationControl.equals("uncontrolled")) 
return true; - final String authenticationMethod = env.getConfig("network.unit.protocol.request.authentication.method", ""); - if (authenticationMethod.length() == 0) { + if ( authenticationControl.equals("uncontrolled") ) { + return true; + } + final String authenticationMethod = + env.getConfig("network.unit.protocol.request.authentication.method", ""); + if ( authenticationMethod.length() == 0 ) { return false; } - if (authenticationMethod.equals("salted-magic-sim")) { + if ( authenticationMethod.equals("salted-magic-sim") ) { // authorize the peer using the md5-magic final String salt = post.get("key", ""); final String iam = post.get("iam", ""); @@ -1250,19 +1609,30 @@ public final class Protocol { return false; } - public static final LinkedHashMap basicRequestParts(final Switchboard sb, final String targetHash, final String salt) { + public static final LinkedHashMap basicRequestParts( + final Switchboard sb, + final String targetHash, + final String salt) { // put in all the essentials for routing and network authentication // generate a session key - final LinkedHashMap parts = basicRequestParts(sb.peers.mySeed().hash, targetHash, Switchboard.getSwitchboard().getConfig(SwitchboardConstants.NETWORK_NAME, Seed.DFLT_NETWORK_UNIT)); + final LinkedHashMap parts = + basicRequestParts( + sb.peers.mySeed().hash, + targetHash, + Switchboard.getSwitchboard().getConfig( + SwitchboardConstants.NETWORK_NAME, + Seed.DFLT_NETWORK_UNIT)); parts.put("key", UTF8.StringBody(salt)); // authentication essentials final String authenticationControl = sb.getConfig("network.unit.protocol.control", "uncontrolled"); - final String authenticationMethod = sb.getConfig("network.unit.protocol.request.authentication.method", ""); - if ((authenticationControl.equals("controlled")) && (authenticationMethod.length() > 0)) { - if (authenticationMethod.equals("salted-magic-sim")) { + final String authenticationMethod = + sb.getConfig("network.unit.protocol.request.authentication.method", ""); 
+ if ( (authenticationControl.equals("controlled")) && (authenticationMethod.length() > 0) ) { + if ( authenticationMethod.equals("salted-magic-sim") ) { // generate an authentication essential using the salt, the iam-hash and the network magic - final String magic = sb.getConfig("network.unit.protocol.request.authentication.essentials", ""); + final String magic = + sb.getConfig("network.unit.protocol.request.authentication.essentials", ""); final String md5 = Digest.encodeMD5Hex(salt + sb.peers.mySeed().hash + magic); parts.put("magicmd5", UTF8.StringBody(md5)); } @@ -1271,19 +1641,25 @@ public final class Protocol { return parts; } - public static final LinkedHashMap basicRequestParts(final String myHash, final String targetHash, final String networkName) { + public static final LinkedHashMap basicRequestParts( + final String myHash, + final String targetHash, + final String networkName) { // put in all the essentials for routing and network authentication // generate a session key - final LinkedHashMap parts = new LinkedHashMap(); + final LinkedHashMap parts = new LinkedHashMap(); // just standard identification essentials - if (myHash != null) { + if ( myHash != null ) { parts.put("iam", UTF8.StringBody(myHash)); - if (targetHash != null) parts.put("youare", UTF8.StringBody(targetHash)); + if ( targetHash != null ) { + parts.put("youare", UTF8.StringBody(targetHash)); + } // time information for synchronization // use our own formatter to prevent concurrency locks with other processes - final GenericFormatter my_SHORT_SECOND_FORMATTER = new GenericFormatter(GenericFormatter.FORMAT_SHORT_SECOND, GenericFormatter.time_second); + final GenericFormatter my_SHORT_SECOND_FORMATTER = + new GenericFormatter(GenericFormatter.FORMAT_SHORT_SECOND, GenericFormatter.time_second); parts.put("mytime", UTF8.StringBody(my_SHORT_SECOND_FORMATTER.format())); parts.put("myUTC", UTF8.StringBody(Long.toString(System.currentTimeMillis()))); diff --git 
a/source/net/yacy/search/index/DocumentIndex.java b/source/net/yacy/search/index/DocumentIndex.java index e96daaa8a..9b4241a00 100644 --- a/source/net/yacy/search/index/DocumentIndex.java +++ b/source/net/yacy/search/index/DocumentIndex.java @@ -24,7 +24,6 @@ // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - package net.yacy.search.index; import java.io.File; @@ -51,13 +50,13 @@ import net.yacy.search.ranking.RankingProfile; import net.yacy.search.ranking.ReferenceOrder; import net.yacy.search.snippet.ContentDomain; - /** * convenience class to access the yacycore library from outside of yacy to put files into the index + * * @author Michael Christen - * */ -public class DocumentIndex extends Segment { +public class DocumentIndex extends Segment +{ private static final RankingProfile textRankingDefault = new RankingProfile(ContentDomain.TEXT); //private Bitfield zeroConstraint = new Bitfield(4); @@ -66,7 +65,8 @@ public class DocumentIndex extends Segment { static { try { poison = new DigestURI("file://."); - } catch (final MalformedURLException e) {} + } catch ( final MalformedURLException e ) { + } } BlockingQueue queue; // a queue of document ID's private final Worker[] worker; @@ -74,20 +74,21 @@ public class DocumentIndex extends Segment { static final ThreadGroup workerThreadGroup = new ThreadGroup("workerThreadGroup"); - - public DocumentIndex(final File segmentPath, final CallbackListener callback, final int cachesize) throws IOException { + public DocumentIndex(final File segmentPath, final CallbackListener callback, final int cachesize) + throws IOException { super(new Log("DocumentIndex"), segmentPath, cachesize, targetFileSize * 4 - 1, false, false); final int cores = Runtime.getRuntime().availableProcessors() + 1; this.callback = callback; this.queue = new LinkedBlockingQueue(cores * 300); this.worker = new Worker[cores]; - for (int i = 0; i < cores; i++) { + for 
( int i = 0; i < cores; i++ ) { this.worker[i] = new Worker(i); this.worker[i].start(); } } - class Worker extends Thread { + class Worker extends Thread + { public Worker(final int count) { super(workerThreadGroup, "query-" + count); } @@ -97,22 +98,27 @@ public class DocumentIndex extends Segment { DigestURI f; URIMetadataRow[] resultRows; try { - while ((f = DocumentIndex.this.queue.take()) != poison) try { - resultRows = add(f); - for (final URIMetadataRow resultRow: resultRows) { - if (DocumentIndex.this.callback != null) { - if (resultRow == null) { - DocumentIndex.this.callback.fail(f, "result is null"); - } else { - DocumentIndex.this.callback.commit(f, resultRow); + while ( (f = DocumentIndex.this.queue.take()) != poison ) { + try { + resultRows = add(f); + for ( final URIMetadataRow resultRow : resultRows ) { + if ( DocumentIndex.this.callback != null ) { + if ( resultRow == null ) { + DocumentIndex.this.callback.fail(f, "result is null"); + } else { + DocumentIndex.this.callback.commit(f, resultRow); + } } } + } catch ( final IOException e ) { + if ( e.getMessage().indexOf("cannot parse", 0) < 0 ) { + Log.logException(e); + } + DocumentIndex.this.callback.fail(f, e.getMessage()); } - } catch (final IOException e) { - if (e.getMessage().indexOf("cannot parse",0) < 0) Log.logException(e); - DocumentIndex.this.callback.fail(f, e.getMessage()); } - } catch (final InterruptedException e) {} + } catch ( final InterruptedException e ) { + } } } @@ -128,70 +134,79 @@ public class DocumentIndex extends Segment { } private URIMetadataRow[] add(final DigestURI url) throws IOException { - if (url == null) throw new IOException("file = null"); - if (url.isDirectory()) throw new IOException("file should be a document, not a path"); - if (!url.canRead()) throw new IOException("cannot read file"); + if ( url == null ) { + throw new IOException("file = null"); + } + if ( url.isDirectory() ) { + throw new IOException("file should be a document, not a path"); + } + if ( 
!url.canRead() ) { + throw new IOException("cannot read file"); + } Document[] documents; long length; try { length = url.length(); - } catch (final Exception e) { + } catch ( final Exception e ) { length = -1; } try { documents = TextParser.parseSource(url, null, null, length, url.getInputStream(null, -1), true); - } catch (final Exception e) { + } catch ( final Exception e ) { throw new IOException("cannot parse " + url.toString() + ": " + e.getMessage()); } //Document document = Document.mergeDocuments(url, null, documents); final URIMetadataRow[] rows = new URIMetadataRow[documents.length]; int c = 0; - for (final Document document: documents) { + for ( final Document document : documents ) { final Condenser condenser = new Condenser(document, true, true, LibraryProvider.dymLib); - rows[c++] = super.storeDocument( - url, - null, - new Date(url.lastModified()), - new Date(), - url.length(), - document, - condenser, - null, - DocumentIndex.class.getName() + ".add" - ); + rows[c++] = + super.storeDocument( + url, + null, + new Date(url.lastModified()), + new Date(), + url.length(), + document, + condenser, + null, + DocumentIndex.class.getName() + ".add"); } return rows; } /** - * add a file or a directory of files to the index - * If the given file is a path to a directory, the complete sub-tree is indexed + * add a file or a directory of files to the index If the given file is a path to a directory, the + * complete sub-tree is indexed + * * @param start */ public void addConcurrent(final DigestURI start) throws IOException { assert (start != null); assert (start.canRead()) : start.toString(); - if (!start.isDirectory()) { + if ( !start.isDirectory() ) { try { this.queue.put(start); - } catch (final InterruptedException e) {} + } catch ( final InterruptedException e ) { + } return; } final String[] s = start.list(); DigestURI w; - for (final String t: s) { + for ( final String t : s ) { try { w = new DigestURI(start, t); - if (w.canRead() && !w.isHidden()) { - 
if (w.isDirectory()) { + if ( w.canRead() && !w.isHidden() ) { + if ( w.isDirectory() ) { addConcurrent(w); } else { try { this.queue.put(w); - } catch (final InterruptedException e) {} + } catch ( final InterruptedException e ) { + } } } - } catch (final MalformedURLException e1) { + } catch ( final MalformedURLException e1 ) { Log.logException(e1); } } @@ -199,13 +214,15 @@ public class DocumentIndex extends Segment { /** * do a full-text search of a given string and return a specific number of results + * * @param querystring * @param count * @return a list of files that contain the given string */ public ArrayList find(final String querystring, int count) { // make a query and start a search - final QueryParams query = new QueryParams(querystring, count, null, this, textRankingDefault, "DocumentIndex"); + final QueryParams query = + new QueryParams(querystring, count, null, this, textRankingDefault, "DocumentIndex"); final ReferenceOrder order = new ReferenceOrder(query.ranking, UTF8.getBytes(query.targetlang)); final RWIProcess rankedCache = new RWIProcess(query, order, SearchEvent.max_results_preparation); rankedCache.start(); @@ -214,40 +231,48 @@ public class DocumentIndex extends Segment { URIMetadataRow row; final ArrayList files = new ArrayList(); Components metadata; - while ((row = rankedCache.takeURL(false, 1000)) != null) { + while ( (row = rankedCache.takeURL(false, 1000)) != null ) { metadata = row.metadata(); - if (metadata == null) continue; + if ( metadata == null ) { + continue; + } files.add(metadata.url()); count--; - if (count == 0) break; + if ( count == 0 ) { + break; + } } return files; } /** - * close the index. - * This terminates all worker threads and then closes the segment. + * close the index. This terminates all worker threads and then closes the segment. 
*/ @Override public void close() { // send termination signal to worker threads - for (@SuppressWarnings("unused") final Worker element : this.worker) { + for ( @SuppressWarnings("unused") + final Worker element : this.worker ) { try { this.queue.put(poison); - } catch (final InterruptedException e) {} + } catch ( final InterruptedException e ) { + } } // wait for termination - for (final Worker element : this.worker) { + for ( final Worker element : this.worker ) { try { element.join(); - } catch (final InterruptedException e) {} + } catch ( final InterruptedException e ) { + } } // close the segment super.close(); } - public interface CallbackListener { + public interface CallbackListener + { public void commit(DigestURI f, URIMetadataRow resultRow); + public void fail(DigestURI f, String failReason); } @@ -260,35 +285,44 @@ public class DocumentIndex extends Segment { // DocumentIndex yacyindex add test/parsertest // DocumentIndex yacyindex search steht System.setProperty("java.awt.headless", "true"); - if (args.length < 3) return; + if ( args.length < 3 ) { + return; + } final File segmentPath = new File(args[0]); System.out.println("using index files at " + segmentPath.getAbsolutePath()); final CallbackListener callback = new CallbackListener() { + @Override public void commit(final DigestURI f, final URIMetadataRow resultRow) { System.out.println("indexed: " + f.toString()); } + + @Override public void fail(final DigestURI f, final String failReason) { System.out.println("not indexed " + f.toString() + ": " + failReason); } }; try { - if (args[1].equals("add")) { + if ( args[1].equals("add") ) { final DigestURI f = new DigestURI(args[2]); final DocumentIndex di = new DocumentIndex(segmentPath, callback, 100000); di.addConcurrent(f); di.close(); } else { String query = ""; - for (int i = 2; i < args.length; i++) query += args[i]; + for ( int i = 2; i < args.length; i++ ) { + query += args[i]; + } query.trim(); final DocumentIndex di = new 
DocumentIndex(segmentPath, callback, 100000); final ArrayList results = di.find(query, 100); - for (final DigestURI f: results) { - if (f != null) System.out.println(f.toString()); + for ( final DigestURI f : results ) { + if ( f != null ) { + System.out.println(f.toString()); + } } di.close(); } - } catch (final IOException e) { + } catch ( final IOException e ) { Log.logException(e); } //System.exit(0); diff --git a/source/net/yacy/search/query/RWIProcess.java b/source/net/yacy/search/query/RWIProcess.java index 156d49fd3..fde64e205 100644 --- a/source/net/yacy/search/query/RWIProcess.java +++ b/source/net/yacy/search/query/RWIProcess.java @@ -65,30 +65,34 @@ import net.yacy.search.ranking.ReferenceOrder; import net.yacy.search.snippet.ContentDomain; import net.yacy.search.snippet.ResultEntry; -public final class RWIProcess extends Thread { +public final class RWIProcess extends Thread +{ + private static final long maxWaitPerResult = 30; private static final int maxDoubleDomAll = 1000, maxDoubleDomSpecial = 10000; private final QueryParams query; private final HandleSet urlhashes; // map for double-check; String/Long relation, addresses ranking number (backreference for deletion) private final int[] flagcount; // flag counter private final HandleSet misses; // contains url-hashes that could not been found in the LURL-DB - private int sortout; // counter for referenced that had been sorted out for other reasons + private int sortout; // counter for referenced that had been sorted out for other reasons //private final int[] domZones; private SortedMap> localSearchInclusion; private int remote_resourceSize, remote_indexCount, remote_peerCount; private int local_indexCount; + private int initialExpectedRemoteReferences; + private final AtomicInteger expectedRemoteReferences, receivedRemoteReferences; private final WeakPriorityBlockingQueue stack; private final AtomicInteger feeders; private final ConcurrentHashMap> doubleDomCache; // key = domhash (6 bytes); value = 
like stack //private final HandleSet handover; // key = urlhash; used for double-check of urls that had been handed over to search process - private final ScoreMap ref; // reference score computation for the commonSense heuristic + private final ScoreMap ref; // reference score computation for the commonSense heuristic private final Map hostResolver; // a mapping from a host hash (6 bytes) to the full url hash of one of these urls that have the host hash private final ReferenceOrder order; private final long startTime; - private boolean addRunning; + private boolean addRunning; // navigation scores private final ScoreMap hostNavigator; // a counter for the appearance of the host hash @@ -97,7 +101,6 @@ public final class RWIProcess extends Thread { private final ScoreMap protocolNavigator; // a counter for protocol types private final ScoreMap filetypeNavigator; // a counter for file types - public RWIProcess(final QueryParams query, final ReferenceOrder order, final int maxentries) { // we collect the urlhashes and construct a list with urlEntry objects // attention: if minEntries is too high, this method will not terminate within the maxTime @@ -112,11 +115,15 @@ public final class RWIProcess extends Thread { this.remote_resourceSize = 0; this.remote_indexCount = 0; this.local_indexCount = 0; - this.urlhashes = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 100); - this.misses = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 100); + this.urlhashes = + new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 100); + this.misses = + new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 100); this.sortout = 0; this.flagcount = new int[32]; - for (int i = 0; i < 32; i++) {this.flagcount[i] = 0;} + for ( int i = 0; i < 32; i++ ) { + this.flagcount[i] = 0; + } this.hostNavigator = new ConcurrentScoreMap(); 
this.hostResolver = new ConcurrentHashMap(); this.authorNavigator = new ConcurrentScoreMap(); @@ -126,6 +133,18 @@ public final class RWIProcess extends Thread { this.ref = new ConcurrentScoreMap(); this.feeders = new AtomicInteger(1); this.startTime = System.currentTimeMillis(); + this.initialExpectedRemoteReferences = 0; + this.expectedRemoteReferences = new AtomicInteger(0); + this.receivedRemoteReferences = new AtomicInteger(0); + } + + public void setExpectedRemoteReferences(int expectedRemoteReferences) { + this.initialExpectedRemoteReferences = expectedRemoteReferences; + this.expectedRemoteReferences.set(expectedRemoteReferences); + } + + public void decExpectedRemoteReferences(int x) { + this.expectedRemoteReferences.addAndGet(-x); } public QueryParams getQuery() { @@ -144,19 +163,31 @@ public final class RWIProcess extends Thread { // so following sortings together with the global results will be fast try { final long timer = System.currentTimeMillis(); - final TermSearch search = this.query.getSegment().termIndex().query( - this.query.queryHashes, - this.query.excludeHashes, - null, - Segment.wordReferenceFactory, - this.query.maxDistance); + final TermSearch search = + this.query + .getSegment() + .termIndex() + .query( + this.query.queryHashes, + this.query.excludeHashes, + null, + Segment.wordReferenceFactory, + this.query.maxDistance); this.localSearchInclusion = search.inclusion(); final ReferenceContainer index = search.joined(); - EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch(this.query.id(true), SearchEvent.Type.JOIN, this.query.queryString, index.size(), System.currentTimeMillis() - timer), false); - if (!index.isEmpty()) { + EventTracker.update( + EventTracker.EClass.SEARCH, + new ProfilingGraph.EventSearch( + this.query.id(true), + SearchEvent.Type.JOIN, + this.query.queryString, + index.size(), + System.currentTimeMillis() - timer), + false); + if ( !index.isEmpty() ) { add(index, true, "local index: " + 
this.query.getSegment().getLocation(), -1, true); } - } catch (final Exception e) { + } catch ( final Exception e ) { Log.logException(e); } finally { oneFeederTerminated(); @@ -164,11 +195,11 @@ public final class RWIProcess extends Thread { } public void add( - final ReferenceContainer index, - final boolean local, - final String resourceName, - final int fullResource, - final boolean finalizeAddAtEnd) { + final ReferenceContainer index, + final boolean local, + final String resourceName, + final int fullResource, + final boolean finalizeAddAtEnd) { // we collect the urlhashes and construct a list with urlEntry objects // attention: if minEntries is too high, this method will not terminate within the maxTime //Log.logInfo("RWIProcess", "added a container, size = " + index.size()); @@ -176,9 +207,11 @@ public final class RWIProcess extends Thread { this.addRunning = true; assert (index != null); - if (index.isEmpty()) return; + if ( index.isEmpty() ) { + return; + } - if (!local) { + if ( !local ) { assert fullResource >= 0 : "fullResource = " + fullResource; this.remote_resourceSize += fullResource; this.remote_peerCount++; @@ -188,40 +221,67 @@ public final class RWIProcess extends Thread { // normalize entries final BlockingQueue decodedEntries = this.order.normalizeWith(index); - EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch(this.query.id(true), SearchEvent.Type.NORMALIZING, resourceName, index.size(), System.currentTimeMillis() - timer), false); + EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch( + this.query.id(true), + SearchEvent.Type.NORMALIZING, + resourceName, + index.size(), + System.currentTimeMillis() - timer), false); + this.receivedRemoteReferences.addAndGet(index.size()); // iterate over normalized entries and select some that are better than currently stored timer = System.currentTimeMillis(); - final boolean nav_hosts = this.query.navigators.equals("all") || 
this.query.navigators.indexOf("hosts",0) >= 0; + final boolean nav_hosts = + this.query.navigators.equals("all") || this.query.navigators.indexOf("hosts", 0) >= 0; // apply all constraints try { WordReferenceVars iEntry; final String pattern = this.query.urlMask.pattern(); final boolean httpPattern = pattern.equals("http://.*"); - final boolean noHttpButProtocolPattern = pattern.equals("https://.*") || pattern.equals("ftp://.*") || pattern.equals("smb://.*") || pattern.equals("file://.*"); - pollloop: while (true) { + final boolean noHttpButProtocolPattern = + pattern.equals("https://.*") + || pattern.equals("ftp://.*") + || pattern.equals("smb://.*") + || pattern.equals("file://.*"); + pollloop: while ( true ) { iEntry = decodedEntries.poll(1, TimeUnit.SECONDS); - if (iEntry == null || iEntry == WordReferenceVars.poison) break pollloop; + if ( iEntry == null || iEntry == WordReferenceVars.poison ) { + break pollloop; + } assert (iEntry.urlhash().length == index.row().primaryKeyLength); //if (iEntry.urlHash().length() != index.row().primaryKeyLength) continue; // increase flag counts - for (int j = 0; j < 32; j++) { - if (iEntry.flags().get(j)) {this.flagcount[j]++;} + for ( int j = 0; j < 32; j++ ) { + if ( iEntry.flags().get(j) ) { + this.flagcount[j]++; + } } // check constraints - if (!testFlags(iEntry)) { + if ( !testFlags(iEntry) ) { continue pollloop; } // check document domain - if (this.query.contentdom != ContentDomain.TEXT) { - if ((this.query.contentdom == ContentDomain.AUDIO) && (!(iEntry.flags().get(Condenser.flag_cat_hasaudio)))) { continue pollloop; } - if ((this.query.contentdom == ContentDomain.VIDEO) && (!(iEntry.flags().get(Condenser.flag_cat_hasvideo)))) { continue pollloop; } - if ((this.query.contentdom == ContentDomain.IMAGE) && (!(iEntry.flags().get(Condenser.flag_cat_hasimage)))) { continue pollloop; } - if ((this.query.contentdom == ContentDomain.APP ) && (!(iEntry.flags().get(Condenser.flag_cat_hasapp )))) { continue pollloop; } + if ( 
this.query.contentdom != ContentDomain.TEXT ) { + if ( (this.query.contentdom == ContentDomain.AUDIO) + && (!(iEntry.flags().get(Condenser.flag_cat_hasaudio))) ) { + continue pollloop; + } + if ( (this.query.contentdom == ContentDomain.VIDEO) + && (!(iEntry.flags().get(Condenser.flag_cat_hasvideo))) ) { + continue pollloop; + } + if ( (this.query.contentdom == ContentDomain.IMAGE) + && (!(iEntry.flags().get(Condenser.flag_cat_hasimage))) ) { + continue pollloop; + } + if ( (this.query.contentdom == ContentDomain.APP) + && (!(iEntry.flags().get(Condenser.flag_cat_hasapp))) ) { + continue pollloop; + } } // check tld domain @@ -238,81 +298,105 @@ public final class RWIProcess extends Thread { // check site constraints final String hosthash = iEntry.hosthash(); - if (this.query.sitehash == null) { + if ( this.query.sitehash == null ) { // no site constraint there; maybe collect host navigation information - if (nav_hosts && this.query.urlMask_isCatchall) { + if ( nav_hosts && this.query.urlMask_isCatchall ) { this.hostNavigator.inc(hosthash); this.hostResolver.put(hosthash, iEntry.urlhash()); } } else { - if (!hosthash.equals(this.query.sitehash)) { + if ( !hosthash.equals(this.query.sitehash) ) { // filter out all domains that do not match with the site constraint continue pollloop; } } // check protocol - if (!this.query.urlMask_isCatchall) { + if ( !this.query.urlMask_isCatchall ) { final boolean httpFlagSet = DigestURI.flag4HTTPset(iEntry.urlHash); - if (httpPattern && !httpFlagSet) continue pollloop; - if (noHttpButProtocolPattern && httpFlagSet) continue pollloop; + if ( httpPattern && !httpFlagSet ) { + continue pollloop; + } + if ( noHttpButProtocolPattern && httpFlagSet ) { + continue pollloop; + } } // finally make a double-check and insert result to stack // the url hashes should be unique, no reason to check that //if (!this.urlhashes.has(iEntry.urlhash())) { - this.urlhashes.putUnique(iEntry.urlhash()); - rankingtryloop: while (true) { - try { - 
this.stack.put(new ReverseElement(iEntry, this.order.cardinal(iEntry))); // inserts the element and removes the worst (which is smallest) - break rankingtryloop; - } catch (final ArithmeticException e) { - // this may happen if the concurrent normalizer changes values during cardinal computation - continue rankingtryloop; - } + this.urlhashes.putUnique(iEntry.urlhash()); + rankingtryloop: while ( true ) { + try { + this.stack.put(new ReverseElement(iEntry, this.order + .cardinal(iEntry))); // inserts the element and removes the worst (which is smallest) + break rankingtryloop; + } catch ( final ArithmeticException e ) { + // this may happen if the concurrent normalizer changes values during cardinal computation + continue rankingtryloop; } - // increase counter for statistics - if (local) this.local_indexCount++; else this.remote_indexCount++; - //} + } + // increase counter for statistics + if ( local ) { + this.local_indexCount++; + } else { + this.remote_indexCount++; + //} + } } - } catch (final InterruptedException e) {} catch (final RowSpaceExceededException e) {} finally { - if (finalizeAddAtEnd) this.addRunning = false; + } catch ( final InterruptedException e ) { + } catch ( final RowSpaceExceededException e ) { + } finally { + if ( finalizeAddAtEnd ) { + this.addRunning = false; + } } //if ((query.neededResults() > 0) && (container.size() > query.neededResults())) remove(true, true); - EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch(this.query.id(true), SearchEvent.Type.PRESORT, resourceName, index.size(), System.currentTimeMillis() - timer), false); + EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch( + this.query.id(true), + SearchEvent.Type.PRESORT, + resourceName, + index.size(), + System.currentTimeMillis() - timer), false); } /** * method to signal the incoming stack that one feeder has terminated */ public void oneFeederTerminated() { - final int c = this.feeders.decrementAndGet(); - 
assert c >= 0 : "feeders = " + c; + final int c = this.feeders.decrementAndGet(); + assert c >= 0 : "feeders = " + c; } public void moreFeeders(final int countMoreFeeders) { - this.feeders.addAndGet(countMoreFeeders); + this.feeders.addAndGet(countMoreFeeders); } public boolean feedingIsFinished() { - return this.feeders.get() <= 0; + return this.feeders.get() <= 0; } private boolean testFlags(final WordReference ientry) { - if (this.query.constraint == null) return true; + if ( this.query.constraint == null ) { + return true; + } // test if ientry matches with filter // if all = true: let only entries pass that has all matching bits // if all = false: let all entries pass that has at least one matching bit - if (this.query.allofconstraint) { - for (int i = 0; i < 32; i++) { - if ((this.query.constraint.get(i)) && (!ientry.flags().get(i))) return false; + if ( this.query.allofconstraint ) { + for ( int i = 0; i < 32; i++ ) { + if ( (this.query.constraint.get(i)) && (!ientry.flags().get(i)) ) { + return false; + } } return true; } - for (int i = 0; i < 32; i++) { - if ((this.query.constraint.get(i)) && (ientry.flags().get(i))) return true; + for ( int i = 0; i < 32; i++ ) { + if ( (this.query.constraint.get(i)) && (ientry.flags().get(i)) ) { + return true; + } } return false; } @@ -323,7 +407,9 @@ public final class RWIProcess extends Thread { return this.localSearchInclusion; } - private WeakPriorityBlockingQueue.Element takeRWI(final boolean skipDoubleDom, final long waitingtime) { + private WeakPriorityBlockingQueue.Element takeRWI( + final boolean skipDoubleDom, + final long waitingtime) { // returns from the current RWI list the best entry and removes this entry from the list WeakPriorityBlockingQueue m; @@ -334,28 +420,51 @@ public final class RWIProcess extends Thread { //System.out.println("stack.poll: feeders = " + this.feeders + ", stack.sizeQueue = " + stack.sizeQueue()); int loops = 0; // a loop counter to terminate the reading if all the results are 
from the same domain final long timeout = System.currentTimeMillis() + waitingtime; - while (((!feedingIsFinished() && this.addRunning) || this.stack.sizeQueue() > 0) && - (this.query.itemsPerPage < 1 || loops++ < this.query.itemsPerPage)) { - if (waitingtime <= 0) { + // wait some time if we did not get so much remote results so far to get a better ranking over remote results + // we wait at most 30 milliseconds to get a maximum total waiting time of 300 milliseconds for 10 results + long wait = + this.receivedRemoteReferences.get() == 0 ? maxWaitPerResult : Math.min( + maxWaitPerResult, + maxWaitPerResult + * this.initialExpectedRemoteReferences + / this.receivedRemoteReferences.get()); + if ( wait > 0 ) { + Thread.sleep(wait); + } + // loop as long as we can expect that we should get more results + while ( ((!feedingIsFinished() && this.addRunning) || this.stack.sizeQueue() > 0) + && (this.query.itemsPerPage < 1 || loops++ < this.query.itemsPerPage) ) { + if ( waitingtime <= 0 ) { rwi = this.stack.poll(); - } else timeoutloop:while (System.currentTimeMillis() < timeout) { - if (feedingIsFinished() && this.stack.sizeQueue() == 0) break timeoutloop; - rwi = this.stack.poll(50); - if (rwi != null) break timeoutloop; + } else { + timeoutloop: while ( System.currentTimeMillis() < timeout ) { + if ( feedingIsFinished() && this.stack.sizeQueue() == 0 ) { + break timeoutloop; + } + rwi = this.stack.poll(50); + if ( rwi != null ) { + break timeoutloop; + } + } + } + if ( rwi == null ) { + break; } - if (rwi == null) break; - if (!skipDoubleDom) { + if ( !skipDoubleDom ) { //System.out.println("!skipDoubleDom"); return rwi; - } + } // check doubledom final String hosthash = rwi.getElement().hosthash(); - synchronized (this.doubleDomCache) { + synchronized ( this.doubleDomCache ) { m = this.doubleDomCache.get(hosthash); - if (m == null) { + if ( m == null ) { // first appearance of dom. 
we create an entry to signal that one of that domain was already returned - m = new WeakPriorityBlockingQueue((this.query.specialRights) ? maxDoubleDomSpecial : maxDoubleDomAll); + m = + new WeakPriorityBlockingQueue((this.query.specialRights) + ? maxDoubleDomSpecial + : maxDoubleDomAll); this.doubleDomCache.put(hosthash, m); return rwi; } @@ -363,35 +472,47 @@ public final class RWIProcess extends Thread { m.put(rwi); } } - } catch (final InterruptedException e1) {} - if (this.doubleDomCache.isEmpty()) return null; + } catch ( final InterruptedException e1 ) { + } + if ( this.doubleDomCache.isEmpty() ) { + return null; + } // no more entries in sorted RWI entries. Now take Elements from the doubleDomCache // find best entry from all caches WeakPriorityBlockingQueue.Element bestEntry = null; WeakPriorityBlockingQueue.Element o; - synchronized (this.doubleDomCache) { - final Iterator> i = this.doubleDomCache.values().iterator(); - while (i.hasNext()) { + synchronized ( this.doubleDomCache ) { + final Iterator> i = + this.doubleDomCache.values().iterator(); + while ( i.hasNext() ) { try { m = i.next(); - } catch (final ConcurrentModificationException e) { + } catch ( final ConcurrentModificationException e ) { Log.logException(e); continue; // not the best solution... 
} - if (m == null) continue; - if (m.isEmpty()) continue; - if (bestEntry == null) { + if ( m == null ) { + continue; + } + if ( m.isEmpty() ) { + continue; + } + if ( bestEntry == null ) { bestEntry = m.peek(); continue; } o = m.peek(); - if (o == null) continue; - if (o.getWeight() < bestEntry.getWeight()) { + if ( o == null ) { + continue; + } + if ( o.getWeight() < bestEntry.getWeight() ) { bestEntry = o; } } - if (bestEntry == null) return null; + if ( bestEntry == null ) { + return null; + } // finally remove the best entry from the doubledom cache m = this.doubleDomCache.get(bestEntry.getElement().hosthash()); @@ -401,44 +522,48 @@ public final class RWIProcess extends Thread { } /** - * get one metadata entry from the ranked results. This will be the 'best' entry so far - * according to the applied ranking. If there are no more entries left or the timeout - * limit is reached then null is returned. The caller may distinguish the timeout case - * from the case where there will be no more also in the future by calling this.feedingIsFinished() + * get one metadata entry from the ranked results. This will be the 'best' entry so far according to the + * applied ranking. If there are no more entries left or the timeout limit is reached then null is + * returned. 
The caller may distinguish the timeout case from the case where there will be no more also in + * the future by calling this.feedingIsFinished() + * * @param skipDoubleDom should be true if it is wanted that double domain entries are skipped * @param waitingtime the time this method may take for a result computation * @return a metadata entry for a url */ public URIMetadataRow takeURL(final boolean skipDoubleDom, final long waitingtime) { // returns from the current RWI list the best URL entry and removes this entry from the list - final long timeout = System.currentTimeMillis() + Math.max(10, waitingtime); - int p = -1; - long timeleft; - while ((timeleft = timeout - System.currentTimeMillis()) > 0) { - //System.out.println("timeleft = " + timeleft); - final WeakPriorityBlockingQueue.Element obrwi = takeRWI(skipDoubleDom, timeleft); - if (obrwi == null) return null; // all time was already wasted in takeRWI to get another element + final long timeout = System.currentTimeMillis() + Math.max(10, waitingtime); + int p = -1; + long timeleft; + while ( (timeleft = timeout - System.currentTimeMillis()) > 0 ) { + //System.out.println("timeleft = " + timeleft); + final WeakPriorityBlockingQueue.Element obrwi = + takeRWI(skipDoubleDom, timeleft); + if ( obrwi == null ) { + return null; // all time was already wasted in takeRWI to get another element + } final URIMetadataRow page = this.query.getSegment().urlMetadata().load(obrwi); - if (page == null) { - try { + if ( page == null ) { + try { this.misses.putUnique(obrwi.getElement().urlhash()); - } catch (final RowSpaceExceededException e) { + } catch ( final RowSpaceExceededException e ) { } - continue; + continue; } // prepare values for constraint check final URIMetadataRow.Components metadata = page.metadata(); // check errors - if (metadata == null) { + if ( metadata == null ) { this.sortout++; continue; // rare case where the url is corrupted } - if (!this.query.urlMask_isCatchall) { + if ( 
!this.query.urlMask_isCatchall ) { // check url mask - if (!metadata.matches(this.query.urlMask)) { + if ( !metadata.matches(this.query.urlMask) ) { this.sortout++; continue; } @@ -452,7 +577,7 @@ public final class RWIProcess extends Thread { } // check for more errors - if (metadata.url() == null) { + if ( metadata.url() == null ) { this.sortout++; continue; // rare case where the url is corrupted } @@ -462,19 +587,19 @@ public final class RWIProcess extends Thread { final String pagetitle = metadata.dc_title().toLowerCase(); // check exclusion - if ((QueryParams.anymatch(pagetitle, this.query.excludeHashes)) || - (QueryParams.anymatch(pageurl.toLowerCase(), this.query.excludeHashes)) || - (QueryParams.anymatch(pageauthor.toLowerCase(), this.query.excludeHashes))) { + if ( (QueryParams.anymatch(pagetitle, this.query.excludeHashes)) + || (QueryParams.anymatch(pageurl.toLowerCase(), this.query.excludeHashes)) + || (QueryParams.anymatch(pageauthor.toLowerCase(), this.query.excludeHashes)) ) { this.sortout++; continue; } // check index-of constraint - if ((this.query.constraint != null) && - (this.query.constraint.get(Condenser.flag_cat_indexof)) && - (!(pagetitle.startsWith("index of")))) { + if ( (this.query.constraint != null) + && (this.query.constraint.get(Condenser.flag_cat_indexof)) + && (!(pagetitle.startsWith("index of"))) ) { final Iterator wi = this.query.queryHashes.iterator(); - while (wi.hasNext()) { + while ( wi.hasNext() ) { this.query.getSegment().termIndex().removeDelayed(wi.next(), page.hash()); } this.sortout++; @@ -482,47 +607,47 @@ public final class RWIProcess extends Thread { } // check location constraint - if ((this.query.constraint != null) && - (this.query.constraint.get(Condenser.flag_cat_haslocation)) && - (metadata.lat() == 0.0f || metadata.lon() == 0.0f)) { + if ( (this.query.constraint != null) + && (this.query.constraint.get(Condenser.flag_cat_haslocation)) + && (metadata.lat() == 0.0f || metadata.lon() == 0.0f) ) { this.sortout++; 
continue; } // check content domain - if ((this.query.contentdom == ContentDomain.AUDIO && page.laudio() == 0) || - (this.query.contentdom == ContentDomain.VIDEO && page.lvideo() == 0) || - (this.query.contentdom == ContentDomain.IMAGE && page.limage() == 0) || - (this.query.contentdom == ContentDomain.APP && page.lapp() == 0)) { + if ( (this.query.contentdom == ContentDomain.AUDIO && page.laudio() == 0) + || (this.query.contentdom == ContentDomain.VIDEO && page.lvideo() == 0) + || (this.query.contentdom == ContentDomain.IMAGE && page.limage() == 0) + || (this.query.contentdom == ContentDomain.APP && page.lapp() == 0) ) { this.sortout++; - continue; + continue; } // evaluate information of metadata for navigation // author navigation: - if (pageauthor != null && pageauthor.length() > 0) { - // add author to the author navigator + if ( pageauthor != null && pageauthor.length() > 0 ) { + // add author to the author navigator final String authorhash = ASCII.String(Word.word2hash(pageauthor)); // check if we already are filtering for authors - if (this.query.authorhash != null && !this.query.authorhash.equals(authorhash)) { + if ( this.query.authorhash != null && !this.query.authorhash.equals(authorhash) ) { this.sortout++; - continue; - } + continue; + } - // add author to the author navigator + // add author to the author navigator this.authorNavigator.inc(pageauthor); - } else if (this.query.authorhash != null) { + } else if ( this.query.authorhash != null ) { this.sortout++; - continue; + continue; } // namespace navigation String pagepath = metadata.url().getPath(); - if ((p = pagepath.indexOf(':')) >= 0) { - pagepath = pagepath.substring(0,p); + if ( (p = pagepath.indexOf(':')) >= 0 ) { + pagepath = pagepath.substring(0, p); p = pagepath.lastIndexOf('/'); - if (p >= 0) { + if ( p >= 0 ) { pagepath = pagepath.substring(p + 1); this.namespaceNavigator.inc(pagepath); } @@ -534,10 +659,12 @@ public final class RWIProcess extends Thread { // file type navigation final 
String fileext = metadata.url().getFileExtension(); - if (fileext.length() > 0) this.filetypeNavigator.inc(fileext); + if ( fileext.length() > 0 ) { + this.filetypeNavigator.inc(fileext); + } // check Scanner - if (!Scanner.acceptURL(metadata.url())) { + if ( !Scanner.acceptURL(metadata.url()) ) { this.sortout++; continue; } @@ -550,7 +677,7 @@ public final class RWIProcess extends Thread { public int sizeQueue() { int c = this.stack.sizeQueue(); - for (final WeakPriorityBlockingQueue s: this.doubleDomCache.values()) { + for ( final WeakPriorityBlockingQueue s : this.doubleDomCache.values() ) { c += s.sizeQueue(); } return c; @@ -558,22 +685,26 @@ public final class RWIProcess extends Thread { public int sizeAvailable() { int c = this.stack.sizeAvailable(); - for (final WeakPriorityBlockingQueue s: this.doubleDomCache.values()) { + for ( final WeakPriorityBlockingQueue s : this.doubleDomCache.values() ) { c += s.sizeAvailable(); } return c; } public boolean isEmpty() { - if (!this.stack.isEmpty()) return false; - for (final WeakPriorityBlockingQueue s: this.doubleDomCache.values()) { - if (!s.isEmpty()) return false; + if ( !this.stack.isEmpty() ) { + return false; + } + for ( final WeakPriorityBlockingQueue s : this.doubleDomCache.values() ) { + if ( !s.isEmpty() ) { + return false; + } } return true; } public int[] flagCount() { - return this.flagcount; + return this.flagcount; } // "results from a total number of known ( local, remote), links from other YaCy peers." 
@@ -616,59 +747,89 @@ public final class RWIProcess extends Thread { } public ScoreMap getNamespaceNavigator() { - if (!this.query.navigators.equals("all") && this.query.navigators.indexOf("namespace",0) < 0) return new ClusteredScoreMap(); - if (this.namespaceNavigator.sizeSmaller(2)) this.namespaceNavigator.clear(); // navigators with one entry are not useful + if ( !this.query.navigators.equals("all") && this.query.navigators.indexOf("namespace", 0) < 0 ) { + return new ClusteredScoreMap(); + } + if ( this.namespaceNavigator.sizeSmaller(2) ) { + this.namespaceNavigator.clear(); // navigators with one entry are not useful + } return this.namespaceNavigator; } public ScoreMap getHostNavigator() { final ScoreMap result = new ConcurrentScoreMap(); - if (!this.query.navigators.equals("all") && this.query.navigators.indexOf("hosts",0) < 0) return result; + if ( !this.query.navigators.equals("all") && this.query.navigators.indexOf("hosts", 0) < 0 ) { + return result; + } final Iterator domhashs = this.hostNavigator.keys(false); URIMetadataRow row; byte[] urlhash; String hosthash, hostname; - if (this.hostResolver != null) while (domhashs.hasNext() && result.sizeSmaller(30)) { - hosthash = domhashs.next(); - if (hosthash == null) continue; - urlhash = this.hostResolver.get(hosthash); - row = urlhash == null ? null : this.query.getSegment().urlMetadata().load(urlhash); - hostname = row == null ? null : row.metadata().url().getHost(); - if (hostname != null) { - result.set(hostname, this.hostNavigator.get(hosthash)); + if ( this.hostResolver != null ) { + while ( domhashs.hasNext() && result.sizeSmaller(30) ) { + hosthash = domhashs.next(); + if ( hosthash == null ) { + continue; + } + urlhash = this.hostResolver.get(hosthash); + row = urlhash == null ? null : this.query.getSegment().urlMetadata().load(urlhash); + hostname = row == null ? 
null : row.metadata().url().getHost(); + if ( hostname != null ) { + result.set(hostname, this.hostNavigator.get(hosthash)); + } } } - if (result.sizeSmaller(2)) result.clear(); // navigators with one entry are not useful + if ( result.sizeSmaller(2) ) { + result.clear(); // navigators with one entry are not useful + } return result; } public ScoreMap getProtocolNavigator() { - if (!this.query.navigators.equals("all") && this.query.navigators.indexOf("protocol",0) < 0) return new ClusteredScoreMap(); - if (this.protocolNavigator.sizeSmaller(2)) this.protocolNavigator.clear(); // navigators with one entry are not useful + if ( !this.query.navigators.equals("all") && this.query.navigators.indexOf("protocol", 0) < 0 ) { + return new ClusteredScoreMap(); + } + if ( this.protocolNavigator.sizeSmaller(2) ) { + this.protocolNavigator.clear(); // navigators with one entry are not useful + } return this.protocolNavigator; } public ScoreMap getFiletypeNavigator() { - if (!this.query.navigators.equals("all") && this.query.navigators.indexOf("filetype",0) < 0) return new ClusteredScoreMap(); - if (this.filetypeNavigator.sizeSmaller(2)) this.filetypeNavigator.clear(); // navigators with one entry are not useful + if ( !this.query.navigators.equals("all") && this.query.navigators.indexOf("filetype", 0) < 0 ) { + return new ClusteredScoreMap(); + } + if ( this.filetypeNavigator.sizeSmaller(2) ) { + this.filetypeNavigator.clear(); // navigators with one entry are not useful + } return this.filetypeNavigator; } - public static final Comparator> mecomp = new Comparator>() { - public int compare(final Map.Entry o1, final Map.Entry o2) { - if (o1.getValue().intValue() < o2.getValue().intValue()) return 1; - if (o2.getValue().intValue() < o1.getValue().intValue()) return -1; - return 0; - } - }; + public static final Comparator> mecomp = + new Comparator>() { + @Override + public int compare(final Map.Entry o1, final Map.Entry o2) { + if ( o1.getValue().intValue() < 
o2.getValue().intValue() ) { + return 1; + } + if ( o2.getValue().intValue() < o1.getValue().intValue() ) { + return -1; + } + return 0; + } + }; public ScoreMap getTopicNavigator(final int count) { // create a list of words that had been computed by statistics over all // words that appeared in the url or the description of all urls final ScoreMap result = new ConcurrentScoreMap(); - if (!this.query.navigators.equals("all") && this.query.navigators.indexOf("topics",0) < 0) return result; - if (this.ref.sizeSmaller(2)) this.ref.clear(); // navigators with one entry are not useful + if ( !this.query.navigators.equals("all") && this.query.navigators.indexOf("topics", 0) < 0 ) { + return result; + } + if ( this.ref.sizeSmaller(2) ) { + this.ref.clear(); // navigators with one entry are not useful + } final Map counts = new HashMap(); final Iterator i = this.ref.keys(false); String word; @@ -676,20 +837,24 @@ public final class RWIProcess extends Thread { int c; float q, min = Float.MAX_VALUE, max = Float.MIN_VALUE; int ic = count; - while (ic-- > 0 && i.hasNext()) { + while ( ic-- > 0 && i.hasNext() ) { word = i.next(); - if (word == null) continue; + if ( word == null ) { + continue; + } termHash = Word.word2hash(word); c = this.query.getSegment().termIndex().count(termHash); - if (c > 0) { + if ( c > 0 ) { q = ((float) this.ref.get(word)) / ((float) c); min = Math.min(min, q); max = Math.max(max, q); counts.put(word, q); } } - if (max > min) for (final Map.Entry ce: counts.entrySet()) { - result.set(ce.getKey(), (int) (((double) count) * (ce.getValue() - min) / (max - min))); + if ( max > min ) { + for ( final Map.Entry ce : counts.entrySet() ) { + result.set(ce.getKey(), (int) (((double) count) * (ce.getValue() - min) / (max - min))); + } } return this.ref; } @@ -698,14 +863,15 @@ public final class RWIProcess extends Thread { public void addTopic(final String[] words) { String word; - for (final String w : words) { + for ( final String w : words ) { word = 
w.toLowerCase(); - if (word.length() > 2 && - "http_html_php_ftp_www_com_org_net_gov_edu_index_home_page_for_usage_the_and_zum_der_die_das_und_the_zur_bzw_mit_blog_wiki_aus_bei_off".indexOf(word) < 0 && - !this.query.queryHashes.has(Word.word2hash(word)) && - lettermatch.matcher(word).matches() && - !Switchboard.badwords.contains(word) && - !Switchboard.stopwords.contains(word)) { + if ( word.length() > 2 + && "http_html_php_ftp_www_com_org_net_gov_edu_index_home_page_for_usage_the_and_zum_der_die_das_und_the_zur_bzw_mit_blog_wiki_aus_bei_off" + .indexOf(word) < 0 + && !this.query.queryHashes.has(Word.word2hash(word)) + && lettermatch.matcher(word).matches() + && !Switchboard.badwords.contains(word) + && !Switchboard.stopwords.contains(word) ) { this.ref.inc(word); } } @@ -713,7 +879,9 @@ public final class RWIProcess extends Thread { public void addTopics(final ResultEntry resultEntry) { // take out relevant information for reference computation - if ((resultEntry.url() == null) || (resultEntry.title() == null)) return; + if ( (resultEntry.url() == null) || (resultEntry.title() == null) ) { + return; + } //final String[] urlcomps = htmlFilterContentScraper.urlComps(resultEntry.url().toNormalform(true, true)); // word components of the url final String[] descrcomps = MultiProtocolURI.splitpattern.split(resultEntry.title().toLowerCase()); // words in the description @@ -725,8 +893,12 @@ public final class RWIProcess extends Thread { public ScoreMap getAuthorNavigator() { // create a list of words that had been computed by statistics over all // words that appeared in the url or the description of all urls - if (!this.query.navigators.equals("all") && this.query.navigators.indexOf("authors",0) < 0) return new ConcurrentScoreMap(); - if (this.authorNavigator.sizeSmaller(2)) this.authorNavigator.clear(); // navigators with one entry are not useful + if ( !this.query.navigators.equals("all") && this.query.navigators.indexOf("authors", 0) < 0 ) { + return new 
ConcurrentScoreMap(); + } + if ( this.authorNavigator.sizeSmaller(2) ) { + this.authorNavigator.clear(); // navigators with one entry are not useful + } return this.authorNavigator; } diff --git a/source/net/yacy/search/query/SearchEvent.java b/source/net/yacy/search/query/SearchEvent.java index 3c79ba71c..5c254d29b 100644 --- a/source/net/yacy/search/query/SearchEvent.java +++ b/source/net/yacy/search/query/SearchEvent.java @@ -58,11 +58,25 @@ import net.yacy.search.ranking.ReferenceOrder; import net.yacy.search.snippet.ResultEntry; import de.anomic.data.WorkTables; -public final class SearchEvent { +public final class SearchEvent +{ public enum Type { - INITIALIZATION, COLLECTION, JOIN, PRESORT, URLFETCH, NORMALIZING, FINALIZATION, - REMOTESEARCH_START, REMOTESEARCH_TERMINATE, ABSTRACTS, CLEANUP, SNIPPETFETCH_START, ONERESULT, REFERENCECOLLECTION, RESULTLIST; + INITIALIZATION, + COLLECTION, + JOIN, + PRESORT, + URLFETCH, + NORMALIZING, + FINALIZATION, + REMOTESEARCH_START, + REMOTESEARCH_TERMINATE, + ABSTRACTS, + CLEANUP, + SNIPPETFETCH_START, + ONERESULT, + REFERENCECOLLECTION, + RESULTLIST; } public static final int max_results_preparation = 3000; @@ -87,24 +101,30 @@ public final class SearchEvent { private byte[] IAmaxcounthash, IAneardhthash; private final ReferenceOrder order; - protected SearchEvent(final QueryParams query, - final SeedDB peers, - final WorkTables workTables, - final SortedMap preselectedPeerHashes, - final boolean generateAbstracts, - final LoaderDispatcher loader, - final int remote_maxcount, - final long remote_maxtime, - final int burstRobinsonPercent, - final int burstMultiwordPercent, - final boolean deleteIfSnippetFail) { - if (MemoryControl.available() < 1024 * 1024 * 100) SearchEventCache.cleanupEvents(true); + protected SearchEvent( + final QueryParams query, + final SeedDB peers, + final WorkTables workTables, + final SortedMap preselectedPeerHashes, + final boolean generateAbstracts, + final LoaderDispatcher loader, + final int 
remote_maxcount, + final long remote_maxtime, + final int burstRobinsonPercent, + final int burstMultiwordPercent, + final boolean deleteIfSnippetFail) { + if ( MemoryControl.available() < 1024 * 1024 * 100 ) { + SearchEventCache.cleanupEvents(true); + } this.eventTime = System.currentTimeMillis(); // for lifetime check this.peers = peers; this.workTables = workTables; this.query = query; - this.secondarySearchSuperviser = (this.query.queryHashes.size() > 1) ? new SecondarySearchSuperviser() : null; // generate abstracts only for combined searches - if (this.secondarySearchSuperviser != null) this.secondarySearchSuperviser.start(); + this.secondarySearchSuperviser = + (this.query.queryHashes.size() > 1) ? new SecondarySearchSuperviser() : null; // generate abstracts only for combined searches + if ( this.secondarySearchSuperviser != null ) { + this.secondarySearchSuperviser.start(); + } this.primarySearchThreads = null; this.secondarySearchThreads = null; this.preselectedPeerHashes = preselectedPeerHashes; @@ -115,7 +135,11 @@ public final class SearchEvent { this.IAneardhthash = null; this.localSearchThread = null; this.order = new ReferenceOrder(this.query.ranking, UTF8.getBytes(this.query.targetlang)); - final boolean remote = (peers != null && peers.sizeConnected() > 0) && (this.query.domType == QueryParams.Searchdom.CLUSTER || (this.query.domType == QueryParams.Searchdom.GLOBAL && peers.mySeed().getFlagAcceptRemoteIndex())); + final boolean remote = + (peers != null && peers.sizeConnected() > 0) + && (this.query.domType == QueryParams.Searchdom.CLUSTER || (this.query.domType == QueryParams.Searchdom.GLOBAL && peers + .mySeed() + .getFlagAcceptRemoteIndex())); final long start = System.currentTimeMillis(); // initialize a ranking process that is the target for data @@ -125,10 +149,11 @@ public final class SearchEvent { // start a local search concurrently this.rankingProcess.start(); - if (remote) { + if ( remote ) { // start global searches final long timer = 
System.currentTimeMillis(); - this.primarySearchThreads = (this.query.queryHashes.isEmpty()) ? null : RemoteSearch.primaryRemoteSearches( + this.primarySearchThreads = + (this.query.queryHashes.isEmpty()) ? null : RemoteSearch.primaryRemoteSearches( QueryParams.hashSet2hashString(this.query.queryHashes), QueryParams.hashSet2hashString(this.query.excludeHashes), this.query.prefer, @@ -151,22 +176,40 @@ public final class SearchEvent { (this.query.domType == QueryParams.Searchdom.GLOBAL) ? null : preselectedPeerHashes, burstRobinsonPercent, burstMultiwordPercent); - if (this.primarySearchThreads != null) { - Log.logFine("SEARCH_EVENT", "STARTING " + this.primarySearchThreads.length + " THREADS TO CATCH EACH " + remote_maxcount + " URLs"); + if ( this.primarySearchThreads != null ) { + Log.logFine("SEARCH_EVENT", "STARTING " + + this.primarySearchThreads.length + + " THREADS TO CATCH EACH " + + remote_maxcount + + " URLs"); this.rankingProcess.moreFeeders(this.primarySearchThreads.length); - EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch(this.query.id(true), Type.REMOTESEARCH_START, "", this.primarySearchThreads.length, System.currentTimeMillis() - timer), false); + this.rankingProcess.setExpectedRemoteReferences(this.primarySearchThreads.length + * remote_maxcount); + EventTracker.update( + EventTracker.EClass.SEARCH, + new ProfilingGraph.EventSearch( + this.query.id(true), + Type.REMOTESEARCH_START, + "", + this.primarySearchThreads.length, + System.currentTimeMillis() - timer), + false); // finished searching - Log.logFine("SEARCH_EVENT", "SEARCH TIME AFTER GLOBAL-TRIGGER TO " + this.primarySearchThreads.length + " PEERS: " + ((System.currentTimeMillis() - start) / 1000) + " seconds"); + Log.logFine("SEARCH_EVENT", "SEARCH TIME AFTER GLOBAL-TRIGGER TO " + + this.primarySearchThreads.length + + " PEERS: " + + ((System.currentTimeMillis() - start) / 1000) + + " seconds"); } else { // no search since query is empty, user might have 
entered no data or filters have removed all search words Log.logFine("SEARCH_EVENT", "NO SEARCH STARTED DUE TO EMPTY SEARCH REQUEST."); } } else { - if (generateAbstracts) { + if ( generateAbstracts ) { // we need the results now try { this.rankingProcess.join(); - } catch (final Throwable e) { + } catch ( final Throwable e ) { } // compute index abstracts final long timer = System.currentTimeMillis(); @@ -174,30 +217,45 @@ public final class SearchEvent { long mindhtdistance = Long.MAX_VALUE, l; byte[] wordhash; assert this.rankingProcess.searchContainerMap() != null; - for (final Map.Entry> entry : this.rankingProcess.searchContainerMap().entrySet()) { + for ( final Map.Entry> entry : this.rankingProcess + .searchContainerMap() + .entrySet() ) { wordhash = entry.getKey(); final ReferenceContainer container = entry.getValue(); - assert (Base64Order.enhancedCoder.equal(container.getTermHash(), wordhash)) : "container.getTermHash() = " + ASCII.String(container.getTermHash()) + ", wordhash = " + ASCII.String(wordhash); - if (container.size() > maxcount) { + assert (Base64Order.enhancedCoder.equal(container.getTermHash(), wordhash)) : "container.getTermHash() = " + + ASCII.String(container.getTermHash()) + + ", wordhash = " + + ASCII.String(wordhash); + if ( container.size() > maxcount ) { this.IAmaxcounthash = wordhash; maxcount = container.size(); } l = FlatWordPartitionScheme.std.dhtDistance(wordhash, null, peers.mySeed()); - if (l < mindhtdistance) { + if ( l < mindhtdistance ) { // calculate the word hash that is closest to our dht position mindhtdistance = l; this.IAneardhthash = wordhash; } this.IACount.put(wordhash, LargeNumberCache.valueOf(container.size())); - this.IAResults.put(wordhash, WordReferenceFactory.compressIndex(container, null, 1000).toString()); + this.IAResults.put(wordhash, WordReferenceFactory + .compressIndex(container, null, 1000) + .toString()); } - EventTracker.update(EventTracker.EClass.SEARCH, new 
ProfilingGraph.EventSearch(this.query.id(true), Type.ABSTRACTS, "", this.rankingProcess.searchContainerMap().size(), System.currentTimeMillis() - timer), false); + EventTracker.update( + EventTracker.EClass.SEARCH, + new ProfilingGraph.EventSearch( + this.query.id(true), + Type.ABSTRACTS, + "", + this.rankingProcess.searchContainerMap().size(), + System.currentTimeMillis() - timer), + false); } else { // give process time to accumulate a certain amount of data // before a reading process wants to get results from it try { this.rankingProcess.join(100); - } catch (final Throwable e) { + } catch ( final Throwable e ) { } // this will reduce the maximum waiting time until results are available to 100 milliseconds // while we always get a good set of ranked data @@ -205,115 +263,157 @@ public final class SearchEvent { } // start worker threads to fetch urls and snippets - this.resultFetcher = new SnippetProcess(loader, this.rankingProcess, this.query, this.peers, this.workTables, 5000, deleteIfSnippetFail); + this.resultFetcher = + new SnippetProcess( + loader, + this.rankingProcess, + this.query, + this.peers, + this.workTables, + 5000, + deleteIfSnippetFail); // clean up events SearchEventCache.cleanupEvents(false); - EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch(this.query.id(true), Type.CLEANUP, "", 0, 0), false); + EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch( + this.query.id(true), + Type.CLEANUP, + "", + 0, + 0), false); // store this search to a cache so it can be re-used - if (MemoryControl.available() < 1024 * 1024 * 100) SearchEventCache.cleanupEvents(true); + if ( MemoryControl.available() < 1024 * 1024 * 100 ) { + SearchEventCache.cleanupEvents(true); + } SearchEventCache.put(this.query.id(false), this); - } - - - public ReferenceOrder getOrder() { - return this.order; - } - - public long getEventTime() { - return this.eventTime; - } - - public void resetEventTime() { - this.eventTime = 
System.currentTimeMillis(); - } - - public QueryParams getQuery() { - return this.query; - } - - public void setQuery(final QueryParams query) { - this.query = query; - this.resultFetcher.query = query; - } - - public void cleanup() { - this.resultFetcher.setCleanupState(); - - // stop all threads - if (this.primarySearchThreads != null) { - for (final RemoteSearch search : this.primarySearchThreads) { - if (search != null) synchronized (search) { - if (search.isAlive()) search.interrupt(); - } - } - } - if (this.secondarySearchThreads != null) { - for (final RemoteSearch search : this.secondarySearchThreads) { - if (search != null) synchronized (search) { - if (search.isAlive()) search.interrupt(); - } - } - } - - // call the worker threads and ask them to stop - for (final Worker w: this.resultFetcher.workerThreads) { - if (w != null && w.isAlive()) { - w.pleaseStop(); - w.interrupt(); - // the interrupt may occur during a MD5 computation which is resistant against interruption - // therefore set some more interrupts on the process - int ic = 10; - while (ic-- > 0 & w.isAlive()) w.interrupt(); - } - } - - // clear all data structures - if (this.preselectedPeerHashes != null) this.preselectedPeerHashes.clear(); - if (this.localSearchThread != null) if (this.localSearchThread.isAlive()) this.localSearchThread.interrupt(); - if (this.IACount != null) this.IACount.clear(); - if (this.IAResults != null) this.IAResults.clear(); - if (this.heuristics != null) this.heuristics.clear(); - } - - public Iterator> abstractsString() { - return this.IAResults.entrySet().iterator(); - } - - public String abstractsString(final byte[] hash) { - return this.IAResults.get(hash); - } - - public Iterator> abstractsCount() { - return this.IACount.entrySet().iterator(); - } - - public int abstractsCount(final byte[] hash) { - final Integer i = this.IACount.get(hash); - if (i == null) return -1; - return i.intValue(); - } - - public byte[] getAbstractsMaxCountHash() { - return 
this.IAmaxcounthash; - } - - public byte[] getAbstractsNearDHTHash() { - return this.IAneardhthash; - } - - boolean anyRemoteSearchAlive() { + } + + public ReferenceOrder getOrder() { + return this.order; + } + + public long getEventTime() { + return this.eventTime; + } + + public void resetEventTime() { + this.eventTime = System.currentTimeMillis(); + } + + public QueryParams getQuery() { + return this.query; + } + + public void setQuery(final QueryParams query) { + this.query = query; + this.resultFetcher.query = query; + } + + public void cleanup() { + this.resultFetcher.setCleanupState(); + + // stop all threads + if ( this.primarySearchThreads != null ) { + for ( final RemoteSearch search : this.primarySearchThreads ) { + if ( search != null ) { + synchronized ( search ) { + if ( search.isAlive() ) { + search.interrupt(); + } + } + } + } + } + if ( this.secondarySearchThreads != null ) { + for ( final RemoteSearch search : this.secondarySearchThreads ) { + if ( search != null ) { + synchronized ( search ) { + if ( search.isAlive() ) { + search.interrupt(); + } + } + } + } + } + + // call the worker threads and ask them to stop + for ( final Worker w : this.resultFetcher.workerThreads ) { + if ( w != null && w.isAlive() ) { + w.pleaseStop(); + w.interrupt(); + // the interrupt may occur during a MD5 computation which is resistant against interruption + // therefore set some more interrupts on the process + int ic = 10; + while ( ic-- > 0 & w.isAlive() ) { + w.interrupt(); + } + } + } + + // clear all data structures + if ( this.preselectedPeerHashes != null ) { + this.preselectedPeerHashes.clear(); + } + if ( this.localSearchThread != null ) { + if ( this.localSearchThread.isAlive() ) { + this.localSearchThread.interrupt(); + } + } + if ( this.IACount != null ) { + this.IACount.clear(); + } + if ( this.IAResults != null ) { + this.IAResults.clear(); + } + if ( this.heuristics != null ) { + this.heuristics.clear(); + } + } + + public Iterator> abstractsString() 
{ + return this.IAResults.entrySet().iterator(); + } + + public String abstractsString(final byte[] hash) { + return this.IAResults.get(hash); + } + + public Iterator> abstractsCount() { + return this.IACount.entrySet().iterator(); + } + + public int abstractsCount(final byte[] hash) { + final Integer i = this.IACount.get(hash); + if ( i == null ) { + return -1; + } + return i.intValue(); + } + + public byte[] getAbstractsMaxCountHash() { + return this.IAmaxcounthash; + } + + public byte[] getAbstractsNearDHTHash() { + return this.IAneardhthash; + } + + boolean anyRemoteSearchAlive() { // check primary search threads - if ((this.primarySearchThreads != null) && (this.primarySearchThreads.length != 0)) { - for (final RemoteSearch primarySearchThread : this.primarySearchThreads) { - if ((primarySearchThread != null) && (primarySearchThread.isAlive())) return true; + if ( (this.primarySearchThreads != null) && (this.primarySearchThreads.length != 0) ) { + for ( final RemoteSearch primarySearchThread : this.primarySearchThreads ) { + if ( (primarySearchThread != null) && (primarySearchThread.isAlive()) ) { + return true; + } } } // maybe a secondary search thread is alive, check this - if ((this.secondarySearchThreads != null) && (this.secondarySearchThreads.length != 0)) { - for (final RemoteSearch secondarySearchThread : this.secondarySearchThreads) { - if ((secondarySearchThread != null) && (secondarySearchThread.isAlive())) return true; + if ( (this.secondarySearchThreads != null) && (this.secondarySearchThreads.length != 0) ) { + for ( final RemoteSearch secondarySearchThread : this.secondarySearchThreads ) { + if ( (secondarySearchThread != null) && (secondarySearchThread.isAlive()) ) { + return true; + } } } return false; @@ -358,13 +458,13 @@ public final class SearchEvent { } public void addHeuristic(final byte[] urlhash, final String heuristicName, final boolean redundant) { - synchronized (this.heuristics) { + synchronized ( this.heuristics ) { 
this.heuristics.put(urlhash, new HeuristicResult(urlhash, heuristicName, redundant)); } } public HeuristicResult getHeuristic(final byte[] urlhash) { - synchronized (this.heuristics) { + synchronized ( this.heuristics ) { return this.heuristics.get(urlhash); } } @@ -386,23 +486,30 @@ public final class SearchEvent { //boolean secondarySearchStartet = false; - public static class HeuristicResult /*implements Comparable*/ { - private final byte[] urlhash; public final String heuristicName; public final boolean redundant; + public static class HeuristicResult /*implements Comparable*/ + { + private final byte[] urlhash; + public final String heuristicName; + public final boolean redundant; + private HeuristicResult(final byte[] urlhash, final String heuristicName, final boolean redundant) { - this.urlhash = urlhash; this.heuristicName = heuristicName; this.redundant = redundant; + this.urlhash = urlhash; + this.heuristicName = heuristicName; + this.redundant = redundant; }/* - public int compareTo(HeuristicResult o) { + public int compareTo(HeuristicResult o) { return Base64Order.enhancedCoder.compare(this.urlhash, o.urlhash); - } - public int hashCode() { + } + public int hashCode() { return (int) Base64Order.enhancedCoder.cardinal(this.urlhash); - } - public boolean equals(Object o) { + } + public boolean equals(Object o) { return Base64Order.enhancedCoder.equal(this.urlhash, ((HeuristicResult) o).urlhash); - }*/ + }*/ } - public class SecondarySearchSuperviser extends Thread { + public class SecondarySearchSuperviser extends Thread + { // cache for index abstracts; word:TreeMap mapping where the embedded TreeMap is a urlhash:peerlist relation // this relation contains the information where specific urls can be found in specific peers @@ -418,14 +525,15 @@ public final class SearchEvent { /** * add a single abstract to the existing set of abstracts + * * @param wordhash * @param singleAbstract // a mapping from url-hashes to a string of peer-hashes */ public void 
addAbstract(final String wordhash, final TreeMap singleAbstract) { final SortedMap oldAbstract; - synchronized (this.abstractsCache) { + synchronized ( this.abstractsCache ) { oldAbstract = this.abstractsCache.get(wordhash); - if (oldAbstract == null) { + if ( oldAbstract == null ) { // new abstracts in the cache this.abstractsCache.put(wordhash, singleAbstract); return; @@ -433,13 +541,16 @@ public final class SearchEvent { } // extend the abstracts in the cache: join the single abstracts new Thread() { + @Override public void run() { - for (final Map.Entry oneref: singleAbstract.entrySet()) { + for ( final Map.Entry oneref : singleAbstract.entrySet() ) { final String urlhash = oneref.getKey(); final StringBuilder peerlistNew = oneref.getValue(); - synchronized (oldAbstract) { + synchronized ( oldAbstract ) { final StringBuilder peerlistOld = oldAbstract.put(urlhash, peerlistNew); - if (peerlistOld != null) peerlistOld.append(peerlistNew); + if ( peerlistOld != null ) { + peerlistOld.append(peerlistNew); + } } } } @@ -458,23 +569,26 @@ public final class SearchEvent { SortedMap urlPeerlist; int p; boolean hasURL; - synchronized (this) { - final Iterator>> i = this.abstractsCache.entrySet().iterator(); - while (i.hasNext()) { + synchronized ( this ) { + final Iterator>> i = + this.abstractsCache.entrySet().iterator(); + while ( i.hasNext() ) { entry = i.next(); word = entry.getKey(); urlPeerlist = entry.getValue(); hasURL = true; - for (int j = 0; j < urls.length(); j = j + 12) { + for ( int j = 0; j < urls.length(); j = j + 12 ) { url = urls.substring(j, j + 12); peerlist = urlPeerlist.get(url); p = (peerlist == null) ? 
-1 : peerlist.indexOf(peerhash); - if ((p < 0) || (p % 12 != 0)) { + if ( (p < 0) || (p % 12 != 0) ) { hasURL = false; break; } } - if (hasURL) wordlist += word; + if ( hasURL ) { + wordlist += word; + } } } return wordlist; @@ -484,21 +598,26 @@ public final class SearchEvent { public void run() { try { int t = 0; - while (this.trigger.tryAcquire(10000, TimeUnit.MILLISECONDS)) { + while ( this.trigger.tryAcquire(10000, TimeUnit.MILLISECONDS) ) { // a trigger was released prepareSecondarySearch(); t++; - if (t > 10) break; + if ( t > 10 ) { + break; + } } - } catch (final InterruptedException e) { + } catch ( final InterruptedException e ) { // the thread was interrupted // do nothing } - // the time-out was reached + // the time-out was reached } private void prepareSecondarySearch() { - if (this.abstractsCache == null || this.abstractsCache.size() != SearchEvent.this.query.queryHashes.size()) return; // secondary search not possible (yet) + if ( this.abstractsCache == null + || this.abstractsCache.size() != SearchEvent.this.query.queryHashes.size() ) { + return; // secondary search not possible (yet) + } // catch up index abstracts and join them; then call peers again to submit their urls @@ -510,12 +629,17 @@ public final class SearchEvent { */ // find out if there are enough references for all words that are searched - if (this.abstractsCache.size() != SearchEvent.this.query.queryHashes.size()) return; + if ( this.abstractsCache.size() != SearchEvent.this.query.queryHashes.size() ) { + return; + } // join all the urlhash:peerlist relations: the resulting map has values with a combined peer-list list - final SortedMap abstractJoin = SetTools.joinConstructive(this.abstractsCache.values(), true); - if (abstractJoin.isEmpty()) return; - // the join result is now a urlhash: peer-list relation + final SortedMap abstractJoin = + SetTools.joinConstructive(this.abstractsCache.values(), true); + if ( abstractJoin.isEmpty() ) { + return; + // the join result is now a 
urlhash: peer-list relation + } // generate a list of peers that have the urls for the joined search result final SortedMap secondarySearchURLs = new TreeMap(); // a (peerhash:urlhash-liststring) mapping @@ -524,17 +648,19 @@ public final class SearchEvent { final String mypeerhash = SearchEvent.this.peers.mySeed().hash; boolean mypeerinvolved = false; int mypeercount; - for (final Map.Entry entry: abstractJoin.entrySet()) { + for ( final Map.Entry entry : abstractJoin.entrySet() ) { url = entry.getKey(); peerlist = entry.getValue(); //System.out.println("DEBUG-INDEXABSTRACT: url " + url + ": from peers " + peerlist); mypeercount = 0; - for (int j = 0; j < peerlist.length(); j += 12) { + for ( int j = 0; j < peerlist.length(); j += 12 ) { peer = peerlist.substring(j, j + 12); - if ((peer.equals(mypeerhash)) && (mypeercount++ > 1)) continue; + if ( (peer.equals(mypeerhash)) && (mypeercount++ > 1) ) { + continue; + } //if (peers.indexOf(peer) < j) continue; // avoid doubles that may appear in the abstractJoin urls = secondarySearchURLs.get(peer); - if (urls == null) { + if ( urls == null ) { urls = new StringBuilder(24); urls.append(url); secondarySearchURLs.put(peer, urls); @@ -543,27 +669,47 @@ public final class SearchEvent { } secondarySearchURLs.put(peer, urls); } - if (mypeercount == 1) mypeerinvolved = true; + if ( mypeercount == 1 ) { + mypeerinvolved = true; + } } // compute words for secondary search and start the secondary searches String words; - SearchEvent.this.secondarySearchThreads = new RemoteSearch[(mypeerinvolved) ? secondarySearchURLs.size() - 1 : secondarySearchURLs.size()]; + SearchEvent.this.secondarySearchThreads = + new RemoteSearch[(mypeerinvolved) ? 
secondarySearchURLs.size() - 1 : secondarySearchURLs + .size()]; int c = 0; - for (final Map.Entry entry: secondarySearchURLs.entrySet()) { + for ( final Map.Entry entry : secondarySearchURLs.entrySet() ) { peer = entry.getKey(); - if (peer.equals(mypeerhash)) continue; // we don't need to ask ourself - if (this.checkedPeers.contains(peer)) continue; // do not ask a peer again + if ( peer.equals(mypeerhash) ) { + continue; // we don't need to ask ourself + } + if ( this.checkedPeers.contains(peer) ) { + continue; // do not ask a peer again + } urls = entry.getValue(); words = wordsFromPeer(peer, urls); - if (words.length() == 0) continue; // ??? + if ( words.length() == 0 ) { + continue; // ??? + } assert words.length() >= 12 : "words = " + words; //System.out.println("DEBUG-INDEXABSTRACT ***: peer " + peer + " has urls: " + urls + " from words: " + words); SearchEvent.this.rankingProcess.moreFeeders(1); this.checkedPeers.add(peer); - SearchEvent.this.secondarySearchThreads[c++] = RemoteSearch.secondaryRemoteSearch( - words, urls.toString(), 6000, SearchEvent.this.query.getSegment(), SearchEvent.this.peers, SearchEvent.this.rankingProcess, peer, Switchboard.urlBlacklist, - SearchEvent.this.query.ranking, SearchEvent.this.query.constraint, SearchEvent.this.preselectedPeerHashes); + SearchEvent.this.secondarySearchThreads[c++] = + RemoteSearch.secondaryRemoteSearch( + words, + urls.toString(), + 6000, + SearchEvent.this.query.getSegment(), + SearchEvent.this.peers, + SearchEvent.this.rankingProcess, + peer, + Switchboard.urlBlacklist, + SearchEvent.this.query.ranking, + SearchEvent.this.query.constraint, + SearchEvent.this.preselectedPeerHashes); } } @@ -575,8 +721,14 @@ public final class SearchEvent { } public boolean workerAlive() { - if (this.resultFetcher== null || this.resultFetcher.workerThreads == null) return false; - for (final Worker w: this.resultFetcher.workerThreads) if (w != null && w.isAlive()) return true; + if ( this.resultFetcher == null || 
this.resultFetcher.workerThreads == null ) { + return false; + } + for ( final Worker w : this.resultFetcher.workerThreads ) { + if ( w != null && w.isAlive() ) { + return true; + } + } return false; }