From a8a5df4a51f75b7c0fa75b3318fa33ccc3d59737 Mon Sep 17 00:00:00 2001 From: orbiter Date: Wed, 30 Jan 2008 21:58:30 +0000 Subject: [PATCH] - more dublin core naming of page metadata - better presentation of result counters in search results git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4420 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/Bookmarks.java | 8 +- htroot/CrawlResults.java | 4 +- htroot/IndexControlRWIs_p.java | 6 +- htroot/IndexControlURLs_p.java | 2 +- htroot/ViewFile.java | 2 +- htroot/js/yacysearch.js | 11 ++- htroot/yacy/search.java | 6 +- htroot/yacy/urls.java | 6 +- htroot/yacysearch.html | 2 +- htroot/yacysearch.java | 23 +++--- htroot/yacysearchitem.html | 2 +- htroot/yacysearchitem.java | 10 ++- source/de/anomic/index/indexURLEntry.java | 48 +++++------ .../de/anomic/kelondro/kelondroEcoTable.java | 1 + source/de/anomic/plasma/plasmaCrawlLURL.java | 10 +-- source/de/anomic/plasma/plasmaSearchAPI.java | 4 +- .../de/anomic/plasma/plasmaSearchEvent.java | 28 +++---- .../plasma/plasmaSearchRankingProcess.java | 82 ++++++++++++------- .../de/anomic/plasma/plasmaSnippetCache.java | 2 +- .../de/anomic/plasma/plasmaSwitchboard.java | 16 ++-- source/de/anomic/yacy/yacyClient.java | 7 +- 21 files changed, 155 insertions(+), 125 deletions(-) diff --git a/htroot/Bookmarks.java b/htroot/Bookmarks.java index 9402a8814..4928b88b1 100644 --- a/htroot/Bookmarks.java +++ b/htroot/Bookmarks.java @@ -203,10 +203,10 @@ public class Bookmarks { document = plasmaSnippetCache.retrieveDocument(comp.url(), true, 5000, true); prop.put("mode_edit", "0"); // create mode prop.put("mode_url", comp.url().toNormalform(false, true)); - prop.putHTML("mode_title", comp.title()); - prop.putHTML("mode_description", (document == null) ? comp.title(): document.dc_title()); - prop.putHTML("mode_author", comp.author()); - prop.putHTML("mode_tags", (document == null) ? comp.tags() : document.dc_subject(',')); + prop.putHTML("mode_title", comp.dc_title()); + prop.putHTML("mode_description", (document == null) ? comp.dc_title(): document.dc_title()); + prop.putHTML("mode_author", comp.dc_creator()); + prop.putHTML("mode_tags", (document == null) ? comp.dc_subject() : document.dc_subject(',')); prop.putHTML("mode_path",""); prop.put("mode_public", "0"); prop.put("mode_feed", "0"); //TODO: check if it IS a feed diff --git a/htroot/CrawlResults.java b/htroot/CrawlResults.java index 53eab501a..d842efd4d 100644 --- a/htroot/CrawlResults.java +++ b/htroot/CrawlResults.java @@ -217,11 +217,11 @@ public class CrawlResults { } else { prop.put("table_indexed_" + cnt + "_showTitle_available", "1"); - if (comp.title() == null || comp.title().trim().length() == 0) + if (comp.dc_title() == null || comp.dc_title().trim().length() == 0) prop.put("table_indexed_" + cnt + "_showTitle_available_nodescr", "0"); else prop.put("table_indexed_" + cnt + "_showTitle_available_nodescr", "1"); - prop.putHTML("table_indexed_" + cnt + "_showTitle_available_nodescr_urldescr", comp.title()); + prop.putHTML("table_indexed_" + cnt + "_showTitle_available_nodescr_urldescr", comp.dc_title()); prop.put("table_indexed_" + cnt + "_showTitle_available_cachepath", cachepath); prop.putHTML("table_indexed_" + cnt + "_showTitle_available_urltitle", urlstr); diff --git a/htroot/IndexControlRWIs_p.java b/htroot/IndexControlRWIs_p.java index 9342d6ede..2b1e76515 100644 --- a/htroot/IndexControlRWIs_p.java +++ b/htroot/IndexControlRWIs_p.java @@ -89,7 +89,7 @@ public class IndexControlRWIs_p { if (post.containsKey("keystringsearch")) { keyhash = plasmaCondenser.word2hash(keystring); prop.put("keyhash", keyhash); - final plasmaSearchRankingProcess ranking = plasmaSearchAPI.genSearchresult(prop, sb, keyhash, null, sortorder, false); + final plasmaSearchRankingProcess ranking = plasmaSearchAPI.genSearchresult(prop, sb, keyhash, null, sortorder); if (ranking.filteredCount() == 0) { prop.put("searchresult", 1); prop.put("searchresult_word", keystring); @@ -100,7 +100,7 @@ public class IndexControlRWIs_p { if (keystring.length() == 0 || !plasmaCondenser.word2hash(keystring).equals(keyhash)) { prop.put("keystring", "<not possible to compute word from hash>"); } - final plasmaSearchRankingProcess ranking = plasmaSearchAPI.genSearchresult(prop, sb, keyhash, null, sortorder, false); + final plasmaSearchRankingProcess ranking = plasmaSearchAPI.genSearchresult(prop, sb, keyhash, null, sortorder); if (ranking.filteredCount() == 0) { prop.put("searchresult", 2); prop.put("searchresult_wordhash", keyhash); @@ -159,7 +159,7 @@ public class IndexControlRWIs_p { } kelondroBitfield flags = plasmaSearchAPI.compileFlags(post); int count = (post.get("lines", "all").equals("all")) ? -1 : post.getInt("lines", -1); - final plasmaSearchRankingProcess ranking = plasmaSearchAPI.genSearchresult(prop, sb, keyhash, flags, sortorder, true); + final plasmaSearchRankingProcess ranking = plasmaSearchAPI.genSearchresult(prop, sb, keyhash, flags, sortorder); plasmaSearchAPI.genURLList(prop, keyhash, keystring, ranking, flags, count, sortorder); } diff --git a/htroot/IndexControlURLs_p.java b/htroot/IndexControlURLs_p.java index f393c0c1b..18fe83e82 100644 --- a/htroot/IndexControlURLs_p.java +++ b/htroot/IndexControlURLs_p.java @@ -241,7 +241,7 @@ public class IndexControlURLs_p { prop.put("genUrlProfile", "2"); prop.putHTML("genUrlProfile_urlNormalform", comp.url().toNormalform(false, true)); prop.put("genUrlProfile_urlhash", urlhash); - prop.put("genUrlProfile_urlDescr", comp.title()); + prop.put("genUrlProfile_urlDescr", comp.dc_title()); prop.put("genUrlProfile_moddate", entry.moddate().toString()); prop.put("genUrlProfile_loaddate", entry.loaddate().toString()); prop.put("genUrlProfile_referrer", (le == null) ? 0 : 1); diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java index 14a708017..2ad6fa099 100644 --- a/htroot/ViewFile.java +++ b/htroot/ViewFile.java @@ -124,7 +124,7 @@ public class ViewFile { return prop; } url = comp.url(); - descr = comp.title(); + descr = comp.dc_title(); urlEntry.wordCount(); size = urlEntry.size(); pre = urlEntry.flags().get(plasmaCondenser.flag_cat_indexof); diff --git a/htroot/js/yacysearch.js b/htroot/js/yacysearch.js index 2ec23214c..febcc936a 100644 --- a/htroot/js/yacysearch.js +++ b/htroot/js/yacysearch.js @@ -70,9 +70,12 @@ function addHover() { } } -function statistics(offset, items, global, total) { +function statistics(offset, itemscount, totalcount, localResourceSize, remoteResourceSize, remoteIndexCount, remotePeerCount) { document.getElementById("resultsOffset").firstChild.nodeValue = offset; - document.getElementById("itemscount").firstChild.nodeValue = items; - document.getElementById("globalcount").firstChild.nodeValue = global; - document.getElementById("totalcount").firstChild.nodeValue = total; + document.getElementById("itemscount").firstChild.nodeValue = itemscount; + document.getElementById("totalcount").firstChild.nodeValue = totalcount; + document.getElementById("localResourceSize").firstChild.nodeValue = localResourceSize; + document.getElementById("remoteResourceSize").firstChild.nodeValue = remoteResourceSize; + document.getElementById("remoteIndexCount").firstChild.nodeValue = remoteIndexCount; + document.getElementById("remotePeerCount").firstChild.nodeValue = remotePeerCount; } \ No newline at end of file diff --git a/htroot/yacy/search.java b/htroot/yacy/search.java index 7be788821..39d734df8 100644 --- a/htroot/yacy/search.java +++ b/htroot/yacy/search.java @@ -183,7 +183,7 @@ public final class search { snippetComputationAllTime = theSearch.getSnippetComputationTime(); // set statistic details of search result and find best result index set - if (theSearch.getLocalCount() == 0) { + if (theSearch.getRankingResult().getLocalResourceSize() == 0) { prop.put("indexcount", ""); prop.put("joincount", "0"); } else { @@ -207,11 +207,11 @@ public final class search { } prop.put("indexcount", indexcount.toString()); - if (theSearch.getLocalCount() == 0) { + if (theSearch.getRankingResult().getLocalResourceSize() == 0) { joincount = 0; prop.put("joincount", "0"); } else { - joincount = theSearch.getLocalCount(); + joincount = theSearch.getRankingResult().getLocalResourceSize(); prop.put("joincount", Integer.toString(joincount)); accu = theSearch.completeResults(duetime); } diff --git a/htroot/yacy/urls.java b/htroot/yacy/urls.java index 5a0a08a11..c114d6a75 100644 --- a/htroot/yacy/urls.java +++ b/htroot/yacy/urls.java @@ -104,11 +104,11 @@ public class urls { referrer = sb.getURL(entry.referrerHash()); // create RSS entry comp = entry.comp(); - prop.put("item_" + c + "_title", comp.title()); + prop.put("item_" + c + "_title", comp.dc_title()); prop.putHTML("item_" + c + "_link", comp.url().toNormalform(true, false)); prop.putHTML("item_" + c + "_referrer", (referrer == null) ? "" : referrer.toNormalform(true, false)); - prop.putHTML("item_" + c + "_description", comp.title()); - prop.put("item_" + c + "_author", comp.author()); + prop.putHTML("item_" + c + "_description", comp.dc_title()); + prop.put("item_" + c + "_author", comp.dc_creator()); prop.put("item_" + c + "_pubDate", serverDate.formatShortSecond(entry.moddate())); prop.put("item_" + c + "_guid", entry.hash()); c++; diff --git a/htroot/yacysearch.html b/htroot/yacysearch.html index df304a9f7..4f0ddfd22 100644 --- a/htroot/yacysearch.html +++ b/htroot/yacysearch.html @@ -99,7 +99,7 @@ document.getElementById("Enter").value = "search again - catch up more links";

No Results. (length of search words must be at least 3 characters)

::
- #[offset]#-#[linkcount]# results from a total number of #[totalcount]# known#(globalresults)#.::, #[globalcount]# links from other YaCy peers.#(/globalresults)# + #[offset]#-#[itemscount]# results from a total number of #[totalcount]# known#(globalresults)#.:: (#[localResourceSize]# local, #[remoteResourceSize]# remote), #[remoteIndexCount]# links from #[remotePeerCount]# other YaCy peers.#(/globalresults)# #[resnav]# ::

Searching the web with this peer is disabled for unauthorized users. Please log in as administrator to use the search function

diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index bb4e1ac71..491cb9269 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -240,8 +240,8 @@ public class yacysearch { // create a news message HashMap map = new HashMap(); map.put("url", comp.url().toNormalform(false, true).replace(',', '|')); - map.put("title", comp.title().replace(',', ' ')); - map.put("description", ((document == null) ? comp.title() : document.dc_title()).replace(',', ' ')); + map.put("title", comp.dc_title().replace(',', ' ')); + map.put("description", ((document == null) ? comp.dc_title() : document.dc_title()).replace(',', ' ')); map.put("author", ((document == null) ? "" : document.dc_creator())); map.put("tags", ((document == null) ? "" : document.dc_subject(' '))); yacyCore.newsPool.publishMyNews(yacyNewsRecord.newRecord(yacyNewsPool.CATEGORY_SURFTIPP_ADD, map)); @@ -306,12 +306,12 @@ public class yacysearch { // log serverLog.logInfo("LOCAL_SEARCH", "EXIT WORD SEARCH: " + theQuery.queryString + " - " + - (theSearch.getLocalCount() + theSearch.getGlobalCount()) + " links found, " + + (theSearch.getRankingResult().getLocalResourceSize() + theSearch.getRankingResult().getRemoteResourceSize()) + " links found, " + ((System.currentTimeMillis() - timestamp) / 1000) + " seconds"); // prepare search statistics Long trackerHandle = new Long(System.currentTimeMillis()); - HashMap searchProfile = theQuery.resultProfile(theSearch.getLocalCount() + theSearch.getGlobalCount(), System.currentTimeMillis() - timestamp, theSearch.getURLRetrievalTime(), theSearch.getSnippetComputationTime()); + HashMap searchProfile = theQuery.resultProfile(theSearch.getRankingResult().getLocalResourceSize() + theSearch.getRankingResult().getRemoteResourceSize(), System.currentTimeMillis() - timestamp, theSearch.getURLRetrievalTime(), theSearch.getSnippetComputationTime()); searchProfile.put("querystring", theQuery.queryString); searchProfile.put("time", trackerHandle); searchProfile.put("host", client); @@ -323,13 +323,16 @@ public class yacysearch { sb.localSearchTracker.put(client, handles); prop = new serverObjects(); - prop.put("num-results_totalcount", yFormatter.number(theSearch.getLocalCount() + theSearch.getGlobalCount(), !rss)); - prop.put("num-results_globalresults", "1"); - prop.put("num-results_globalresults_globalcount", yFormatter.number(theSearch.getGlobalCount(), !rss)); prop.put("num-results_offset", offset); - prop.put("num-results_linkcount", "0"); + prop.put("num-results_itemscount", "0"); prop.put("num-results_itemsPerPage", itemsPerPage); - + prop.put("num-results_totalcount", yFormatter.number(theSearch.getRankingResult().getLocalResourceSize() + theSearch.getRankingResult().getRemoteResourceSize(), !rss)); + prop.put("num-results_globalresults", (globalsearch) ? "1" : "0"); + prop.put("num-results_globalresults_localResourceSize", yFormatter.number(theSearch.getRankingResult().getLocalResourceSize(), !rss)); + prop.put("num-results_globalresults_remoteResourceSize", yFormatter.number(theSearch.getRankingResult().getRemoteResourceSize(), !rss)); + prop.put("num-results_globalresults_remoteIndexCount", yFormatter.number(theSearch.getRankingResult().getRemoteIndexCount(), !rss)); + prop.put("num-results_globalresults_remotePeerCount", yFormatter.number(theSearch.getRankingResult().getRemotePeerCount(), !rss)); + // compose page navigation StringBuffer resnav = new StringBuffer(); int thispage = offset / theQuery.displayResults(); @@ -337,7 +340,7 @@ public class yacysearch { resnav.append(navurla(thispage - 1, display, theQuery)); resnav.append("< "); } - int numberofpages = Math.min(10, Math.min(thispage + 2, (theSearch.getGlobalCount() + theSearch.getLocalCount()) / theQuery.displayResults())); + int numberofpages = Math.min(10, Math.min(thispage + 2, (theSearch.getRankingResult().getRemoteResourceSize() + theSearch.getRankingResult().getLocalResourceSize()) / theQuery.displayResults())); for (int i = 0; i < numberofpages; i++) { if (i == thispage) { resnav.append(""); diff --git a/htroot/yacysearchitem.html b/htroot/yacysearchitem.html index accde7bee..b65002174 100644 --- a/htroot/yacysearchitem.html +++ b/htroot/yacysearchitem.html @@ -62,7 +62,7 @@ #(/rssreferences)# #(dynamic)#:: #(/dynamic)# diff --git a/htroot/yacysearchitem.java b/htroot/yacysearchitem.java index 048f602a8..8ca1f459d 100644 --- a/htroot/yacysearchitem.java +++ b/htroot/yacysearchitem.java @@ -44,6 +44,7 @@ import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; import de.anomic.tools.crypt; import de.anomic.tools.nxTools; +import de.anomic.tools.yFormatter; import de.anomic.yacy.yacyCore; import de.anomic.yacy.yacyNewsPool; import de.anomic.yacy.yacySeed; @@ -85,9 +86,12 @@ public class yacysearchitem { // dynamically update count values if (!rss) { prop.put("dynamic_offset", theQuery.neededResults() - theQuery.displayResults() + 1); - prop.put("dynamic_global", theSearch.getGlobalCount()); - prop.put("dynamic_total", theSearch.getGlobalCount() + theSearch.getLocalCount()); - prop.put("dynamic_items", (item < 0) ? theQuery.neededResults() : item + 1); + prop.put("dynamic_itemscount", (item < 0) ? theQuery.neededResults() : item + 1); + prop.put("dynamic_totalcount", yFormatter.number(theSearch.getRankingResult().getLocalResourceSize() + theSearch.getRankingResult().getRemoteResourceSize(), !rss)); + prop.put("dynamic_localResourceSize", yFormatter.number(theSearch.getRankingResult().getLocalResourceSize(), !rss)); + prop.put("dynamic_remoteResourceSize", yFormatter.number(theSearch.getRankingResult().getRemoteResourceSize(), !rss)); + prop.put("dynamic_remoteIndexCount", yFormatter.number(theSearch.getRankingResult().getRemoteIndexCount(), !rss)); + prop.put("dynamic_remotePeerCount", yFormatter.number(theSearch.getRankingResult().getRemotePeerCount(), !rss)); prop.put("dynamic", "1"); } diff --git a/source/de/anomic/index/indexURLEntry.java b/source/de/anomic/index/indexURLEntry.java index c317fe80f..a3877f2bb 100644 --- a/source/de/anomic/index/indexURLEntry.java +++ b/source/de/anomic/index/indexURLEntry.java @@ -120,9 +120,9 @@ public class indexURLEntry { public indexURLEntry( yacyURL url, - String descr, - String author, - String tags, + String dc_title, + String dc_creator, + String dc_subject, String ETag, Date mod, Date load, @@ -143,7 +143,7 @@ public class indexURLEntry { // create new entry and store it into database this.entry = rowdef.newEntry(); this.entry.setCol(col_hash, url.hash(), null); - this.entry.setCol(col_comp, encodeComp(url, descr, author, tags, ETag)); + this.entry.setCol(col_comp, encodeComp(url, dc_title, dc_creator, dc_subject, ETag)); encodeDate(col_mod, mod); encodeDate(col_load, load); encodeDate(col_fresh, fresh); @@ -175,12 +175,12 @@ public class indexURLEntry { return new Date(86400000 * this.entry.getColLong(col)); } - public static byte[] encodeComp(yacyURL url, String descr, String author, String tags, String ETag) { + public static byte[] encodeComp(yacyURL url, String dc_title, String dc_creator, String dc_subject, String ETag) { serverCharBuffer s = new serverCharBuffer(200); s.append(url.toNormalform(false, true)).append(10); - s.append(descr).append(10); - s.append(author).append(10); - s.append(tags).append(10); + s.append(dc_title).append(10); + s.append(dc_creator).append(10); + s.append(dc_subject).append(10); s.append(ETag).append(10); return s.toString().getBytes(); } @@ -203,13 +203,13 @@ public class indexURLEntry { url = null; } String descr = crypt.simpleDecode(prop.getProperty("descr", ""), null); if (descr == null) descr = ""; - String author = crypt.simpleDecode(prop.getProperty("author", ""), null); if (author == null) author = ""; + String dc_creator = crypt.simpleDecode(prop.getProperty("author", ""), null); if (dc_creator == null) dc_creator = ""; String tags = crypt.simpleDecode(prop.getProperty("tags", ""), null); if (tags == null) tags = ""; String ETag = crypt.simpleDecode(prop.getProperty("ETag", ""), null); if (ETag == null) ETag = ""; this.entry = rowdef.newEntry(); this.entry.setCol(col_hash, url.hash(), null); - this.entry.setCol(col_comp, encodeComp(url, descr, author, tags, ETag)); + this.entry.setCol(col_comp, encodeComp(url, descr, dc_creator, tags, ETag)); try { encodeDate(col_mod, serverDate.parseShortDay(prop.getProperty("mod", "20000101"))); } catch (ParseException e) { @@ -256,9 +256,9 @@ public class indexURLEntry { try { s.append("hash=").append(hash()); s.append(",url=").append(crypt.simpleEncode(comp.url().toNormalform(false, true))); - s.append(",descr=").append(crypt.simpleEncode(comp.title())); - s.append(",author=").append(crypt.simpleEncode(comp.author())); - s.append(",tags=").append(crypt.simpleEncode(comp.tags())); + s.append(",descr=").append(crypt.simpleEncode(comp.dc_title())); + s.append(",author=").append(crypt.simpleEncode(comp.dc_creator())); + s.append(",tags=").append(crypt.simpleEncode(comp.dc_subject())); s.append(",ETag=").append(crypt.simpleEncode(comp.ETag())); s.append(",mod=").append(serverDate.formatShortDay(moddate())); s.append(",load=").append(serverDate.formatShortDay(loaddate())); @@ -429,7 +429,7 @@ public class indexURLEntry { null, comp().url(), referrerHash(), - comp().title(), + comp().dc_title(), loaddate(), null, 0, @@ -455,7 +455,7 @@ public class indexURLEntry { public class Components { private yacyURL url; - private String title, author, tags, ETag; + private String dc_title, dc_creator, dc_subject, ETag; public Components(String url, String urlhash, String title, String author, String tags, String ETag) { try { @@ -463,22 +463,22 @@ public class indexURLEntry { } catch (MalformedURLException e) { this.url = null; } - this.title = title; - this.author = author; - this.tags = tags; + this.dc_title = title; + this.dc_creator = author; + this.dc_subject = tags; this.ETag = ETag; } public Components(yacyURL url, String descr, String author, String tags, String ETag) { this.url = url; - this.title = descr; - this.author = author; - this.tags = tags; + this.dc_title = descr; + this.dc_creator = author; + this.dc_subject = tags; this.ETag = ETag; } public yacyURL url() { return this.url; } - public String title() { return this.title; } - public String author() { return this.author; } - public String tags() { return this.tags; } + public String dc_title() { return this.dc_title; } + public String dc_creator() { return this.dc_creator; } + public String dc_subject() { return this.dc_subject; } public String ETag() { return this.ETag; } } diff --git a/source/de/anomic/kelondro/kelondroEcoTable.java b/source/de/anomic/kelondro/kelondroEcoTable.java index 939b0a50f..8dea37d30 100644 --- a/source/de/anomic/kelondro/kelondroEcoTable.java +++ b/source/de/anomic/kelondro/kelondroEcoTable.java @@ -343,6 +343,7 @@ public class kelondroEcoTable implements kelondroIndex { } else { // read old value kelondroRow.Entry v = table.get(i); + assert v != null; System.arraycopy(row.getPrimaryKeyBytes(), 0, b, 0, rowdef.primaryKeyLength); System.arraycopy(v.bytes(), 0, b, rowdef.primaryKeyLength, rowdef.objectsize - rowdef.primaryKeyLength); // write new value diff --git a/source/de/anomic/plasma/plasmaCrawlLURL.java b/source/de/anomic/plasma/plasmaCrawlLURL.java index e2a6da400..45b07940d 100644 --- a/source/de/anomic/plasma/plasmaCrawlLURL.java +++ b/source/de/anomic/plasma/plasmaCrawlLURL.java @@ -72,10 +72,10 @@ import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroCache; import de.anomic.kelondro.kelondroCloneableIterator; import de.anomic.kelondro.kelondroException; -import de.anomic.kelondro.kelondroSplitTable; import de.anomic.kelondro.kelondroIndex; import de.anomic.kelondro.kelondroRow; import de.anomic.kelondro.kelondroRowSet; +import de.anomic.kelondro.kelondroSplitTable; import de.anomic.plasma.urlPattern.plasmaURLPattern; import de.anomic.server.serverCodings; import de.anomic.server.logging.serverLog; @@ -623,14 +623,14 @@ public final class plasmaCrawlLURL { pw.println(url); } if (format == 1) { - pw.println("" + htmlTools.encodeUnicode2html(comp.title(), true, true) + "
"); + pw.println("" + htmlTools.encodeUnicode2html(comp.dc_title(), true, true) + "
"); } if (format == 2) { pw.println(""); - pw.println("" + htmlTools.encodeUnicode2html(comp.title(), true, true) + ""); + pw.println("" + htmlTools.encodeUnicode2html(comp.dc_title(), true, true) + ""); pw.println("" + yacyURL.escape(url) + ""); - if (comp.author().length() > 0) pw.println("" + htmlTools.encodeUnicode2html(comp.author(), true, true) + ""); - if (comp.tags().length() > 0) pw.println("" + htmlTools.encodeUnicode2html(comp.tags(), true, true) + ""); + if (comp.dc_creator().length() > 0) pw.println("" + htmlTools.encodeUnicode2html(comp.dc_creator(), true, true) + ""); + if (comp.dc_subject().length() > 0) pw.println("" + htmlTools.encodeUnicode2html(comp.dc_subject(), true, true) + ""); pw.println("" + entry.moddate().toString() + ""); pw.println("" + entry.hash() + ""); pw.println(""); diff --git a/source/de/anomic/plasma/plasmaSearchAPI.java b/source/de/anomic/plasma/plasmaSearchAPI.java index c2b17f3a7..3b5979fc5 100644 --- a/source/de/anomic/plasma/plasmaSearchAPI.java +++ b/source/de/anomic/plasma/plasmaSearchAPI.java @@ -88,10 +88,10 @@ public class plasmaSearchAPI { } } - public static plasmaSearchRankingProcess genSearchresult(serverObjects prop, plasmaSwitchboard sb, String keyhash, kelondroBitfield filter, int sortorder, boolean fetchURLs) { + public static plasmaSearchRankingProcess genSearchresult(serverObjects prop, plasmaSwitchboard sb, String keyhash, kelondroBitfield filter, int sortorder) { plasmaSearchQuery query = new plasmaSearchQuery(keyhash, -1, sb.getRanking(), filter); plasmaSearchRankingProcess ranked = new plasmaSearchRankingProcess(sb.wordIndex, query, sortorder, Integer.MAX_VALUE); - ranked.execQuery(fetchURLs); + ranked.execQuery(); if (ranked.filteredCount() == 0) { prop.put("searchresult", 2); diff --git a/source/de/anomic/plasma/plasmaSearchEvent.java b/source/de/anomic/plasma/plasmaSearchEvent.java index 685553c77..736732e58 100644 --- a/source/de/anomic/plasma/plasmaSearchEvent.java +++ b/source/de/anomic/plasma/plasmaSearchEvent.java @@ -76,7 +76,6 @@ public final class plasmaSearchEvent { public TreeMap IAResults; public TreeMap IACount; public String IAmaxcounthash, IAneardhthash; - private int localcount; private resultWorker[] workerThreads; private ArrayList resultList; //private int resultListLock; // a pointer that shows that all elements below this pointer are fixed and may not be changed again @@ -101,7 +100,6 @@ public final class plasmaSearchEvent { this.IACount = new TreeMap(); this.IAmaxcounthash = null; this.IAneardhthash = null; - this.localcount = 0; this.urlRetrievalAllTime = 0; this.snippetComputationAllTime = 0; this.workerThreads = null; @@ -157,8 +155,7 @@ public final class plasmaSearchEvent { } else { // do a local search this.rankedCache = new plasmaSearchRankingProcess(wordIndex, query, 2, max_results_preparation); - this.rankedCache.execQuery(true); - this.localcount = this.rankedCache.filteredCount(); + this.rankedCache.execQuery(); //plasmaWordIndex.Finding finding = wordIndex.retrieveURLs(query, false, 2, ranking, process); if (generateAbstracts) { @@ -249,8 +246,7 @@ public final class plasmaSearchEvent { // sort the local containers and truncate it to a limited count, // so following sortings together with the global results will be fast synchronized (rankedCache) { - rankedCache.execQuery(true); - localcount = rankedCache.filteredCount(); + rankedCache.execQuery(); } } } @@ -291,13 +287,13 @@ public final class plasmaSearchEvent { long startTime = System.currentTimeMillis(); indexURLEntry.Components comp = page.comp(); - String pagetitle = comp.title().toLowerCase(); + String pagetitle = comp.dc_title().toLowerCase(); if (comp.url() == null) { registerFailure(page.hash(), "url corrupted (null)"); return null; // rare case where the url is corrupted } String pageurl = comp.url().toString().toLowerCase(); - String pageauthor = comp.author().toLowerCase(); + String pageauthor = comp.dc_creator().toLowerCase(); long dbRetrievalTime = System.currentTimeMillis() - startTime; // check exclusion @@ -315,7 +311,7 @@ public final class plasmaSearchEvent { // check constraints if ((query.constraint != null) && (query.constraint.get(plasmaCondenser.flag_cat_indexof)) && - (!(comp.title().startsWith("Index of")))) { + (!(comp.dc_title().startsWith("Index of")))) { final Iterator wi = query.queryHashes.iterator(); while (wi.hasNext()) wordIndex.removeEntry((String) wi.next(), page.hash()); registerFailure(page.hash(), "index-of constraint not fullfilled"); @@ -423,12 +419,8 @@ public final class plasmaSearchEvent { return secondarySearchThreads; } - public int getLocalCount() { - return this.localcount; - } - - public int getGlobalCount() { - return this.rankedCache.getGlobalCount(); + public plasmaSearchRankingProcess getRankingResult() { + return this.rankedCache; } public long getURLRetrievalTime() { @@ -465,7 +457,7 @@ public final class plasmaSearchEvent { if ((query.onlineSnippetFetch) && (!event.anyWorkerAlive()) && (event.resultList.size() < query.neededResults() + 10) && - ((event.getLocalCount() + event.getGlobalCount()) > event.resultList.size())) { + ((event.getRankingResult().getLocalResourceSize() + event.getRankingResult().getRemoteResourceSize()) > event.resultList.size())) { // set new timeout event.eventTime = System.currentTimeMillis(); // start worker threads to fetch urls and snippets @@ -764,7 +756,7 @@ public final class plasmaSearchEvent { ("yacyshare " + filename.replace('?', ' ') + " " + - urlcomps.title()).getBytes(), "UTF-8").keySet(), + urlcomps.dc_title()).getBytes(), "UTF-8").keySet(), urlentry.hash()); wordIndex.loadedURL.remove(urlentry.hash()); // clean up throw new RuntimeException("index void"); @@ -794,7 +786,7 @@ public final class plasmaSearchEvent { return (alternative_urlname == null) ? urlcomps.url().toNormalform(false, true) : alternative_urlname; } public String title() { - return urlcomps.title(); + return urlcomps.dc_title(); } public plasmaSnippetCache.TextSnippet textSnippet() { return this.textSnippet; diff --git a/source/de/anomic/plasma/plasmaSearchRankingProcess.java b/source/de/anomic/plasma/plasmaSearchRankingProcess.java index 6c212ce4a..fcd9b5e5d 100644 --- a/source/de/anomic/plasma/plasmaSearchRankingProcess.java +++ b/source/de/anomic/plasma/plasmaSearchRankingProcess.java @@ -57,9 +57,8 @@ public final class plasmaSearchRankingProcess { private HashMap handover; // key = urlhash, value = urlstring; used for double-check of urls that had been handed over to search process private plasmaSearchQuery query; private int sortorder; - private int filteredCount; private int maxentries; - private int globalcount; + private int remote_peerCount, remote_indexCount, remote_resourceSize, local_resourceSize; private indexRWIEntryOrder order; private HashMap urlhashes; // map for double-check; String/Long relation, addresses ranking number (backreference for deletion) private kelondroMScoreCluster ref; // reference score computation for the commonSense heuristic @@ -76,11 +75,13 @@ public final class plasmaSearchRankingProcess { this.sortedRWIEntries = new TreeMap(); this.doubleDomCache = new HashMap>(); this.handover = new HashMap(); - this.filteredCount = 0; this.order = null; this.query = query; this.maxentries = maxentries; - this.globalcount = 0; + this.remote_peerCount = 0; + this.remote_indexCount = 0; + this.remote_resourceSize = 0; + this.local_resourceSize = 0; this.urlhashes = new HashMap(); this.ref = new kelondroMScoreCluster(); this.misses = new TreeSet(); @@ -90,7 +91,7 @@ public final class plasmaSearchRankingProcess { for (int i = 0; i < 32; i++) {this.flagcount[i] = 0;} } - public void execQuery(boolean fetchURLs) { + public void execQuery() { long timer = System.currentTimeMillis(); this.localSearchContainerMaps = wordIndex.localSearchContainers(query, null); @@ -113,16 +114,24 @@ public final class plasmaSearchRankingProcess { } if (sortorder == 2) { - insertRanked(index, true); + insertRanked(index, true, index.size()); } else { - insertNoOrder(index, fetchURLs); + insertNoOrder(index, true, index.size()); } } - private void insertNoOrder(indexContainer index, boolean local) { + private void insertNoOrder(indexContainer index, boolean local, int fullResource) { final Iterator en = index.entries(); // generate a new map where the urls are sorted (not by hash but by the url text) + if (local) { + this.local_resourceSize += fullResource; + } else { + this.remote_resourceSize += fullResource; + this.remote_peerCount++; + this.remote_indexCount += index.size(); + } + indexRWIRowEntry ientry; indexURLEntry uentry; String u; @@ -141,20 +150,14 @@ public final class plasmaSearchRankingProcess { if (sortorder == 0) { this.sortedRWIEntries.put(ientry.urlHash(), ientry); this.urlhashes.put(ientry.urlHash(), ientry.urlHash()); - filteredCount++; } else { - if (local) { - uentry = wordIndex.loadedURL.load(ientry.urlHash(), ientry, 0); - if (uentry == null) { - this.misses.add(ientry.urlHash()); - } else { - u = uentry.comp().url().toNormalform(false, true); - this.sortedRWIEntries.put(u, ientry); - this.urlhashes.put(ientry.urlHash(), u); - filteredCount++; - } + uentry = wordIndex.loadedURL.load(ientry.urlHash(), ientry, 0); + if (uentry == null) { + this.misses.add(ientry.urlHash()); } else { - filteredCount++; + u = uentry.comp().url().toNormalform(false, true); + this.sortedRWIEntries.put(u, ientry); + this.urlhashes.put(ientry.urlHash(), u); } } @@ -163,12 +166,18 @@ public final class plasmaSearchRankingProcess { } // end loop } - public void insertRanked(indexContainer index, boolean local) { + public void insertRanked(indexContainer index, boolean local, int fullResource) { // we collect the urlhashes and construct a list with urlEntry objects // attention: if minEntries is too high, this method will not terminate within the maxTime assert (index != null); if (index.size() == 0) return; + if (local) { + this.local_resourceSize += fullResource; + } else { + this.remote_resourceSize += fullResource; + this.remote_peerCount++; + } long timer = System.currentTimeMillis(); if (this.order == null) { @@ -224,11 +233,8 @@ public final class plasmaSearchRankingProcess { } // increase counter for statistics - if (!local) this.globalcount++; + if (!local) this.remote_indexCount++; } - this.filteredCount = sortedRWIEntries.size(); - //long sc = Math.max(1, System.currentTimeMillis() - s0); - //System.out.println("###DEBUG### time to sort " + container.size() + " entries to " + this.filteredCount + ": " + sc + " milliseconds, " + (container.size() / sc) + " entries/millisecond, ranking = " + tc); //if ((query.neededResults() > 0) && (container.size() > query.neededResults())) remove(true, true); serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(query.id(true), plasmaSearchEvent.PRESORT, index.size(), System.currentTimeMillis() - timer)); @@ -350,14 +356,34 @@ public final class plasmaSearchRankingProcess { return flagcount; } + // "results from a total number of known ( local, remote), links from other YaCy peers." + public int filteredCount() { - return this.filteredCount; + // the number of index entries that are considered as result set + return this.sortedRWIEntries.size(); } - public int getGlobalCount() { - return this.globalcount; + public int getRemoteIndexCount() { + // the number of result contributions from all the remote peers + return this.remote_indexCount; } + public int getRemotePeerCount() { + // the number of remote peers that have contributed + return this.remote_peerCount; + } + + public int getRemoteResourceSize() { + // the number of all hits in all the remote peers + return this.remote_resourceSize; + } + + public int getLocalResourceSize() { + // the number of hits in the local peer (index size, size of the collection in the own index) + return this.local_resourceSize; + } + + public indexRWIEntry remove(String urlHash) { Object r = (Long) urlhashes.get(urlHash); if (r == null) return null; diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java index 47b314544..9c1587dfd 100644 --- a/source/de/anomic/plasma/plasmaSnippetCache.java +++ b/source/de/anomic/plasma/plasmaSnippetCache.java @@ -874,7 +874,7 @@ public class plasmaSnippetCache { plasmaSearchEvent event = plasmaSearchEvent.getEvent(eventID); assert plasmaSwitchboard.getSwitchboard() != null; assert plasmaSwitchboard.getSwitchboard().wordIndex != null; - assert event != null; + assert event != null : "eventID = " + eventID; assert event.getQuery() != null; plasmaSwitchboard.getSwitchboard().wordIndex.removeEntryMultiple(event.getQuery().queryHashes, urlHash); event.remove(urlHash); diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index debcb8a79..b9b0e29d0 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -2205,7 +2205,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser /* ========================================================================= * CREATE INDEX * ========================================================================= */ - String docDescription = document.dc_title(); + String dc_title = document.dc_title(); yacyURL referrerURL = entry.referrerURL(); String noIndexReason = plasmaCrawlEURL.DENIED_UNSPECIFIED_INDEXING_ERROR; @@ -2236,9 +2236,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser long ldate = System.currentTimeMillis(); indexURLEntry newEntry = new indexURLEntry( entry.url(), // URL - docDescription, // document description - document.dc_creator(), // author - document.dc_subject(' '), // tags + dc_title, // document description + document.dc_creator(), // author + document.dc_subject(' '), // tags "", // ETag docDate, // modification date new Date(), // loaded date @@ -2406,7 +2406,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // of string concatenation log.logInfo("*Indexed " + words + " words in URL " + entry.url() + " [" + entry.urlHash() + "]" + - "\n\tDescription: " + docDescription + + "\n\tDescription: " + dc_title + "\n\tMimeType: " + document.dc_format() + " | Charset: " + document.getCharset() + " | " + "Size: " + document.getTextLength() + " bytes | " + "Anchors: " + ((document.getAnchors() == null) ? 0 : document.getAnchors().size()) + @@ -2430,7 +2430,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } } else { log.logFine("Not Indexed Resource '" + entry.url().toNormalform(false, true) + "': process case=" + processCase); - addURLtoErrorDB(entry.url(), referrerURL.hash(), initiatorPeerHash, docDescription, plasmaCrawlEURL.DENIED_UNKNOWN_INDEXING_PROCESS_CASE, new kelondroBitfield()); + addURLtoErrorDB(entry.url(), referrerURL.hash(), initiatorPeerHash, dc_title, plasmaCrawlEURL.DENIED_UNKNOWN_INDEXING_PROCESS_CASE, new kelondroBitfield()); } } catch (Exception ee) { if (ee instanceof InterruptedException) throw (InterruptedException)ee; @@ -2443,7 +2443,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser if (clusterhashes != null) initiatorPeer.setAlternativeAddress((String) clusterhashes.get(initiatorPeer.hash)); yacyClient.crawlReceipt(initiatorPeer, "crawl", "exception", ee.getMessage(), null, ""); } - addURLtoErrorDB(entry.url(), (referrerURL == null) ? null : referrerURL.hash(), initiatorPeerHash, docDescription, plasmaCrawlEURL.DENIED_UNSPECIFIED_INDEXING_ERROR, new kelondroBitfield()); + addURLtoErrorDB(entry.url(), (referrerURL == null) ? null : referrerURL.hash(), initiatorPeerHash, dc_title, plasmaCrawlEURL.DENIED_UNSPECIFIED_INDEXING_ERROR, new kelondroBitfield()); } } else { @@ -2451,7 +2451,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser checkInterruption(); log.logInfo("Not indexed any word in URL " + entry.url() + "; cause: " + noIndexReason); - addURLtoErrorDB(entry.url(), (referrerURL == null) ? null : referrerURL.hash(), initiatorPeerHash, docDescription, noIndexReason, new kelondroBitfield()); + addURLtoErrorDB(entry.url(), (referrerURL == null) ? null : referrerURL.hash(), initiatorPeerHash, dc_title, noIndexReason, new kelondroBitfield()); if ((processCase == PROCESSCASE_6_GLOBAL_CRAWLING) && (initiatorPeer != null)) { if (clusterhashes != null) initiatorPeer.setAlternativeAddress((String) clusterhashes.get(initiatorPeer.hash)); yacyClient.crawlReceipt(initiatorPeer, "crawl", "rejected", noIndexReason, null, ""); diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java index d88a3e85d..faf1f278c 100644 --- a/source/de/anomic/yacy/yacyClient.java +++ b/source/de/anomic/yacy/yacyClient.java @@ -477,9 +477,10 @@ public final class yacyClient { // now create a plasmaIndex out of this result // System.out.println("yacyClient: " + ((urlhashes.length() == 0) ? "primary" : "secondary")+ " search result = " + result.toString()); // debug - int results = 0; + int results = 0, joincount = 0; try { - results = Integer.parseInt((String) result.get("count")); + results = Integer.parseInt(result.get("count")); + joincount = Integer.parseInt(result.get("joincount")); } catch (NumberFormatException e) { yacyCore.log.logFine("SEARCH failed FROM " + target.hash + ":" + target.getName() + ", wrong output format"); yacyCore.peerActions.peerDeparture(target, "search request to peer created number format exception"); @@ -557,7 +558,7 @@ public final class yacyClient { // store remote result to local result container synchronized (containerCache) { // insert one container into the search result buffer - containerCache.insertRanked(container[0], false); // one is enough + containerCache.insertRanked(container[0], false, joincount); // one is enough // integrate remote topwords String references = (String) result.get("references");