From bedd8dfbe212ad11a7126e0f44ca5aec1075ae5e Mon Sep 17 00:00:00 2001 From: orbiter Date: Thu, 21 Feb 2008 14:53:51 +0000 Subject: [PATCH] - added image sorting by image size. This is the default now. This is performed using a 3-stage sorting process: - sort by relevance, then do snippet-fetch - sort snippets by relevance then do image link extraction - sort image links by image size; unknown sizes are handled like small sizes - only the exact amount of images as requested are shown git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4499 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/yacy/user/ysearchitem.html | 2 +- htroot/yacy/user/ysearchitem.java | 70 +++++++++---------- htroot/yacysearchitem.html | 2 +- htroot/yacysearchitem.java | 64 ++++++++--------- .../htmlFilter/htmlFilterImageEntry.java | 6 +- .../de/anomic/plasma/plasmaSearchEvent.java | 48 ++++++++++++- .../de/anomic/plasma/plasmaSnippetCache.java | 19 +++-- 7 files changed, 126 insertions(+), 85 deletions(-) diff --git a/htroot/yacy/user/ysearchitem.html b/htroot/yacy/user/ysearchitem.html index a210af865..b15f93d31 100644 --- a/htroot/yacy/user/ysearchitem.html +++ b/htroot/yacy/user/ysearchitem.html @@ -13,7 +13,7 @@ #[name]#
#[name]#
-
#[name]#
+
#[name]##[attr]#
#{/items}# :: diff --git a/htroot/yacy/user/ysearchitem.java b/htroot/yacy/user/ysearchitem.java index 9e9921971..79e161832 100644 --- a/htroot/yacy/user/ysearchitem.java +++ b/htroot/yacy/user/ysearchitem.java @@ -164,35 +164,32 @@ public class ysearchitem { return prop; } - - // generate result object - plasmaSearchEvent.ResultEntry result = theSearch.oneResult(item); - - if (result == null) { - // no content - return prop; - } - - if (rss) { - // text search for rss output - prop.put("rss", "1"); // switch on specific content - prop.putHTML("rss_title", result.title(), true); - prop.putHTML("rss_description", result.textSnippet().getLineRaw(), true); - prop.putHTML("rss_link", result.urlstring(), true); - prop.put("rss_urlhash", result.hash()); - prop.put("rss_date", plasmaSwitchboard.dateString822(result.modified())); - return prop; - } - + prop.put("rss", "0"); if (theQuery.contentdom == plasmaSearchQuery.CONTENTDOM_TEXT) { // text search + + // generate result object + plasmaSearchEvent.ResultEntry result = theSearch.oneResult(item); + if (result == null) return prop; // no content + + if (rss) { + // text search for rss output + prop.put("rss", "1"); // switch on specific content + prop.putHTML("rss_title", result.title(), true); + prop.putHTML("rss_description", result.textSnippet().getLineRaw(), true); + prop.putHTML("rss_link", result.urlstring(), true); + prop.put("rss_urlhash", result.hash()); + prop.put("rss_date", plasmaSwitchboard.dateString822(result.modified())); + return prop; + } + prop.put("content", theQuery.contentdom + 1); // switch on specific content prop.put("content_authorized", authenticated ? "1" : "0"); prop.put("content_authorized_recommend", (yacyCore.newsPool.getSpecific(yacyNewsPool.OUTGOING_DB, yacyNewsPool.CATEGORY_SURFTIPP_ADD, "url", result.urlstring()) == null) ? "1" : "0"); - prop.put("content_authorized_recommend_deletelink", "/ysearch.html?search=" + theQuery.queryString + "&Enter=Search&count=" + theQuery.displayResults() + "&offset=" + (theQuery.neededResults() - theQuery.displayResults()) + "&order=" + crypt.simpleEncode(theQuery.ranking.toExternalString()) + "&resource=local&time=3&deleteref=" + result.hash() + "&urlmaskfilter=.*"); - prop.put("content_authorized_recommend_recommendlink", "/ysearch.html?search=" + theQuery.queryString + "&Enter=Search&count=" + theQuery.displayResults() + "&offset=" + (theQuery.neededResults() - theQuery.displayResults()) + "&order=" + crypt.simpleEncode(theQuery.ranking.toExternalString()) + "&resource=local&time=3&recommendref=" + result.hash() + "&urlmaskfilter=.*"); + prop.put("content_authorized_recommend_deletelink", "/yacysearch.html?search=" + theQuery.queryString + "&Enter=Search&count=" + theQuery.displayResults() + "&offset=" + (theQuery.neededResults() - theQuery.displayResults()) + "&order=" + crypt.simpleEncode(theQuery.ranking.toExternalString()) + "&resource=local&time=3&deleteref=" + result.hash() + "&urlmaskfilter=.*"); + prop.put("content_authorized_recommend_recommendlink", "/yacysearch.html?search=" + theQuery.queryString + "&Enter=Search&count=" + theQuery.displayResults() + "&offset=" + (theQuery.neededResults() - theQuery.displayResults()) + "&order=" + crypt.simpleEncode(theQuery.ranking.toExternalString()) + "&resource=local&time=3&recommendref=" + result.hash() + "&urlmaskfilter=.*"); prop.put("content_authorized_urlhash", result.hash()); prop.putHTML("content_description", result.title()); prop.put("content_url", result.urlstring()); @@ -229,23 +226,17 @@ public class ysearchitem { if (theQuery.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) { // image search; shows thumbnails - // iterate over all images in the result + prop.put("content", theQuery.contentdom + 1); // switch on specific content - ArrayList images = result.mediaSnippets(); - if (images != null) { - plasmaSnippetCache.MediaSnippet ms; - int c = 0; - for (int i = 0; i < images.size(); i++) { - ms = (plasmaSnippetCache.MediaSnippet) images.get(i); - prop.putHTML("content_items_" + i + "_href", ms.href.toNormalform(true, false)); - prop.put("content_items_" + i + "_code", sb.licensedURLs.aquireLicense(ms.href)); - prop.putHTML("content_items_" + i + "_name", shorten(ms.name, namelength)); - prop.put("content_items_" + i + "_attr", ms.attr); // attributes, here: original size of image - c++; - } - prop.put("content_items", c); - } else { + plasmaSnippetCache.MediaSnippet ms = theSearch.oneImage(item); + if (ms == null) { prop.put("content_items", "0"); + } else { + prop.putHTML("content_items_0_href", ms.href.toNormalform(true, false)); + prop.put("content_items_0_code", sb.licensedURLs.aquireLicense(ms.href)); + prop.putHTML("content_items_0_name", shorten(ms.name, namelength)); + prop.put("content_items_0_attr", (ms.attr.equals("-1 x -1")) ? "" : " (" + ms.attr + ")"); // attributes, here: original size of image + prop.put("content_items", 1); } return prop; } @@ -254,6 +245,11 @@ public class ysearchitem { (theQuery.contentdom == plasmaSearchQuery.CONTENTDOM_VIDEO) || (theQuery.contentdom == plasmaSearchQuery.CONTENTDOM_APP)) { // any other media content + + // generate result object + plasmaSearchEvent.ResultEntry result = theSearch.oneResult(item); + if (result == null) return prop; // no content + prop.put("content", theQuery.contentdom + 1); // switch on specific content ArrayList media = result.mediaSnippets(); if (item == 0) col = true; diff --git a/htroot/yacysearchitem.html b/htroot/yacysearchitem.html index da06f31e5..bf1c1de38 100644 --- a/htroot/yacysearchitem.html +++ b/htroot/yacysearchitem.html @@ -26,7 +26,7 @@ #[name]# - + #{/items}# :: diff --git a/htroot/yacysearchitem.java b/htroot/yacysearchitem.java index 9238c768f..96880891a 100644 --- a/htroot/yacysearchitem.java +++ b/htroot/yacysearchitem.java @@ -166,29 +166,26 @@ public class yacysearchitem { return prop; } - // generate result object - plasmaSearchEvent.ResultEntry result = theSearch.oneResult(item); - - if (result == null) { - // no content - return prop; - } - - if (rss) { - // text search for rss output - prop.put("rss", "1"); // switch on specific content - prop.putHTML("rss_title", result.title(), true); - prop.putHTML("rss_description", result.textSnippet().getLineRaw(), true); - prop.putHTML("rss_link", result.urlstring(), true); - prop.put("rss_urlhash", result.hash()); - prop.put("rss_date", plasmaSwitchboard.dateString822(result.modified())); - return prop; - } - prop.put("rss", "0"); if (theQuery.contentdom == plasmaSearchQuery.CONTENTDOM_TEXT) { // text search + + // generate result object + plasmaSearchEvent.ResultEntry result = theSearch.oneResult(item); + if (result == null) return prop; // no content + + if (rss) { + // text search for rss output + prop.put("rss", "1"); // switch on specific content + prop.putHTML("rss_title", result.title(), true); + prop.putHTML("rss_description", result.textSnippet().getLineRaw(), true); + prop.putHTML("rss_link", result.urlstring(), true); + prop.put("rss_urlhash", result.hash()); + prop.put("rss_date", plasmaSwitchboard.dateString822(result.modified())); + return prop; + } + prop.put("content", theQuery.contentdom + 1); // switch on specific content prop.put("content_authorized", authenticated ? "1" : "0"); prop.put("content_authorized_recommend", (yacyCore.newsPool.getSpecific(yacyNewsPool.OUTGOING_DB, yacyNewsPool.CATEGORY_SURFTIPP_ADD, "url", result.urlstring()) == null) ? "1" : "0"); @@ -230,23 +227,17 @@ public class yacysearchitem { if (theQuery.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) { // image search; shows thumbnails - // iterate over all images in the result + prop.put("content", theQuery.contentdom + 1); // switch on specific content - ArrayList images = result.mediaSnippets(); - if (images != null) { - plasmaSnippetCache.MediaSnippet ms; - int c = 0; - for (int i = 0; i < images.size(); i++) { - ms = (plasmaSnippetCache.MediaSnippet) images.get(i); - prop.putHTML("content_items_" + i + "_href", ms.href.toNormalform(true, false)); - prop.put("content_items_" + i + "_code", sb.licensedURLs.aquireLicense(ms.href)); - prop.putHTML("content_items_" + i + "_name", shorten(ms.name, namelength)); - prop.put("content_items_" + i + "_attr", ms.attr); // attributes, here: original size of image - c++; - } - prop.put("content_items", c); - } else { + plasmaSnippetCache.MediaSnippet ms = theSearch.oneImage(item); + if (ms == null) { prop.put("content_items", "0"); + } else { + prop.putHTML("content_items_0_href", ms.href.toNormalform(true, false)); + prop.put("content_items_0_code", sb.licensedURLs.aquireLicense(ms.href)); + prop.putHTML("content_items_0_name", shorten(ms.name, namelength)); + prop.put("content_items_0_attr", (ms.attr.equals("-1 x -1")) ? "" : " (" + ms.attr + ")"); // attributes, here: original size of image + prop.put("content_items", 1); } return prop; } @@ -255,6 +246,11 @@ public class yacysearchitem { (theQuery.contentdom == plasmaSearchQuery.CONTENTDOM_VIDEO) || (theQuery.contentdom == plasmaSearchQuery.CONTENTDOM_APP)) { // any other media content + + // generate result object + plasmaSearchEvent.ResultEntry result = theSearch.oneResult(item); + if (result == null) return prop; // no content + prop.put("content", theQuery.contentdom + 1); // switch on specific content ArrayList media = result.mediaSnippets(); if (item == 0) col = true; diff --git a/source/de/anomic/htmlFilter/htmlFilterImageEntry.java b/source/de/anomic/htmlFilter/htmlFilterImageEntry.java index b65aa7dd2..bb0631902 100644 --- a/source/de/anomic/htmlFilter/htmlFilterImageEntry.java +++ b/source/de/anomic/htmlFilter/htmlFilterImageEntry.java @@ -80,10 +80,10 @@ public class htmlFilterImageEntry implements Comparable { // this hash method therefore tries to compute a 'perfect hash' based on the size of the images // unfortunately it can not be ensured that all images get different hashes, but this should appear // only in very rare cases - if ((width > 0) && (height > 0)) - return ((0xFFFF - (((width * height) >> 8) & 0xFFFF)) << 16) | (url.hashCode() & 0xFFFF); + if ((width >= 0) && (height >= 0)) + return ((0x7FFF - (((width * height) >> 9) & 0x7FFF)) << 16) | (url.hashCode() & 0xFFFF); else - return 0xFFFF0000 | (url.hashCode() & 0xFFFF); + return 0x7FFF0000 | (url.hashCode() & 0xFFFF); } public int compareTo(htmlFilterImageEntry h) { diff --git a/source/de/anomic/plasma/plasmaSearchEvent.java b/source/de/anomic/plasma/plasmaSearchEvent.java index f5db66c65..15829c4ed 100644 --- a/source/de/anomic/plasma/plasmaSearchEvent.java +++ b/source/de/anomic/plasma/plasmaSearchEvent.java @@ -81,6 +81,7 @@ public final class plasmaSearchEvent { public String IAmaxcounthash, IAneardhthash; private resultWorker[] workerThreads; private kelondroSortStore result; + private kelondroSortStore images; // container to sort images by size private HashMap failedURLs; // a mapping from a urlhash to a fail reason string TreeSet snippetFetchWordHashes; // a set of word hashes that are used to match with the snippets private long urlRetrievalAllTime; @@ -107,6 +108,7 @@ public final class plasmaSearchEvent { this.workerThreads = null; this.localSearchThread = null; this.result = new kelondroSortStore(-1); // this is the result, enriched with snippets, ranked and ordered by ranking + this.images = new kelondroSortStore(-1); this.failedURLs = new HashMap(); // a map of urls to reason strings where a worker thread tried to work on, but failed. // snippets do not need to match with the complete query hashes, @@ -465,7 +467,8 @@ public final class plasmaSearchEvent { // if worker threads had been alive, but did not succeed, start them again to fetch missing links if ((query.onlineSnippetFetch) && (!event.anyWorkerAlive()) && - (event.result.size() < query.neededResults() + 10) && + (((query.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) && (event.images.size() + 30 < query.neededResults())) || + (event.result.size() < query.neededResults() + 10)) && (event.getRankingResult().getLocalResourceSize() + event.getRankingResult().getRemoteResourceSize() > event.result.size())) { // set new timeout event.eventTime = System.currentTimeMillis(); @@ -507,7 +510,9 @@ public final class plasmaSearchEvent { while (System.currentTimeMillis() < this.timeout) { this.lastLifeSign = System.currentTimeMillis(); - if (result.size() >= query.neededResults() /*+ query.displayResults()*/) break; // we have enough + // check if we have enough + if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) && (images.size() >= query.neededResults() + 30)) break; + if ((query.contentdom != plasmaSearchQuery.CONTENTDOM_IMAGE) && (result.size() >= query.neededResults() + 10 /*+ query.displayResults()*/)) break; // get next entry page = rankedCache.bestURL(true); @@ -558,7 +563,7 @@ public final class plasmaSearchEvent { } public ResultEntry oneResult(int item) { - // first sleep a while to give accumulation threads a chance to work + // check if we already retrieved this item (happens if a search pages is accessed a second time) if (this.result.sizeStore() > item) { // we have the wanted result already in the result array .. return that return this.result.element(item).element; @@ -589,6 +594,43 @@ public final class plasmaSearchEvent { return this.result.element(item).element; } + private int resultCounter = 0; + public ResultEntry nextResult() { + ResultEntry re = oneResult(resultCounter); + resultCounter++; + return re; + } + + public plasmaSnippetCache.MediaSnippet oneImage(int item) { + // check if we already retrieved this item (happens if a search pages is accessed a second time) + if (this.images.sizeStore() > item) { + // we have the wanted result already in the result array .. return that + return this.images.element(item).element; + } + + // feed some results from the result stack into the image stack + int count = Math.min(5, Math.max(1, 10 * this.result.size() / (item + 1))); + for (int i = 0; i < count; i++) { + // generate result object + plasmaSearchEvent.ResultEntry result = nextResult(); + plasmaSnippetCache.MediaSnippet ms; + if (result != null) { + // iterate over all images in the result + ArrayList imagemedia = result.mediaSnippets(); + if (imagemedia != null) { + for (int j = 0; j < imagemedia.size(); j++) { + ms = imagemedia.get(j); + images.push(ms, ms.ranking); + } + } + } + } + + // now take the specific item from the image stack + if (this.images.size() <= item) return null; + return this.images.element(item).element; + } + public ArrayList.stackElement> completeResults(long waitingtime) { long timeout = System.currentTimeMillis() + waitingtime; while ((result.size() < query.neededResults()) && (anyWorkerAlive()) && (System.currentTimeMillis() < timeout)) { diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java index 8e07c28b5..93a149298 100644 --- a/source/de/anomic/plasma/plasmaSnippetCache.java +++ b/source/de/anomic/plasma/plasmaSnippetCache.java @@ -232,11 +232,13 @@ public class plasmaSnippetCache { public int type; public yacyURL href; public String name, attr; - public MediaSnippet(int type, yacyURL href, String name, String attr) { + public int ranking; + public MediaSnippet(int type, yacyURL href, String name, String attr, int ranking) { this.type = type; this.href = href; this.name = name; this.attr = attr; + this.ranking = ranking; // the smaller the better! small values should be shown first if ((this.name == null) || (this.name.length() == 0)) this.name = "_"; if ((this.attr == null) || (this.attr.length() == 0)) this.attr = "_"; } @@ -677,12 +679,12 @@ public class plasmaSnippetCache { desc = entry.getValue(); s = removeAppearanceHashes(url.toNormalform(false, false), queryhashes); if (s.size() == 0) { - result.add(new MediaSnippet(mediatype, url, desc, null)); + result.add(new MediaSnippet(mediatype, url, desc, null, 0)); continue; } s = removeAppearanceHashes(desc, s); if (s.size() == 0) { - result.add(new MediaSnippet(mediatype, url, desc, null)); + result.add(new MediaSnippet(mediatype, url, desc, null, 0)); continue; } } @@ -691,7 +693,8 @@ public class plasmaSnippetCache { public static ArrayList computeImageSnippets(plasmaParserDocument document, Set queryhashes) { - TreeSet images = document.getImages(); + TreeSet images = document.getImages(); // iterates images in descending size order! + // a measurement for the size of the images can be retrieved using the htmlFilterImageEntry.hashCode() Iterator i = images.iterator(); htmlFilterImageEntry ientry; @@ -705,12 +708,16 @@ public class plasmaSnippetCache { desc = ientry.alt(); s = removeAppearanceHashes(url.toNormalform(false, false), queryhashes); if (s.size() == 0) { - result.add(new MediaSnippet(plasmaSearchQuery.CONTENTDOM_IMAGE, url, desc, ientry.width() + " x " + ientry.height())); + int ranking = ientry.hashCode(); + System.out.println(ranking); + result.add(new MediaSnippet(plasmaSearchQuery.CONTENTDOM_IMAGE, url, desc, ientry.width() + " x " + ientry.height(), ranking)); continue; } s = removeAppearanceHashes(desc, s); if (s.size() == 0) { - result.add(new MediaSnippet(plasmaSearchQuery.CONTENTDOM_IMAGE, url, desc, ientry.width() + " x " + ientry.height())); + int ranking = ientry.hashCode(); + System.out.println(ranking); + result.add(new MediaSnippet(plasmaSearchQuery.CONTENTDOM_IMAGE, url, desc, ientry.width() + " x " + ientry.height(), ranking)); continue; } }