- added image sorting by image size. This is the default now.

This is performed using a 3-stage sorting process:
  - sort by relevance, then do snippet-fetch
  - sort snippets by relevance then do image link extraction
  - sort image links by image size; unknown sizes are handled like small sizes
- only the exact number of images requested is shown

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4499 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 17 years ago
parent 727feb4358
commit bedd8dfbe2

@ -13,7 +13,7 @@
<img src="/ViewImage.png?maxwidth=96&maxheight=96&code=#[code]#" alt="#[name]#"> <img src="/ViewImage.png?maxwidth=96&maxheight=96&code=#[code]#" alt="#[name]#">
</a> </a>
<div class="highslide-caption"><a href="#[href]#">#[name]#</a></div> <div class="highslide-caption"><a href="#[href]#">#[name]#</a></div>
<div class="snippet"><a href="#[href]#">#[name]#</a></div> <div class="snippet"><a href="#[href]#">#[name]##[attr]#</a></div>
</div> </div>
#{/items}# #{/items}#
:: ::

@ -164,35 +164,32 @@ public class ysearchitem {
return prop; return prop;
} }
// generate result object
plasmaSearchEvent.ResultEntry result = theSearch.oneResult(item);
if (result == null) {
// no content
return prop;
}
if (rss) {
// text search for rss output
prop.put("rss", "1"); // switch on specific content
prop.putHTML("rss_title", result.title(), true);
prop.putHTML("rss_description", result.textSnippet().getLineRaw(), true);
prop.putHTML("rss_link", result.urlstring(), true);
prop.put("rss_urlhash", result.hash());
prop.put("rss_date", plasmaSwitchboard.dateString822(result.modified()));
return prop;
}
prop.put("rss", "0"); prop.put("rss", "0");
if (theQuery.contentdom == plasmaSearchQuery.CONTENTDOM_TEXT) { if (theQuery.contentdom == plasmaSearchQuery.CONTENTDOM_TEXT) {
// text search // text search
// generate result object
plasmaSearchEvent.ResultEntry result = theSearch.oneResult(item);
if (result == null) return prop; // no content
if (rss) {
// text search for rss output
prop.put("rss", "1"); // switch on specific content
prop.putHTML("rss_title", result.title(), true);
prop.putHTML("rss_description", result.textSnippet().getLineRaw(), true);
prop.putHTML("rss_link", result.urlstring(), true);
prop.put("rss_urlhash", result.hash());
prop.put("rss_date", plasmaSwitchboard.dateString822(result.modified()));
return prop;
}
prop.put("content", theQuery.contentdom + 1); // switch on specific content prop.put("content", theQuery.contentdom + 1); // switch on specific content
prop.put("content_authorized", authenticated ? "1" : "0"); prop.put("content_authorized", authenticated ? "1" : "0");
prop.put("content_authorized_recommend", (yacyCore.newsPool.getSpecific(yacyNewsPool.OUTGOING_DB, yacyNewsPool.CATEGORY_SURFTIPP_ADD, "url", result.urlstring()) == null) ? "1" : "0"); prop.put("content_authorized_recommend", (yacyCore.newsPool.getSpecific(yacyNewsPool.OUTGOING_DB, yacyNewsPool.CATEGORY_SURFTIPP_ADD, "url", result.urlstring()) == null) ? "1" : "0");
prop.put("content_authorized_recommend_deletelink", "/ysearch.html?search=" + theQuery.queryString + "&Enter=Search&count=" + theQuery.displayResults() + "&offset=" + (theQuery.neededResults() - theQuery.displayResults()) + "&order=" + crypt.simpleEncode(theQuery.ranking.toExternalString()) + "&resource=local&time=3&deleteref=" + result.hash() + "&urlmaskfilter=.*"); prop.put("content_authorized_recommend_deletelink", "/yacysearch.html?search=" + theQuery.queryString + "&Enter=Search&count=" + theQuery.displayResults() + "&offset=" + (theQuery.neededResults() - theQuery.displayResults()) + "&order=" + crypt.simpleEncode(theQuery.ranking.toExternalString()) + "&resource=local&time=3&deleteref=" + result.hash() + "&urlmaskfilter=.*");
prop.put("content_authorized_recommend_recommendlink", "/ysearch.html?search=" + theQuery.queryString + "&Enter=Search&count=" + theQuery.displayResults() + "&offset=" + (theQuery.neededResults() - theQuery.displayResults()) + "&order=" + crypt.simpleEncode(theQuery.ranking.toExternalString()) + "&resource=local&time=3&recommendref=" + result.hash() + "&urlmaskfilter=.*"); prop.put("content_authorized_recommend_recommendlink", "/yacysearch.html?search=" + theQuery.queryString + "&Enter=Search&count=" + theQuery.displayResults() + "&offset=" + (theQuery.neededResults() - theQuery.displayResults()) + "&order=" + crypt.simpleEncode(theQuery.ranking.toExternalString()) + "&resource=local&time=3&recommendref=" + result.hash() + "&urlmaskfilter=.*");
prop.put("content_authorized_urlhash", result.hash()); prop.put("content_authorized_urlhash", result.hash());
prop.putHTML("content_description", result.title()); prop.putHTML("content_description", result.title());
prop.put("content_url", result.urlstring()); prop.put("content_url", result.urlstring());
@ -229,23 +226,17 @@ public class ysearchitem {
if (theQuery.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) { if (theQuery.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) {
// image search; shows thumbnails // image search; shows thumbnails
// iterate over all images in the result
prop.put("content", theQuery.contentdom + 1); // switch on specific content prop.put("content", theQuery.contentdom + 1); // switch on specific content
ArrayList<plasmaSnippetCache.MediaSnippet> images = result.mediaSnippets(); plasmaSnippetCache.MediaSnippet ms = theSearch.oneImage(item);
if (images != null) { if (ms == null) {
plasmaSnippetCache.MediaSnippet ms;
int c = 0;
for (int i = 0; i < images.size(); i++) {
ms = (plasmaSnippetCache.MediaSnippet) images.get(i);
prop.putHTML("content_items_" + i + "_href", ms.href.toNormalform(true, false));
prop.put("content_items_" + i + "_code", sb.licensedURLs.aquireLicense(ms.href));
prop.putHTML("content_items_" + i + "_name", shorten(ms.name, namelength));
prop.put("content_items_" + i + "_attr", ms.attr); // attributes, here: original size of image
c++;
}
prop.put("content_items", c);
} else {
prop.put("content_items", "0"); prop.put("content_items", "0");
} else {
prop.putHTML("content_items_0_href", ms.href.toNormalform(true, false));
prop.put("content_items_0_code", sb.licensedURLs.aquireLicense(ms.href));
prop.putHTML("content_items_0_name", shorten(ms.name, namelength));
prop.put("content_items_0_attr", (ms.attr.equals("-1 x -1")) ? "" : " (" + ms.attr + ")"); // attributes, here: original size of image
prop.put("content_items", 1);
} }
return prop; return prop;
} }
@ -254,6 +245,11 @@ public class ysearchitem {
(theQuery.contentdom == plasmaSearchQuery.CONTENTDOM_VIDEO) || (theQuery.contentdom == plasmaSearchQuery.CONTENTDOM_VIDEO) ||
(theQuery.contentdom == plasmaSearchQuery.CONTENTDOM_APP)) { (theQuery.contentdom == plasmaSearchQuery.CONTENTDOM_APP)) {
// any other media content // any other media content
// generate result object
plasmaSearchEvent.ResultEntry result = theSearch.oneResult(item);
if (result == null) return prop; // no content
prop.put("content", theQuery.contentdom + 1); // switch on specific content prop.put("content", theQuery.contentdom + 1); // switch on specific content
ArrayList<plasmaSnippetCache.MediaSnippet> media = result.mediaSnippets(); ArrayList<plasmaSnippetCache.MediaSnippet> media = result.mediaSnippets();
if (item == 0) col = true; if (item == 0) col = true;

@ -26,7 +26,7 @@
<img src="/ViewImage.png?maxwidth=96&maxheight=96&code=#[code]#" alt="#[name]#"> <img src="/ViewImage.png?maxwidth=96&maxheight=96&code=#[code]#" alt="#[name]#">
</a> </a>
<div class="highslide-caption"><a href="#[href]#">#[name]#</a></div> <div class="highslide-caption"><a href="#[href]#">#[name]#</a></div>
<div class="TableCellDark"><a href="#[href]#">#[name]#</a></div> <div class="TableCellDark"><a href="#[href]#">#[name]##[attr]#</a></div>
</div> </div>
#{/items}# #{/items}#
:: ::

@ -166,29 +166,26 @@ public class yacysearchitem {
return prop; return prop;
} }
// generate result object
plasmaSearchEvent.ResultEntry result = theSearch.oneResult(item);
if (result == null) {
// no content
return prop;
}
if (rss) {
// text search for rss output
prop.put("rss", "1"); // switch on specific content
prop.putHTML("rss_title", result.title(), true);
prop.putHTML("rss_description", result.textSnippet().getLineRaw(), true);
prop.putHTML("rss_link", result.urlstring(), true);
prop.put("rss_urlhash", result.hash());
prop.put("rss_date", plasmaSwitchboard.dateString822(result.modified()));
return prop;
}
prop.put("rss", "0"); prop.put("rss", "0");
if (theQuery.contentdom == plasmaSearchQuery.CONTENTDOM_TEXT) { if (theQuery.contentdom == plasmaSearchQuery.CONTENTDOM_TEXT) {
// text search // text search
// generate result object
plasmaSearchEvent.ResultEntry result = theSearch.oneResult(item);
if (result == null) return prop; // no content
if (rss) {
// text search for rss output
prop.put("rss", "1"); // switch on specific content
prop.putHTML("rss_title", result.title(), true);
prop.putHTML("rss_description", result.textSnippet().getLineRaw(), true);
prop.putHTML("rss_link", result.urlstring(), true);
prop.put("rss_urlhash", result.hash());
prop.put("rss_date", plasmaSwitchboard.dateString822(result.modified()));
return prop;
}
prop.put("content", theQuery.contentdom + 1); // switch on specific content prop.put("content", theQuery.contentdom + 1); // switch on specific content
prop.put("content_authorized", authenticated ? "1" : "0"); prop.put("content_authorized", authenticated ? "1" : "0");
prop.put("content_authorized_recommend", (yacyCore.newsPool.getSpecific(yacyNewsPool.OUTGOING_DB, yacyNewsPool.CATEGORY_SURFTIPP_ADD, "url", result.urlstring()) == null) ? "1" : "0"); prop.put("content_authorized_recommend", (yacyCore.newsPool.getSpecific(yacyNewsPool.OUTGOING_DB, yacyNewsPool.CATEGORY_SURFTIPP_ADD, "url", result.urlstring()) == null) ? "1" : "0");
@ -230,23 +227,17 @@ public class yacysearchitem {
if (theQuery.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) { if (theQuery.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) {
// image search; shows thumbnails // image search; shows thumbnails
// iterate over all images in the result
prop.put("content", theQuery.contentdom + 1); // switch on specific content prop.put("content", theQuery.contentdom + 1); // switch on specific content
ArrayList<plasmaSnippetCache.MediaSnippet> images = result.mediaSnippets(); plasmaSnippetCache.MediaSnippet ms = theSearch.oneImage(item);
if (images != null) { if (ms == null) {
plasmaSnippetCache.MediaSnippet ms;
int c = 0;
for (int i = 0; i < images.size(); i++) {
ms = (plasmaSnippetCache.MediaSnippet) images.get(i);
prop.putHTML("content_items_" + i + "_href", ms.href.toNormalform(true, false));
prop.put("content_items_" + i + "_code", sb.licensedURLs.aquireLicense(ms.href));
prop.putHTML("content_items_" + i + "_name", shorten(ms.name, namelength));
prop.put("content_items_" + i + "_attr", ms.attr); // attributes, here: original size of image
c++;
}
prop.put("content_items", c);
} else {
prop.put("content_items", "0"); prop.put("content_items", "0");
} else {
prop.putHTML("content_items_0_href", ms.href.toNormalform(true, false));
prop.put("content_items_0_code", sb.licensedURLs.aquireLicense(ms.href));
prop.putHTML("content_items_0_name", shorten(ms.name, namelength));
prop.put("content_items_0_attr", (ms.attr.equals("-1 x -1")) ? "" : " (" + ms.attr + ")"); // attributes, here: original size of image
prop.put("content_items", 1);
} }
return prop; return prop;
} }
@ -255,6 +246,11 @@ public class yacysearchitem {
(theQuery.contentdom == plasmaSearchQuery.CONTENTDOM_VIDEO) || (theQuery.contentdom == plasmaSearchQuery.CONTENTDOM_VIDEO) ||
(theQuery.contentdom == plasmaSearchQuery.CONTENTDOM_APP)) { (theQuery.contentdom == plasmaSearchQuery.CONTENTDOM_APP)) {
// any other media content // any other media content
// generate result object
plasmaSearchEvent.ResultEntry result = theSearch.oneResult(item);
if (result == null) return prop; // no content
prop.put("content", theQuery.contentdom + 1); // switch on specific content prop.put("content", theQuery.contentdom + 1); // switch on specific content
ArrayList<plasmaSnippetCache.MediaSnippet> media = result.mediaSnippets(); ArrayList<plasmaSnippetCache.MediaSnippet> media = result.mediaSnippets();
if (item == 0) col = true; if (item == 0) col = true;

@ -80,10 +80,10 @@ public class htmlFilterImageEntry implements Comparable<htmlFilterImageEntry> {
// this hash method therefore tries to compute a 'perfect hash' based on the size of the images // this hash method therefore tries to compute a 'perfect hash' based on the size of the images
// unfortunately it can not be ensured that all images get different hashes, but this should appear // unfortunately it can not be ensured that all images get different hashes, but this should appear
// only in very rare cases // only in very rare cases
if ((width > 0) && (height > 0)) if ((width >= 0) && (height >= 0))
return ((0xFFFF - (((width * height) >> 8) & 0xFFFF)) << 16) | (url.hashCode() & 0xFFFF); return ((0x7FFF - (((width * height) >> 9) & 0x7FFF)) << 16) | (url.hashCode() & 0xFFFF);
else else
return 0xFFFF0000 | (url.hashCode() & 0xFFFF); return 0x7FFF0000 | (url.hashCode() & 0xFFFF);
} }
public int compareTo(htmlFilterImageEntry h) { public int compareTo(htmlFilterImageEntry h) {

@ -81,6 +81,7 @@ public final class plasmaSearchEvent {
public String IAmaxcounthash, IAneardhthash; public String IAmaxcounthash, IAneardhthash;
private resultWorker[] workerThreads; private resultWorker[] workerThreads;
private kelondroSortStore<ResultEntry> result; private kelondroSortStore<ResultEntry> result;
private kelondroSortStore<plasmaSnippetCache.MediaSnippet> images; // container to sort images by size
private HashMap<String, String> failedURLs; // a mapping from a urlhash to a fail reason string private HashMap<String, String> failedURLs; // a mapping from a urlhash to a fail reason string
TreeSet<String> snippetFetchWordHashes; // a set of word hashes that are used to match with the snippets TreeSet<String> snippetFetchWordHashes; // a set of word hashes that are used to match with the snippets
private long urlRetrievalAllTime; private long urlRetrievalAllTime;
@ -107,6 +108,7 @@ public final class plasmaSearchEvent {
this.workerThreads = null; this.workerThreads = null;
this.localSearchThread = null; this.localSearchThread = null;
this.result = new kelondroSortStore<ResultEntry>(-1); // this is the result, enriched with snippets, ranked and ordered by ranking this.result = new kelondroSortStore<ResultEntry>(-1); // this is the result, enriched with snippets, ranked and ordered by ranking
this.images = new kelondroSortStore<plasmaSnippetCache.MediaSnippet>(-1);
this.failedURLs = new HashMap<String, String>(); // a map of urls to reason strings where a worker thread tried to work on, but failed. this.failedURLs = new HashMap<String, String>(); // a map of urls to reason strings where a worker thread tried to work on, but failed.
// snippets do not need to match with the complete query hashes, // snippets do not need to match with the complete query hashes,
@ -465,7 +467,8 @@ public final class plasmaSearchEvent {
// if worker threads had been alive, but did not succeed, start them again to fetch missing links // if worker threads had been alive, but did not succeed, start them again to fetch missing links
if ((query.onlineSnippetFetch) && if ((query.onlineSnippetFetch) &&
(!event.anyWorkerAlive()) && (!event.anyWorkerAlive()) &&
(event.result.size() < query.neededResults() + 10) && (((query.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) && (event.images.size() + 30 < query.neededResults())) ||
(event.result.size() < query.neededResults() + 10)) &&
(event.getRankingResult().getLocalResourceSize() + event.getRankingResult().getRemoteResourceSize() > event.result.size())) { (event.getRankingResult().getLocalResourceSize() + event.getRankingResult().getRemoteResourceSize() > event.result.size())) {
// set new timeout // set new timeout
event.eventTime = System.currentTimeMillis(); event.eventTime = System.currentTimeMillis();
@ -507,7 +510,9 @@ public final class plasmaSearchEvent {
while (System.currentTimeMillis() < this.timeout) { while (System.currentTimeMillis() < this.timeout) {
this.lastLifeSign = System.currentTimeMillis(); this.lastLifeSign = System.currentTimeMillis();
if (result.size() >= query.neededResults() /*+ query.displayResults()*/) break; // we have enough // check if we have enough
if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) && (images.size() >= query.neededResults() + 30)) break;
if ((query.contentdom != plasmaSearchQuery.CONTENTDOM_IMAGE) && (result.size() >= query.neededResults() + 10 /*+ query.displayResults()*/)) break;
// get next entry // get next entry
page = rankedCache.bestURL(true); page = rankedCache.bestURL(true);
@ -558,7 +563,7 @@ public final class plasmaSearchEvent {
} }
public ResultEntry oneResult(int item) { public ResultEntry oneResult(int item) {
// first sleep a while to give accumulation threads a chance to work // check if we already retrieved this item (happens if a search pages is accessed a second time)
if (this.result.sizeStore() > item) { if (this.result.sizeStore() > item) {
// we have the wanted result already in the result array .. return that // we have the wanted result already in the result array .. return that
return this.result.element(item).element; return this.result.element(item).element;
@ -589,6 +594,43 @@ public final class plasmaSearchEvent {
return this.result.element(item).element; return this.result.element(item).element;
} }
// position that the next call of nextResult() hands to oneResult()
private int resultCounter = 0;

/**
 * Fetches the result entry at the current cursor position and moves the cursor forward by one.
 * The cursor is advanced regardless of whether oneResult() delivered an entry or null.
 *
 * @return whatever oneResult() returns for the current cursor position
 */
public ResultEntry nextResult() {
    final ResultEntry entry = oneResult(resultCounter);
    resultCounter += 1;
    return entry;
}
/**
 * Delivers the image snippet at the given position of the image stack.
 * If the position is not yet available in the image store, the media snippets of
 * between one and five further results (taken via nextResult()) are pushed onto
 * the image stack, using each snippet's ranking value as its sort weight.
 *
 * @param item position of the wanted image
 * @return the image snippet at that position, or null if the image stack
 *         does not hold that many entries after feeding
 */
public plasmaSnippetCache.MediaSnippet oneImage(int item) {
    // an item that was retrieved before (e.g. a search page that is accessed a second time)
    // is served straight from the store
    if (item < this.images.sizeStore()) {
        return this.images.element(item).element;
    }

    // move the image snippets of some more results from the result stack to the image stack
    final int feed = Math.min(5, Math.max(1, 10 * this.result.size() / (item + 1)));
    for (int n = 0; n < feed; n++) {
        final plasmaSearchEvent.ResultEntry entry = nextResult();
        if (entry == null) continue; // no result at this position
        final ArrayList<plasmaSnippetCache.MediaSnippet> snippets = entry.mediaSnippets();
        if (snippets == null) continue; // result carries no media snippets
        for (int k = 0; k < snippets.size(); k++) {
            final plasmaSnippetCache.MediaSnippet snippet = snippets.get(k);
            images.push(snippet, snippet.ranking);
        }
    }

    // hand out the requested position only if the image stack is large enough by now
    return (item < this.images.size()) ? this.images.element(item).element : null;
}
public ArrayList<kelondroSortStack<ResultEntry>.stackElement> completeResults(long waitingtime) { public ArrayList<kelondroSortStack<ResultEntry>.stackElement> completeResults(long waitingtime) {
long timeout = System.currentTimeMillis() + waitingtime; long timeout = System.currentTimeMillis() + waitingtime;
while ((result.size() < query.neededResults()) && (anyWorkerAlive()) && (System.currentTimeMillis() < timeout)) { while ((result.size() < query.neededResults()) && (anyWorkerAlive()) && (System.currentTimeMillis() < timeout)) {

@ -232,11 +232,13 @@ public class plasmaSnippetCache {
public int type; public int type;
public yacyURL href; public yacyURL href;
public String name, attr; public String name, attr;
public MediaSnippet(int type, yacyURL href, String name, String attr) { public int ranking;
public MediaSnippet(int type, yacyURL href, String name, String attr, int ranking) {
this.type = type; this.type = type;
this.href = href; this.href = href;
this.name = name; this.name = name;
this.attr = attr; this.attr = attr;
this.ranking = ranking; // the smaller the better! small values should be shown first
if ((this.name == null) || (this.name.length() == 0)) this.name = "_"; if ((this.name == null) || (this.name.length() == 0)) this.name = "_";
if ((this.attr == null) || (this.attr.length() == 0)) this.attr = "_"; if ((this.attr == null) || (this.attr.length() == 0)) this.attr = "_";
} }
@ -677,12 +679,12 @@ public class plasmaSnippetCache {
desc = entry.getValue(); desc = entry.getValue();
s = removeAppearanceHashes(url.toNormalform(false, false), queryhashes); s = removeAppearanceHashes(url.toNormalform(false, false), queryhashes);
if (s.size() == 0) { if (s.size() == 0) {
result.add(new MediaSnippet(mediatype, url, desc, null)); result.add(new MediaSnippet(mediatype, url, desc, null, 0));
continue; continue;
} }
s = removeAppearanceHashes(desc, s); s = removeAppearanceHashes(desc, s);
if (s.size() == 0) { if (s.size() == 0) {
result.add(new MediaSnippet(mediatype, url, desc, null)); result.add(new MediaSnippet(mediatype, url, desc, null, 0));
continue; continue;
} }
} }
@ -691,7 +693,8 @@ public class plasmaSnippetCache {
public static ArrayList<MediaSnippet> computeImageSnippets(plasmaParserDocument document, Set<String> queryhashes) { public static ArrayList<MediaSnippet> computeImageSnippets(plasmaParserDocument document, Set<String> queryhashes) {
TreeSet<htmlFilterImageEntry> images = document.getImages(); TreeSet<htmlFilterImageEntry> images = document.getImages(); // iterates images in descending size order!
// a measurement for the size of the images can be retrieved using the htmlFilterImageEntry.hashCode()
Iterator<htmlFilterImageEntry> i = images.iterator(); Iterator<htmlFilterImageEntry> i = images.iterator();
htmlFilterImageEntry ientry; htmlFilterImageEntry ientry;
@ -705,12 +708,16 @@ public class plasmaSnippetCache {
desc = ientry.alt(); desc = ientry.alt();
s = removeAppearanceHashes(url.toNormalform(false, false), queryhashes); s = removeAppearanceHashes(url.toNormalform(false, false), queryhashes);
if (s.size() == 0) { if (s.size() == 0) {
result.add(new MediaSnippet(plasmaSearchQuery.CONTENTDOM_IMAGE, url, desc, ientry.width() + " x " + ientry.height())); int ranking = ientry.hashCode();
System.out.println(ranking);
result.add(new MediaSnippet(plasmaSearchQuery.CONTENTDOM_IMAGE, url, desc, ientry.width() + " x " + ientry.height(), ranking));
continue; continue;
} }
s = removeAppearanceHashes(desc, s); s = removeAppearanceHashes(desc, s);
if (s.size() == 0) { if (s.size() == 0) {
result.add(new MediaSnippet(plasmaSearchQuery.CONTENTDOM_IMAGE, url, desc, ientry.width() + " x " + ientry.height())); int ranking = ientry.hashCode();
System.out.println(ranking);
result.add(new MediaSnippet(plasmaSearchQuery.CONTENTDOM_IMAGE, url, desc, ientry.width() + " x " + ientry.height(), ranking));
continue; continue;
} }
} }

Loading…
Cancel
Save