*) Favicons that are specified in the document content via HTML link tags

are now detected and displayed on the search page (requested by allo).

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3845 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
theli 18 years ago
parent 854eb1492f
commit 339153d40e

@ -66,6 +66,7 @@ function handleTextState(req) {
var urlHash = response.getElementsByTagName("urlHash")[0].firstChild.data; var urlHash = response.getElementsByTagName("urlHash")[0].firstChild.data;
var status = response.getElementsByTagName("status")[0].firstChild.data; var status = response.getElementsByTagName("status")[0].firstChild.data;
var span = document.getElementById("h" + urlHash); var span = document.getElementById("h" + urlHash);
removeAllChildren(span); removeAllChildren(span);
//span.removeChild(span.firstChild); //span.removeChild(span.firstChild);
@ -78,6 +79,12 @@ function handleTextState(req) {
document.getElementById("hidden_results").innerHTML='Some results were hidden, because they do not contain your searchwords anymore, or because they are not accessible. Click here to <a href="javascript:show_hidden_results()">show them</a>'; document.getElementById("hidden_results").innerHTML='Some results were hidden, because they do not contain your searchwords anymore, or because they are not accessible. Click here to <a href="javascript:show_hidden_results()">show them</a>';
} }
// set URL to favicon (if a link-tag was found in the document)
if (response.getElementsByTagName("favicon")[0].firstChild != null) {
var img = document.getElementById("f" + urlHash);
img.src = response.getElementsByTagName("favicon")[0].firstChild.data;
}
// replace "<b>" text by <strong> node // replace "<b>" text by <strong> node
var pos1=snippetText.indexOf("<b>"); var pos1=snippetText.indexOf("<b>");
var pos2=snippetText.indexOf("</b>"); var pos2=snippetText.indexOf("</b>");

@ -70,6 +70,7 @@ public class snippet {
} }
prop.put("link", 0); prop.put("link", 0);
prop.put("links", 0); prop.put("links", 0);
prop.putSafeXML("favicon",snippet.getFavicon()==null?"":snippet.getFavicon().toString());
} else { } else {
// attach media information // attach media information
ArrayList mediaSnippets = switchboard.snippetCache.retrieveMediaSnippets(url, queryHashes, media, true, mediasnippet_timeout); ArrayList mediaSnippets = switchboard.snippetCache.retrieveMediaSnippets(url, queryHashes, media, true, mediasnippet_timeout);
@ -85,6 +86,7 @@ public class snippet {
prop.put("text", ""); prop.put("text", "");
prop.put("link", mediaSnippets.size()); prop.put("link", mediaSnippets.size());
prop.put("links", mediaSnippets.size()); prop.put("links", mediaSnippets.size());
prop.put("favicon","");
} }

@ -4,6 +4,7 @@
<status>#[status]#</status> <status>#[status]#</status>
<urlHash>#[urlHash]#</urlHash> <urlHash>#[urlHash]#</urlHash>
<links>#[links]#</links> <links>#[links]#</links>
<favicon>#[favicon]#</favicon>
#{link}# #{link}#
<link> <link>
<type>#[type]#</type> <type>#[type]#</type>

@ -66,6 +66,7 @@ public class getpageinfo_p {
serverObjects prop = new serverObjects(); serverObjects prop = new serverObjects();
prop.put("sitemap", ""); prop.put("sitemap", "");
prop.put("title", ""); prop.put("title", "");
prop.put("favicon","");
prop.put("robots-allowed", 3); //unknown prop.put("robots-allowed", 3); //unknown
String actions="title"; String actions="title";
if(post!=null && post.containsKey("url")){ if(post!=null && post.containsKey("url")){
@ -90,15 +91,21 @@ public class getpageinfo_p {
serverFileUtils.write(contentString,writer); serverFileUtils.write(contentString,writer);
writer.close(); writer.close();
// put the document title
prop.put("title", scraper.getTitle()); prop.put("title", scraper.getTitle());
// put the favicon that belongs to the document
prop.putSafeXML("favicon", (scraper.getFavicon()==null)?"":scraper.getFavicon().toString());
// put keywords
String list[]=scraper.getKeywords(); String list[]=scraper.getKeywords();
for(int i=0;i<list.length;i++){ for(int i=0;i<list.length;i++){
prop.putSafeXML("tags_"+i+"_tag", list[i]); prop.putSafeXML("tags_"+i+"_tag", list[i]);
} }
prop.put("tags", list.length); prop.put("tags", list.length);
} catch (MalformedURLException e) { } catch (MalformedURLException e) { /* ignore this */
} catch (IOException e) { } catch (IOException e) { /* ignore this */
} }
} }
if(actions.indexOf("robots")>=0){ if(actions.indexOf("robots")>=0){
@ -106,11 +113,7 @@ public class getpageinfo_p {
URL theURL = new URL(url); URL theURL = new URL(url);
// determine if crawling of the current URL is allowed // determine if crawling of the current URL is allowed
if(robotsParser.isDisallowed(theURL)){ prop.put("robots-allowed", robotsParser.isDisallowed(theURL) ? 0:1);
prop.put("robots-allowed", 0);
}else{
prop.put("robots-allowed", 1);
}
// get the sitemap URL of the domain // get the sitemap URL of the domain
URL sitemapURL = robotsParser.getSitemapURL(theURL); URL sitemapURL = robotsParser.getSitemapURL(theURL);

@ -3,6 +3,7 @@
<title>#[title]#</title> <title>#[title]#</title>
<robots>#(robots-allowed)#0::1::#(/robots-allowed)#</robots> <robots>#(robots-allowed)#0::1::#(/robots-allowed)#</robots>
<sitemap>#[sitemap]#</sitemap> <sitemap>#[sitemap]#</sitemap>
<favicon>#[favicon]#</favicon>
<tags> <tags>
#{tags}# #{tags}#
<tag name="#[tag]#" /> <tag name="#[tag]#" />

@ -150,7 +150,7 @@ document.getElementById("Enter").value = "search again - catch up more links";
#(/recommend)# #(/recommend)#
</div> </div>
#(/authorized)# #(/authorized)#
<h4 class="linktitle"><img src="#[favicon]#" class="favicon" width="16" height="16" /><a href="#[url]#" target="_parent">#[description]#</a></h4> <h4 class="linktitle"><img src="#[favicon]#" id="f#[urlhash]#" class="favicon" width="16" height="16" /><a href="#[url]#" target="_parent">#[description]#</a></h4>
<p class="snippet iconindented"><span class="#(snippet)#snippetLoading::snippetLoaded#(/snippet)#" id="h#[urlhash]#">#(snippet)#loading snippet ...::#[text]##(/snippet)#</span></p> <p class="snippet iconindented"><span class="#(snippet)#snippetLoading::snippetLoaded#(/snippet)#" id="h#[urlhash]#">#(snippet)#loading snippet ...::#[text]##(/snippet)#</span></p>
<p class="url iconindented"><a href="#[url]#" id="url#[urlhash]#" target="_parent">#[urlname]#</a></p> <p class="url iconindented"><a href="#[url]#" id="url#[urlhash]#" target="_parent">#[urlname]#</a></p>
<p class="urlinfo iconindented">#[date]# | YBR-#[ybr]# | <a href="ViewFile.html?urlHash=#[urlhash]#&amp;words=#[words]#">Info</a> | <a href="yacysearch.html?cat=image&amp;url=#[url]#&amp;search=#[former]#">Pictures</a></p> <p class="urlinfo iconindented">#[date]# | YBR-#[ybr]# | <a href="ViewFile.html?urlHash=#[urlhash]#&amp;words=#[words]#">Info</a> | <a href="yacysearch.html?cat=image&amp;url=#[url]#&amp;search=#[former]#">Pictures</a></p>

@ -109,6 +109,14 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
private serverCharBuffer content; private serverCharBuffer content;
private EventListenerList htmlFilterEventListeners = new EventListenerList(); private EventListenerList htmlFilterEventListeners = new EventListenerList();
/**
* {@link URL} to the favicon that belongs to the document
*/
private URL favicon;
/**
* The document root {@link URL}
*/
private URL root; private URL root;
public htmlFilterContentScraper(URL root) { public htmlFilterContentScraper(URL root) {
@ -208,6 +216,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
if (type.equalsIgnoreCase("shortcut icon")) { if (type.equalsIgnoreCase("shortcut icon")) {
htmlFilterImageEntry ie = new htmlFilterImageEntry(newLink, linktitle, -1,-1); htmlFilterImageEntry ie = new htmlFilterImageEntry(newLink, linktitle, -1,-1);
images.add(ie); images.add(ie);
this.favicon = newLink;
} else if (!type.equalsIgnoreCase("stylesheet") && !type.equalsIgnoreCase("alternate stylesheet")) { } else if (!type.equalsIgnoreCase("stylesheet") && !type.equalsIgnoreCase("alternate stylesheet")) {
anchors.put(newLink.toString(), linktitle); anchors.put(newLink.toString(), linktitle);
} }
@ -347,6 +356,13 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
return metas; return metas;
} }
/**
* Returns the favicon location recorded for this document.
*
* @return the favicon {@link URL}; may be <code>null</code> when no
*         favicon has been recorded
*/
public URL getFavicon() {
    return favicon;
}
public String getDescription() { public String getDescription() {
String s = (String) metas.get("description"); String s = (String) metas.get("description");
if (s == null) return ""; else return s; if (s == null) return ""; else return s;

@ -766,6 +766,7 @@ public final class plasmaParser {
scraper.getAnchors(), scraper.getAnchors(),
scraper.getImages()); scraper.getImages());
//scraper.close(); //scraper.close();
ppd.setFavicon(scraper.getFavicon());
return ppd; return ppd;
} catch (MalformedURLException e) { } catch (MalformedURLException e) {
//e.printStackTrace(); //e.printStackTrace();

@ -83,6 +83,7 @@ public class plasmaParserDocument {
// text in image tags. // text in image tags.
private Map hyperlinks, audiolinks, videolinks, applinks; private Map hyperlinks, audiolinks, videolinks, applinks;
private Map emaillinks; private Map emaillinks;
private URL favicon;
private boolean resorted; private boolean resorted;
private InputStream textStream; private InputStream textStream;
@ -395,6 +396,20 @@ public class plasmaParserDocument {
images.addAll(doc.getImages()); images.addAll(doc.getImages());
} }
/**
* Returns the favicon location stored on this parsed document.
*
* @return the favicon {@link URL} previously passed to
*         {@link #setFavicon(URL)}; may be <code>null</code>
*/
public URL getFavicon() {
    return favicon;
}
/**
* Stores the favicon location for this parsed document, replacing any
* previously stored value.
*
* @param faviconURL the {@link URL} of the favicon that belongs to the document
*/
public void setFavicon(URL faviconURL) {
    favicon = faviconURL;
}
public void close() { public void close() {
// try close the output stream // try close the output stream
if (this.textStream != null) { if (this.textStream != null) {

@ -89,6 +89,15 @@ public class plasmaSnippetCache {
private int snippetsScoreCounter; private int snippetsScoreCounter;
private kelondroMScoreCluster snippetsScore; private kelondroMScoreCluster snippetsScore;
private HashMap snippetsCache; private HashMap snippetsCache;
/**
* A cache holding URLs to favicons specified by the page content, for example via an HTML link tag such as:
* <pre>
* &lt;link rel="shortcut icon" type="image/x-icon" href="../src/favicon.ico"&gt;
* </pre>
*/
private HashMap faviconCache;
private plasmaHTCache cacheManager; private plasmaHTCache cacheManager;
private plasmaParser parser; private plasmaParser parser;
private serverLog log; private serverLog log;
@ -107,6 +116,7 @@ public class plasmaSnippetCache {
this.snippetsScoreCounter = 0; this.snippetsScoreCounter = 0;
this.snippetsScore = new kelondroMScoreCluster(); this.snippetsScore = new kelondroMScoreCluster();
this.snippetsCache = new HashMap(); this.snippetsCache = new HashMap();
this.faviconCache = new HashMap();
} }
public class TextSnippet { public class TextSnippet {
@ -115,12 +125,19 @@ public class plasmaSnippetCache {
private String error; private String error;
private int errorCode; private int errorCode;
private Set remaingHashes; private Set remaingHashes;
private URL favicon;
public TextSnippet(URL url, String line, int errorCode, Set remaingHashes, String errortext) { public TextSnippet(URL url, String line, int errorCode, Set remaingHashes, String errortext) {
this(url,line,errorCode,remaingHashes,errortext,null);
}
public TextSnippet(URL url, String line, int errorCode, Set remaingHashes, String errortext, URL favicon) {
this.url = url; this.url = url;
this.line = line; this.line = line;
this.errorCode = errorCode; this.errorCode = errorCode;
this.error = errortext; this.error = errortext;
this.remaingHashes = remaingHashes; this.remaingHashes = remaingHashes;
this.favicon = favicon;
} }
public URL getUrl() { public URL getUrl() {
return this.url; return this.url;
@ -213,6 +230,10 @@ public class plasmaSnippetCache {
} }
return l.toString().trim(); return l.toString().trim();
} }
/**
* Returns the favicon location attached to this text snippet.
*
* @return the favicon {@link URL} given at construction time; may be
*         <code>null</code> (the five-argument constructor passes
*         <code>null</code>)
*/
public URL getFavicon() {
    return favicon;
}
} }
public class MediaSnippet { public class MediaSnippet {
@ -246,7 +267,7 @@ public class plasmaSnippetCache {
String line = retrieveFromCache(wordhashes, urlhash); String line = retrieveFromCache(wordhashes, urlhash);
if (line != null) { if (line != null) {
//System.out.println("found snippet for URL " + url + " in cache: " + line); //System.out.println("found snippet for URL " + url + " in cache: " + line);
return new TextSnippet(url, line, source, null, null); return new TextSnippet(url, line, source, null, null,(URL)this.faviconCache.get(urlhash));
} }
/* =========================================================================== /* ===========================================================================
@ -312,11 +333,13 @@ public class plasmaSnippetCache {
/* =========================================================================== /* ===========================================================================
* COMPUTE SNIPPET * COMPUTE SNIPPET
* =========================================================================== */ * =========================================================================== */
URL resFavicon = document.getFavicon();
if (resFavicon != null) this.faviconCache.put(urlhash,resFavicon);
// we have found a parseable non-empty file: use the lines // we have found a parseable non-empty file: use the lines
// compute snippet from text // compute snippet from text
final Iterator sentences = document.getSentences(pre); final Iterator sentences = document.getSentences(pre);
if (sentences == null) return new TextSnippet(url, null, ERROR_PARSER_NO_LINES, queryhashes, "parser returned no sentences"); if (sentences == null) return new TextSnippet(url, null, ERROR_PARSER_NO_LINES, queryhashes, "parser returned no sentences",resFavicon);
Object[] tsr = computeTextSnippet(sentences, queryhashes, snippetMaxLength); Object[] tsr = computeTextSnippet(sentences, queryhashes, snippetMaxLength);
String textline = (tsr == null) ? null : (String) tsr[0]; String textline = (tsr == null) ? null : (String) tsr[0];
Set remainingHashes = (tsr == null) ? queryhashes : (Set) tsr[1]; Set remainingHashes = (tsr == null) ? queryhashes : (Set) tsr[1];
@ -335,13 +358,13 @@ public class plasmaSnippetCache {
//if (hrefline != null) line += (line.length() == 0) ? hrefline : "<br />" + hrefline; //if (hrefline != null) line += (line.length() == 0) ? hrefline : "<br />" + hrefline;
if (textline != null) line += (line.length() == 0) ? textline : "<br />" + textline; if (textline != null) line += (line.length() == 0) ? textline : "<br />" + textline;
if ((line == null) || (remainingHashes.size() > 0)) return new TextSnippet(url, null, ERROR_NO_MATCH, remainingHashes, "no matching snippet found"); if ((line == null) || (remainingHashes.size() > 0)) return new TextSnippet(url, null, ERROR_NO_MATCH, remainingHashes, "no matching snippet found",resFavicon);
if (line.length() > snippetMaxLength) line = line.substring(0, snippetMaxLength); if (line.length() > snippetMaxLength) line = line.substring(0, snippetMaxLength);
// finally store this snippet in our own cache // finally store this snippet in our own cache
storeToCache(wordhashes, urlhash, line); storeToCache(wordhashes, urlhash, line);
document.close(); document.close();
return new TextSnippet(url, line, source, null, null); return new TextSnippet(url, line, source, null, null, resFavicon);
} }
/** /**

Loading…
Cancel
Save