diff --git a/htroot/js/yacysearch.js b/htroot/js/yacysearch.js index 05e2d536b..e1cffc202 100644 --- a/htroot/js/yacysearch.js +++ b/htroot/js/yacysearch.js @@ -65,6 +65,7 @@ function handleTextState(req) { var snippetText = response.getElementsByTagName("text")[0].firstChild.data; var urlHash = response.getElementsByTagName("urlHash")[0].firstChild.data; var status = response.getElementsByTagName("status")[0].firstChild.data; + var span = document.getElementById("h" + urlHash); removeAllChildren(span); @@ -77,6 +78,12 @@ function handleTextState(req) { span.parentNode.parentNode.setAttribute("style", "display: none"); document.getElementById("hidden_results").innerHTML='Some results were hidden, because they do not contain your searchwords anymore, or because they are not accessible. Click here to show them'; } + + // set URL to favicon (if a link-tag was found in the document) + if (response.getElementsByTagName("favicon")[0].firstChild != null) { + var img = document.getElementById("f" + urlHash); + img.src = response.getElementsByTagName("favicon")[0].firstChild.data; + } // replace "" text by node var pos1=snippetText.indexOf(""); diff --git a/htroot/xml/snippet.java b/htroot/xml/snippet.java index 0194d7449..6f99809b5 100644 --- a/htroot/xml/snippet.java +++ b/htroot/xml/snippet.java @@ -70,6 +70,7 @@ public class snippet { } prop.put("link", 0); prop.put("links", 0); + prop.putSafeXML("favicon",snippet.getFavicon()==null?"":snippet.getFavicon().toString()); } else { // attach media information ArrayList mediaSnippets = switchboard.snippetCache.retrieveMediaSnippets(url, queryHashes, media, true, mediasnippet_timeout); @@ -85,6 +86,7 @@ public class snippet { prop.put("text", ""); prop.put("link", mediaSnippets.size()); prop.put("links", mediaSnippets.size()); + prop.put("favicon",""); } diff --git a/htroot/xml/snippet.xml b/htroot/xml/snippet.xml index 3589d50be..452af6a99 100644 --- a/htroot/xml/snippet.xml +++ b/htroot/xml/snippet.xml @@ -4,6 +4,7 @@ 
#[status]# #[urlHash]# #[links]# + #[favicon]# #{link}# #[type]# diff --git a/htroot/xml/util/getpageinfo_p.java b/htroot/xml/util/getpageinfo_p.java index 13ce26deb..dbd8e900f 100644 --- a/htroot/xml/util/getpageinfo_p.java +++ b/htroot/xml/util/getpageinfo_p.java @@ -66,6 +66,7 @@ public class getpageinfo_p { serverObjects prop = new serverObjects(); prop.put("sitemap", ""); prop.put("title", ""); + prop.put("favicon",""); prop.put("robots-allowed", 3); //unknown String actions="title"; if(post!=null && post.containsKey("url")){ @@ -90,15 +91,21 @@ public class getpageinfo_p { serverFileUtils.write(contentString,writer); writer.close(); + // put the document title prop.put("title", scraper.getTitle()); + + // put the favicon that belongs to the document + prop.putSafeXML("favicon", (scraper.getFavicon()==null)?"":scraper.getFavicon().toString()); + + // put keywords String list[]=scraper.getKeywords(); for(int i=0;i=0){ @@ -106,11 +113,7 @@ public class getpageinfo_p { URL theURL = new URL(url); // determine if crawling of the current URL is allowed - if(robotsParser.isDisallowed(theURL)){ - prop.put("robots-allowed", 0); - }else{ - prop.put("robots-allowed", 1); - } + prop.put("robots-allowed", robotsParser.isDisallowed(theURL) ? 0:1); // get the sitemap URL of the domain URL sitemapURL = robotsParser.getSitemapURL(theURL); diff --git a/htroot/xml/util/getpageinfo_p.xml b/htroot/xml/util/getpageinfo_p.xml index cd040da2b..a89d94140 100644 --- a/htroot/xml/util/getpageinfo_p.xml +++ b/htroot/xml/util/getpageinfo_p.xml @@ -3,6 +3,7 @@ #[title]# #(robots-allowed)#0::1::#(/robots-allowed)# #[sitemap]# + #[favicon]# #{tags}# diff --git a/htroot/yacysearch.html b/htroot/yacysearch.html index a2d8e6005..878467cdf 100644 --- a/htroot/yacysearch.html +++ b/htroot/yacysearch.html @@ -150,7 +150,7 @@ document.getElementById("Enter").value = "search again - catch up more links"; #(/recommend)# #(/authorized)# -

#[description]#

+

#[description]#

#(snippet)#loading snippet ...::#[text]##(/snippet)#

#[urlname]#

#[date]# | YBR-#[ybr]# | Info | Pictures

diff --git a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java index 6017af1ef..596cb1266 100644 --- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java +++ b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java @@ -109,6 +109,14 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen private serverCharBuffer content; private EventListenerList htmlFilterEventListeners = new EventListenerList(); + /** + * {@link URL} to the favicon that belongs to the document + */ + private URL favicon; + + /** + * The document root {@link URL} + */ private URL root; public htmlFilterContentScraper(URL root) { @@ -207,7 +215,8 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen if (type.equalsIgnoreCase("shortcut icon")) { htmlFilterImageEntry ie = new htmlFilterImageEntry(newLink, linktitle, -1,-1); - images.add(ie); + images.add(ie); + this.favicon = newLink; } else if (!type.equalsIgnoreCase("stylesheet") && !type.equalsIgnoreCase("alternate stylesheet")) { anchors.put(newLink.toString(), linktitle); } @@ -346,6 +355,13 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen public Map getMetas() { return metas; } + + /** + * @return the {@link URL} to the favicon that belongs to the document + */ + public URL getFavicon() { + return this.favicon; + } public String getDescription() { String s = (String) metas.get("description"); diff --git a/source/de/anomic/plasma/plasmaParser.java b/source/de/anomic/plasma/plasmaParser.java index 0b7921b96..628d3efd2 100644 --- a/source/de/anomic/plasma/plasmaParser.java +++ b/source/de/anomic/plasma/plasmaParser.java @@ -765,7 +765,8 @@ public final class plasmaParser { scraper.getText(), scraper.getAnchors(), scraper.getImages()); - //scraper.close(); + //scraper.close(); + ppd.setFavicon(scraper.getFavicon()); return ppd; } catch (MalformedURLException e) { 
//e.printStackTrace(); diff --git a/source/de/anomic/plasma/plasmaParserDocument.java b/source/de/anomic/plasma/plasmaParserDocument.java index 0e378b529..30233d062 100644 --- a/source/de/anomic/plasma/plasmaParserDocument.java +++ b/source/de/anomic/plasma/plasmaParserDocument.java @@ -83,6 +83,7 @@ public class plasmaParserDocument { // text in image tags. private Map hyperlinks, audiolinks, videolinks, applinks; private Map emaillinks; + private URL favicon; private boolean resorted; private InputStream textStream; @@ -395,6 +396,20 @@ public class plasmaParserDocument { images.addAll(doc.getImages()); } + /** + * @return the {@link URL} to the favicon that belongs to the document + */ + public URL getFavicon() { + return this.favicon; + } + + /** + * @param faviconURL the {@link URL} to the favicon that belongs to the document + */ + public void setFavicon(URL faviconURL) { + this.favicon = faviconURL; + } + public void close() { // try close the output stream if (this.textStream != null) { diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java index 2a2425b9a..e3db13d9f 100644 --- a/source/de/anomic/plasma/plasmaSnippetCache.java +++ b/source/de/anomic/plasma/plasmaSnippetCache.java @@ -89,6 +89,15 @@ public class plasmaSnippetCache { private int snippetsScoreCounter; private kelondroMScoreCluster snippetsScore; private HashMap snippetsCache; + + /** + * A cache holding the URLs of favicons specified by the page content, for example via the HTML link tag, e.g.: + *
+     * 	 <link rel="shortcut icon" type="image/x-icon" href="../src/favicon.ico">
+     * 
+ */ + private HashMap faviconCache; + private plasmaHTCache cacheManager; private plasmaParser parser; private serverLog log; @@ -106,7 +115,8 @@ public class plasmaSnippetCache { this.sb = theSb; this.snippetsScoreCounter = 0; this.snippetsScore = new kelondroMScoreCluster(); - this.snippetsCache = new HashMap(); + this.snippetsCache = new HashMap(); + this.faviconCache = new HashMap(); } public class TextSnippet { @@ -115,12 +125,19 @@ public class plasmaSnippetCache { private String error; private int errorCode; private Set remaingHashes; + private URL favicon; + public TextSnippet(URL url, String line, int errorCode, Set remaingHashes, String errortext) { + this(url,line,errorCode,remaingHashes,errortext,null); + } + + public TextSnippet(URL url, String line, int errorCode, Set remaingHashes, String errortext, URL favicon) { this.url = url; this.line = line; this.errorCode = errorCode; this.error = errortext; this.remaingHashes = remaingHashes; + this.favicon = favicon; } public URL getUrl() { return this.url; @@ -213,6 +230,10 @@ public class plasmaSnippetCache { } return l.toString().trim(); } + + public URL getFavicon() { + return this.favicon; + } } public class MediaSnippet { @@ -244,9 +265,9 @@ public class plasmaSnippetCache { int source = SOURCE_CACHE; String wordhashes = yacySearch.set2string(queryhashes); String line = retrieveFromCache(wordhashes, urlhash); - if (line != null) { + if (line != null) { //System.out.println("found snippet for URL " + url + " in cache: " + line); - return new TextSnippet(url, line, source, null, null); + return new TextSnippet(url, line, source, null, null,(URL)this.faviconCache.get(urlhash)); } /* =========================================================================== @@ -300,7 +321,7 @@ public class plasmaSnippetCache { * =========================================================================== */ plasmaParserDocument document = null; try { - document = parseDocument(url, resContentLength, resContent, resInfo); 
+ document = parseDocument(url, resContentLength, resContent, resInfo); } catch (ParserException e) { return new TextSnippet(url, null, ERROR_PARSER_FAILED, queryhashes, e.getMessage()); // cannot be parsed } finally { @@ -311,12 +332,14 @@ public class plasmaSnippetCache { /* =========================================================================== * COMPUTE SNIPPET - * =========================================================================== */ + * =========================================================================== */ + URL resFavicon = document.getFavicon(); + if (resFavicon != null) this.faviconCache.put(urlhash,resFavicon); // we have found a parseable non-empty file: use the lines // compute snippet from text final Iterator sentences = document.getSentences(pre); - if (sentences == null) return new TextSnippet(url, null, ERROR_PARSER_NO_LINES, queryhashes, "parser returned no sentences"); + if (sentences == null) return new TextSnippet(url, null, ERROR_PARSER_NO_LINES, queryhashes, "parser returned no sentences",resFavicon); Object[] tsr = computeTextSnippet(sentences, queryhashes, snippetMaxLength); String textline = (tsr == null) ? null : (String) tsr[0]; Set remainingHashes = (tsr == null) ? queryhashes : (Set) tsr[1]; @@ -335,13 +358,13 @@ public class plasmaSnippetCache { //if (hrefline != null) line += (line.length() == 0) ? hrefline : "
" + hrefline; if (textline != null) line += (line.length() == 0) ? textline : "
" + textline; - if ((line == null) || (remainingHashes.size() > 0)) return new TextSnippet(url, null, ERROR_NO_MATCH, remainingHashes, "no matching snippet found"); + if ((line == null) || (remainingHashes.size() > 0)) return new TextSnippet(url, null, ERROR_NO_MATCH, remainingHashes, "no matching snippet found",resFavicon); if (line.length() > snippetMaxLength) line = line.substring(0, snippetMaxLength); // finally store this snippet in our own cache storeToCache(wordhashes, urlhash, line); document.close(); - return new TextSnippet(url, line, source, null, null); + return new TextSnippet(url, line, source, null, null, resFavicon); } /**