#(snippet)#loading snippet ...::#[text]##(/snippet)#
#[date]# | YBR-#[ybr]# | Info | Pictures
diff --git a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java index 6017af1ef..596cb1266 100644 --- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java +++ b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java @@ -109,6 +109,14 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen private serverCharBuffer content; private EventListenerList htmlFilterEventListeners = new EventListenerList(); + /** + * {@link URL} to the favicon that belongs to the document + */ + private URL favicon; + + /** + * The document root {@link URL} + */ private URL root; public htmlFilterContentScraper(URL root) { @@ -207,7 +215,8 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen if (type.equalsIgnoreCase("shortcut icon")) { htmlFilterImageEntry ie = new htmlFilterImageEntry(newLink, linktitle, -1,-1); - images.add(ie); + images.add(ie); + this.favicon = newLink; } else if (!type.equalsIgnoreCase("stylesheet") && !type.equalsIgnoreCase("alternate stylesheet")) { anchors.put(newLink.toString(), linktitle); } @@ -346,6 +355,13 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen public Map getMetas() { return metas; } + + /** + * @return the {@link URL} to the favicon that belongs to the document + */ + public URL getFavicon() { + return this.favicon; + } public String getDescription() { String s = (String) metas.get("description"); diff --git a/source/de/anomic/plasma/plasmaParser.java b/source/de/anomic/plasma/plasmaParser.java index 0b7921b96..628d3efd2 100644 --- a/source/de/anomic/plasma/plasmaParser.java +++ b/source/de/anomic/plasma/plasmaParser.java @@ -765,7 +765,8 @@ public final class plasmaParser { scraper.getText(), scraper.getAnchors(), scraper.getImages()); - //scraper.close(); + //scraper.close(); + ppd.setFavicon(scraper.getFavicon()); return ppd; } catch (MalformedURLException e) { //e.printStackTrace(); diff --git a/source/de/anomic/plasma/plasmaParserDocument.java b/source/de/anomic/plasma/plasmaParserDocument.java index 0e378b529..30233d062 100644 --- a/source/de/anomic/plasma/plasmaParserDocument.java +++ b/source/de/anomic/plasma/plasmaParserDocument.java @@ -83,6 +83,7 @@ public class plasmaParserDocument { // text in image tags. private Map hyperlinks, audiolinks, videolinks, applinks; private Map emaillinks; + private URL favicon; private boolean resorted; private InputStream textStream; @@ -395,6 +396,20 @@ public class plasmaParserDocument { images.addAll(doc.getImages()); } + /** + * @return the {@link URL} to the favicon that belongs to the document + */ + public URL getFavicon() { + return this.favicon; + } + + /** + * @param faviconURL the {@link URL} to the favicon that belongs to the document + */ + public void setFavicon(URL faviconURL) { + this.favicon = faviconURL; + } + public void close() { // try close the output stream if (this.textStream != null) { diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java index 2a2425b9a..e3db13d9f 100644 --- a/source/de/anomic/plasma/plasmaSnippetCache.java +++ b/source/de/anomic/plasma/plasmaSnippetCache.java @@ -89,6 +89,15 @@ public class plasmaSnippetCache { private int snippetsScoreCounter; private kelondroMScoreCluster snippetsScore; private HashMap snippetsCache; + + /** + * a cache holding URLs to favicons specified by the page content, e.g. by using the html link-tag. e.g. + *+ * <link rel="shortcut icon" type="image/x-icon" href="../src/favicon.ico"> + *+ */ + private HashMap faviconCache; + private plasmaHTCache cacheManager; private plasmaParser parser; private serverLog log; @@ -106,7 +115,8 @@ public class plasmaSnippetCache { this.sb = theSb; this.snippetsScoreCounter = 0; this.snippetsScore = new kelondroMScoreCluster(); - this.snippetsCache = new HashMap(); + this.snippetsCache = new HashMap(); + this.faviconCache = new HashMap(); } public class TextSnippet { @@ -115,12 +125,19 @@ public class plasmaSnippetCache { private String error; private int errorCode; private Set remaingHashes; + private URL favicon; + public TextSnippet(URL url, String line, int errorCode, Set remaingHashes, String errortext) { + this(url,line,errorCode,remaingHashes,errortext,null); + } + + public TextSnippet(URL url, String line, int errorCode, Set remaingHashes, String errortext, URL favicon) { this.url = url; this.line = line; this.errorCode = errorCode; this.error = errortext; this.remaingHashes = remaingHashes; + this.favicon = favicon; } public URL getUrl() { return this.url; @@ -213,6 +230,10 @@ public class plasmaSnippetCache { } return l.toString().trim(); } + + public URL getFavicon() { + return this.favicon; + } } public class MediaSnippet { @@ -244,9 +265,9 @@ public class plasmaSnippetCache { int source = SOURCE_CACHE; String wordhashes = yacySearch.set2string(queryhashes); String line = retrieveFromCache(wordhashes, urlhash); - if (line != null) { + if (line != null) { //System.out.println("found snippet for URL " + url + " in cache: " + line); - return new TextSnippet(url, line, source, null, null); + return new TextSnippet(url, line, source, null, null,(URL)this.faviconCache.get(urlhash)); } /* =========================================================================== @@ -300,7 +321,7 @@ public class plasmaSnippetCache { * =========================================================================== */ plasmaParserDocument document = null; try { - document = parseDocument(url, resContentLength, resContent, resInfo); + document = parseDocument(url, resContentLength, resContent, resInfo); } catch (ParserException e) { return new TextSnippet(url, null, ERROR_PARSER_FAILED, queryhashes, e.getMessage()); // cannot be parsed } finally { @@ -311,12 +332,14 @@ public class plasmaSnippetCache { /* =========================================================================== * COMPUTE SNIPPET - * =========================================================================== */ + * =========================================================================== */ + URL resFavicon = document.getFavicon(); + if (resFavicon != null) this.faviconCache.put(urlhash,resFavicon); // we have found a parseable non-empty file: use the lines // compute snippet from text final Iterator sentences = document.getSentences(pre); - if (sentences == null) return new TextSnippet(url, null, ERROR_PARSER_NO_LINES, queryhashes, "parser returned no sentences"); + if (sentences == null) return new TextSnippet(url, null, ERROR_PARSER_NO_LINES, queryhashes, "parser returned no sentences",resFavicon); Object[] tsr = computeTextSnippet(sentences, queryhashes, snippetMaxLength); String textline = (tsr == null) ? null : (String) tsr[0]; Set remainingHashes = (tsr == null) ? queryhashes : (Set) tsr[1]; @@ -335,13 +358,13 @@ public class plasmaSnippetCache { //if (hrefline != null) line += (line.length() == 0) ? hrefline : "