From d5bac64421e7b2671da95945fb0a1536920234ae Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Tue, 2 Dec 2014 12:52:36 +0100 Subject: [PATCH] recognize more html file types for snapshots --- source/net/yacy/repository/LoaderDispatcher.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/source/net/yacy/repository/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java index fd33e6fb4..bdfad4ed2 100644 --- a/source/net/yacy/repository/LoaderDispatcher.java +++ b/source/net/yacy/repository/LoaderDispatcher.java @@ -61,6 +61,7 @@ import net.yacy.crawler.retrieval.SMBLoader; import net.yacy.document.Document; import net.yacy.document.Parser; import net.yacy.document.TextParser; +import net.yacy.document.parser.htmlParser; import net.yacy.kelondro.util.FileUtils; import net.yacy.repository.Blacklist.BlacklistType; import net.yacy.search.Switchboard; @@ -212,7 +213,9 @@ public final class LoaderDispatcher { if (protocol.equals("http") || protocol.equals("https")) { // load pdf in case that is wanted. This can later be used to compute a web page preview in the search results boolean depthok = crawlProfile != null && request.depth() <= crawlProfile.snapshotMaxdepth(); - boolean extok = request.url().getFile().length() == 0 || "html|shtml|php".indexOf(MultiProtocolURL.getFileExtension(request.url().getFile())) >= 0; + String file = request.url().getFile(); + String ext = MultiProtocolURL.getFileExtension(file).toLowerCase(); + boolean extok = ext.length() == 0 || file.length() <= 1 || htmlParser.htmlExtensionsSet.contains(ext); if (depthok && extok) { File snapshotFile = sb.snapshots.downloadPDFSnapshot(request.url(), request.depth(), new Date(), crawlProfile.snapshotReplaceold(), sb.getConfigBool("isTransparentProxy", false) ? "http://127.0.0.1:" + sb.getConfigInt("port", 8090) : null); log.info("SNAPSHOT - " + (snapshotFile == null ? "could not generate snapshot for " + request.url().toNormalform(true) : "wrote " + snapshotFile + " for " + request.url().toNormalform(true)));