recognize more html file types for snapshots

10 years ago · d5bac64421
parent 6f0167fac1
commit d5bac64421
1 changed files with 4 additions and 1 deletions
--- a/source/net/yacy/repository/LoaderDispatcher.java
+++ b/source/net/yacy/repository/LoaderDispatcher.java
@@ -61,6 +61,7 @@ import net.yacy.crawler.retrieval.SMBLoader;
 import net.yacy.document.Document;
 import net.yacy.document.Parser;
 import net.yacy.document.TextParser;
+import net.yacy.document.parser.htmlParser;
 import net.yacy.kelondro.util.FileUtils;
 import net.yacy.repository.Blacklist.BlacklistType;
 import net.yacy.search.Switchboard;
@@ -212,7 +213,9 @@ public final class LoaderDispatcher {
        if (protocol.equals("http") || protocol.equals("https")) {
            // load pdf in case that is wanted. This can later be used to compute a web page preview in the search results
            boolean depthok = crawlProfile != null && request.depth() <= crawlProfile.snapshotMaxdepth();
-            boolean extok = request.url().getFile().length() == 0 || "html|shtml|php".indexOf(MultiProtocolURL.getFileExtension(request.url().getFile())) >= 0;
+            String file = request.url().getFile();
+            String ext = MultiProtocolURL.getFileExtension(file).toLowerCase();
+            boolean extok = ext.length() == 0 || file.length() <= 1 || htmlParser.htmlExtensionsSet.contains(ext);
            if (depthok && extok) {
                File snapshotFile = sb.snapshots.downloadPDFSnapshot(request.url(), request.depth(), new Date(), crawlProfile.snapshotReplaceold(), sb.getConfigBool("isTransparentProxy", false) ? "http://127.0.0.1:" + sb.getConfigInt("port", 8090) : null);
                log.info("SNAPSHOT - " + (snapshotFile == null ? "could not generate snapshot for " + request.url().toNormalform(true) : "wrote " + snapshotFile + " for " + request.url().toNormalform(true)));