recognize more html file types for snapshots

pull/1/head
Michael Peter Christen 10 years ago
parent 6f0167fac1
commit d5bac64421

@@ -61,6 +61,7 @@ import net.yacy.crawler.retrieval.SMBLoader;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
+import net.yacy.document.parser.htmlParser;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.repository.Blacklist.BlacklistType;
import net.yacy.search.Switchboard;
@@ -212,7 +213,9 @@ public final class LoaderDispatcher {
if (protocol.equals("http") || protocol.equals("https")) {
// load pdf in case that is wanted. This can later be used to compute a web page preview in the search results
boolean depthok = crawlProfile != null && request.depth() <= crawlProfile.snapshotMaxdepth();
-boolean extok = request.url().getFile().length() == 0 || "html|shtml|php".indexOf(MultiProtocolURL.getFileExtension(request.url().getFile())) >= 0;
+String file = request.url().getFile();
+String ext = MultiProtocolURL.getFileExtension(file).toLowerCase();
+boolean extok = ext.length() == 0 || file.length() <= 1 || htmlParser.htmlExtensionsSet.contains(ext);
if (depthok && extok) {
File snapshotFile = sb.snapshots.downloadPDFSnapshot(request.url(), request.depth(), new Date(), crawlProfile.snapshotReplaceold(), sb.getConfigBool("isTransparentProxy", false) ? "http://127.0.0.1:" + sb.getConfigInt("port", 8090) : null);
log.info("SNAPSHOT - " + (snapshotFile == null ? "could not generate snapshot for " + request.url().toNormalform(true) : "wrote " + snapshotFile + " for " + request.url().toNormalform(true)));

Loading…
Cancel
Save