From a1ee10107968ee124364e588c637bf57131c57db Mon Sep 17 00:00:00 2001
From: Michael Peter Christen
Date: Tue, 2 Dec 2014 12:10:44 +0100
Subject: [PATCH] recognize more html file extensions

---
Review notes (free-form text in this position is not part of the diff):
 * htmlExtensions is the single list of file extensions for this parser.
   htmlExtensionsSet is filled from that array once, in the static
   initializer, and the constructor registers the whole set with one
   SUPPORTED_EXTENSIONS.addAll() call instead of fourteen add() calls.
 * Extensions added relative to the removed add() calls:
   shtm, stm, phtm, tpl, php2.
 * "phtml" appears only once in the array, and the set is declared as
   Set<String> rather than as a raw Set so its element type is explicit.

 .../net/yacy/document/parser/htmlParser.java | 29 ++++++++++---------
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/source/net/yacy/document/parser/htmlParser.java b/source/net/yacy/document/parser/htmlParser.java
index 2c5007145..881fa6cba 100644
--- a/source/net/yacy/document/parser/htmlParser.java
+++ b/source/net/yacy/document/parser/htmlParser.java
@@ -33,7 +33,9 @@ import java.net.MalformedURLException;
 import java.nio.charset.Charset;
 import java.nio.charset.IllegalCharsetNameException;
 import java.nio.charset.UnsupportedCharsetException;
+import java.util.HashSet;
 import java.util.LinkedHashMap;
+import java.util.Set;
 
 import net.yacy.cora.document.encoding.UTF8;
 import net.yacy.cora.document.id.AnchorURL;
@@ -56,22 +58,21 @@ public class htmlParser extends AbstractParser implements Parser {
 
     private static final int maxLinks = 10000;
 
+    public final static String[] htmlExtensions = new String[]{
+        "htm","html","phtml","shtml","shtm","stm","xhtml","phtm",
+        "tpl","php","php2","php3","php4","php5","cfm","asp","aspx","tex","txt"
+    };
+
+    public final static Set<String> htmlExtensionsSet;
+
+    static {
+        htmlExtensionsSet = new HashSet<>(htmlExtensions.length);
+        for (String ext: htmlExtensions) htmlExtensionsSet.add(ext);
+    }
+
     public htmlParser() {
         super("Streaming HTML Parser");
-        this.SUPPORTED_EXTENSIONS.add("htm");
-        this.SUPPORTED_EXTENSIONS.add("html");
-        this.SUPPORTED_EXTENSIONS.add("phtml");
-        this.SUPPORTED_EXTENSIONS.add("shtml");
-        this.SUPPORTED_EXTENSIONS.add("xhtml");
-        this.SUPPORTED_EXTENSIONS.add("php");
-        this.SUPPORTED_EXTENSIONS.add("php3");
-        this.SUPPORTED_EXTENSIONS.add("php4");
-        this.SUPPORTED_EXTENSIONS.add("php5");
-        this.SUPPORTED_EXTENSIONS.add("cfm");
-        this.SUPPORTED_EXTENSIONS.add("asp");
-        this.SUPPORTED_EXTENSIONS.add("aspx");
-        this.SUPPORTED_EXTENSIONS.add("tex");
-        this.SUPPORTED_EXTENSIONS.add("txt");
+        this.SUPPORTED_EXTENSIONS.addAll(htmlExtensionsSet);
         this.SUPPORTED_MIME_TYPES.add("text/html");
         this.SUPPORTED_MIME_TYPES.add("text/xhtml+xml");
         this.SUPPORTED_MIME_TYPES.add("application/xhtml+xml");