From 8ca1f5d400c370f2c61d1abe8c406967fd75cb6f Mon Sep 17 00:00:00 2001 From: orbiter Date: Thu, 9 Jul 2009 20:56:30 +0000 Subject: [PATCH] - some work to integrate the html parser the same way as the other parsers are integrated (not finished) - added migration of code of settings pages (hmm.. does not work correctly yet, sorry) - more refactoring - removed more unused code git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6187 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- defaults/yacy.init | 13 +- htroot/SettingsAck_p.java | 57 ++--- htroot/Settings_Parser.inc | 22 +- htroot/Settings_p.java | 49 +--- source/de/anomic/document/AbstractParser.java | 16 +- .../de/anomic/document/ParserDispatcher.java | 220 +----------------- .../de/anomic/document/parser/bzipParser.java | 12 +- .../de/anomic/document/parser/docParser.java | 10 +- .../de/anomic/document/parser/gzipParser.java | 8 +- .../document/parser/html/ContentScraper.java | 4 +- .../de/anomic/document/parser/htmlParser.java | 195 ++++++++++++++++ .../document/parser/mimeTypeParser.java | 13 +- .../de/anomic/document/parser/odtParser.java | 8 +- .../de/anomic/document/parser/pdfParser.java | 14 +- .../de/anomic/document/parser/pptParser.java | 15 +- .../de/anomic/document/parser/psParser.java | 10 +- .../de/anomic/document/parser/rpmParser.java | 10 +- .../de/anomic/document/parser/rssParser.java | 10 +- .../de/anomic/document/parser/rtfParser.java | 10 +- .../document/parser/sevenzipParser.java | 10 +- .../de/anomic/document/parser/swfParser.java | 8 +- .../de/anomic/document/parser/tarParser.java | 10 +- .../de/anomic/document/parser/vcfParser.java | 12 +- .../de/anomic/document/parser/vsdParser.java | 11 +- .../de/anomic/document/parser/xlsParser.java | 11 +- .../de/anomic/document/parser/zipParser.java | 8 +- source/de/anomic/http/httpdFileHandler.java | 3 +- .../de/anomic/plasma/plasmaSwitchboard.java | 2 +- .../plasma/plasmaSwitchboardConstants.java | 5 +- source/de/anomic/search/RankingProcess.java | 
2 - source/migration.java | 6 - 31 files changed, 283 insertions(+), 501 deletions(-) create mode 100644 source/de/anomic/document/parser/htmlParser.java diff --git a/defaults/yacy.init b/defaults/yacy.init index a2ca90d95..6624a162e 100644 --- a/defaults/yacy.init +++ b/defaults/yacy.init @@ -249,16 +249,15 @@ minimumGlobalDelta = 500 # the following mime-types are the whitelist for indexing # -# parseableMimeTypes.HTML: specifies mime-types that can be indexed with built-in html parser -# parseableMime: specifies mime-types that can be indexed but not on the fly -parseableMimeTypes.HTML=application/xhtml+xml,text/html,text/plain,text/sgml +# parseableMime: specifies mime-types that can be indexed with any built-in parser parseableMimeTypes=application/atom+xml,application/bzip2,application/excel,application/gzip,application/java-archive,application/msexcel,application/mspowerpoint,application/msword,application/octet-stream,application/pdf,application/postscript,application/powerpoint,application/rdf+xml,application/rss+xml,application/rtf,application/tar,application/vcard,application/visio,application/visio.drawing,application/vnd.ms-excel,application/vnd.ms-powerpoint,application/vnd.oasis.opendocument.text,application/vnd.visio,application/vsd,application/x-7z-compressed,application/x-bz2,application/x-bzip2,application/x-compress,application/x-compressed,application/x-excel,application/x-gzip,application/x-msexcel,application/x-redhat packet manager,application/x-redhat-package-manager,application/x-rpm,application/x-shockwave-flash,application/x-shockwave-flash2-preview,application/x-tar,application/x-visio,application/x-vnd.oasis.opendocument.text,application/x-vsd,application/x-xml,application/x-zip,application/x-zip-compressed,application/xml,application/zip,image/x-vsd,text/postscript,text/rss,text/rtf,text/x-vcard,text/xml,zz-application/zz-winassoc-vsd 
-parseableMimeTypes.CRAWLER=application/atom+xml,application/bzip2,application/excel,application/gzip,application/java-archive,application/msexcel,application/mspowerpoint,application/msword,application/octet-stream,application/pdf,application/postscript,application/powerpoint,application/rdf+xml,application/rss+xml,application/rtf,application/tar,application/vcard,application/visio,application/visio.drawing,application/vnd.ms-excel,application/vnd.ms-powerpoint,application/vnd.oasis.opendocument.text,application/vnd.visio,application/vsd,application/x-7z-compressed,application/x-bz2,application/x-bzip2,application/x-compress,application/x-compressed,application/x-excel,application/x-gzip,application/x-msexcel,application/x-redhat packet manager,application/x-redhat-package-manager,application/x-rpm,application/x-shockwave-flash,application/x-shockwave-flash2-preview,application/x-tar,application/x-visio,application/x-vnd.oasis.opendocument.text,application/x-vsd,application/x-xml,application/x-zip,application/x-zip-compressed,application/xml,application/zip,image/x-vsd,text/postscript,text/rss,text/rtf,text/x-vcard,text/xml,zz-application/zz-winassoc-vsd -parseableMimeTypes.PROXY=application/atom+xml,application/bzip2,application/excel,application/gzip,application/java-archive,application/msexcel,application/mspowerpoint,application/msword,application/octet-stream,application/pdf,application/postscript,application/powerpoint,application/rdf+xml,application/rss+xml,application/rtf,application/tar,application/vcard,application/visio,application/visio.drawing,application/vnd.ms-excel,application/vnd.ms-powerpoint,application/vnd.oasis.opendocument.text,application/vnd.visio,application/vsd,application/x-7z-compressed,application/x-bz2,application/x-bzip2,application/x-compress,application/x-compressed,application/x-excel,application/x-gzip,application/x-msexcel,application/x-redhat packet 
manager,application/x-redhat-package-manager,application/x-rpm,application/x-shockwave-flash,application/x-shockwave-flash2-preview,application/x-tar,application/x-visio,application/x-vnd.oasis.opendocument.text,application/x-vsd,application/x-xml,application/x-zip,application/x-zip-compressed,application/xml,application/zip,image/x-vsd,text/postscript,text/rss,text/rtf,text/x-vcard,text/xml,zz-application/zz-winassoc-vsd -parseableMimeTypes.ICAP=application/atom+xml,application/bzip2,application/excel,application/gzip,application/java-archive,application/msexcel,application/mspowerpoint,application/msword,application/octet-stream,application/pdf,application/postscript,application/powerpoint,application/rdf+xml,application/rss+xml,application/rtf,application/tar,application/vcard,application/visio,application/visio.drawing,application/vnd.ms-excel,application/vnd.ms-powerpoint,application/vnd.oasis.opendocument.text,application/vnd.visio,application/vsd,application/x-7z-compressed,application/x-bz2,application/x-bzip2,application/x-compress,application/x-compressed,application/x-excel,application/x-gzip,application/x-msexcel,application/x-redhat packet manager,application/x-redhat-package-manager,application/x-rpm,application/x-shockwave-flash,application/x-shockwave-flash2-preview,application/x-tar,application/x-visio,application/x-vnd.oasis.opendocument.text,application/x-vsd,application/x-xml,application/x-zip,application/x-zip-compressed,application/xml,application/zip,image/x-vsd,text/postscript,text/rss,text/rtf,text/x-vcard,text/xml,zz-application/zz-winassoc-vsd 
-parseableMimeTypes.URLREDIRECTOR=application/atom+xml,application/bzip2,application/excel,application/gzip,application/java-archive,application/msexcel,application/mspowerpoint,application/msword,application/octet-stream,application/pdf,application/postscript,application/powerpoint,application/rdf+xml,application/rss+xml,application/rtf,application/tar,application/vcard,application/visio,application/visio.drawing,application/vnd.ms-excel,application/vnd.ms-powerpoint,application/vnd.oasis.opendocument.text,application/vnd.visio,application/vsd,application/x-7z-compressed,application/x-bz2,application/x-bzip2,application/x-compress,application/x-compressed,application/x-excel,application/x-gzip,application/x-msexcel,application/x-redhat packet manager,application/x-redhat-package-manager,application/x-rpm,application/x-shockwave-flash,application/x-shockwave-flash2-preview,application/x-tar,application/x-visio,application/x-vnd.oasis.opendocument.text,application/x-vsd,application/x-xml,application/x-zip,application/x-zip-compressed,application/xml,application/zip,image/x-vsd,text/postscript,text/rss,text/rtf,text/x-vcard,text/xml,zz-application/zz-winassoc-vsd + +# parseableMimeTypes.IMAGE: specifies mime-types that refer to image type content parseableMimeTypes.IMAGE=image/gif,image/jpeg,image/png,image/tiff,image/vnd.wap.wbmp,image/x-icon,image/bmp +# parseableMimeTypes.HTML: specifies mime-types that can be indexed with built-in html parser +parseableMimeTypes.HTML=application/xhtml+xml,text/html,text/plain,text/sgml + # media extension string # a comma-separated list of extensions that denote media file formats # this is important to recognize - tags as not-html reference diff --git a/htroot/SettingsAck_p.java b/htroot/SettingsAck_p.java index d995e9e53..d35ba6609 100644 --- a/htroot/SettingsAck_p.java +++ b/htroot/SettingsAck_p.java @@ -33,7 +33,6 @@ import java.util.Arrays; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; 
-import java.util.Set; import java.util.StringTokenizer; import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; @@ -458,54 +457,34 @@ public class SettingsAck_p { */ if (post.containsKey("parserSettings")) { post.remove("parserSettings"); - /* - final Set parserModes = ParserDispatcher.getParserConfigList().keySet(); - final HashMap> newConfigList = new HashMap>(); - Iterator parserModeIter = parserModes.iterator(); - while (parserModeIter.hasNext()) { - final String currParserMode = parserModeIter.next(); - newConfigList.put(currParserMode, new HashSet()); - } - // looping through all received settings - int pos; + final HashSet newConfig = new HashSet(); + + // loop through all received settings final Iterator keyEnum = post.keySet().iterator(); - while (keyEnum.hasNext()) { - final String key = keyEnum.next(); - if ((pos = key.indexOf(".")) != -1) { - final String currParserMode = key.substring(0,pos).trim().toUpperCase(); - final String currMimeType = key.substring(pos+1).replaceAll("\n", ""); - if (parserModes.contains(currParserMode)) { - HashSet currEnabledMimeTypes; - assert (newConfigList.containsKey(currParserMode)) : "Unexpected Error"; - currEnabledMimeTypes = newConfigList.get(currParserMode); - currEnabledMimeTypes.add(currMimeType); - } - } + while (keyEnum.hasNext()) { + String key = keyEnum.next(); + if (key.startsWith("mimename")) newConfig.add(post.get(key)); } int enabledMimesCount = 0; final StringBuilder currEnabledMimesTxt = new StringBuilder(); - parserModeIter = newConfigList.keySet().iterator(); - while (parserModeIter.hasNext()) { - final String currParserMode = parserModeIter.next(); - final String[] enabledMimes = ParserDispatcher.setEnabledParserList(newConfigList.get(currParserMode)); - Arrays.sort(enabledMimes); - - currEnabledMimesTxt.setLength(0); - for (int i=0; i < enabledMimes.length; i++) { - currEnabledMimesTxt.append(enabledMimes[i]).append(","); - prop.put("info_parser_" + enabledMimesCount + 
"_parserMode",currParserMode); - prop.put("info_parser_" + enabledMimesCount + "_enabledMime",enabledMimes[i]); - enabledMimesCount++; - } - if (currEnabledMimesTxt.length() > 0) currEnabledMimesTxt.deleteCharAt(currEnabledMimesTxt.length()-1); - env.setConfig("parseableMimeTypes." + currParserMode,currEnabledMimesTxt.toString()); + final String[] enabledMimes = ParserDispatcher.setEnabledParserList(newConfig); + Arrays.sort(enabledMimes); + + currEnabledMimesTxt.setLength(0); + for (int i=0; i < enabledMimes.length; i++) { + currEnabledMimesTxt.append(enabledMimes[i]).append(","); + prop.put("info_parser_" + enabledMimesCount + "_enabledMime", newConfig.toString()); + enabledMimesCount++; } + if (currEnabledMimesTxt.length() > 0) currEnabledMimesTxt.deleteCharAt(currEnabledMimesTxt.length()-1); + env.setConfig("parseableMimeTypes", currEnabledMimesTxt.toString()); + prop.put("info_parser",enabledMimesCount); prop.put("info", "18"); return prop; - */ + } // Crawler settings diff --git a/htroot/Settings_Parser.inc b/htroot/Settings_Parser.inc index 621a3af0c..6038a4be6 100644 --- a/htroot/Settings_Parser.inc +++ b/htroot/Settings_Parser.inc @@ -6,31 +6,27 @@ http://www.iana.org/assignments/media-types/

- #{parserMode}# - #{/parserMode}# + + - #{parser}# - - - + + #{mime}# - #{parserMode}# - #{/parserMode}# + + - #{/mime}# #{/parser}# - #{parserMode}# + #{/parserMode}# - + - +
#[name]#
enable/disable Parser Mime-TypeParser Usage
#[name]# V#[version]# #[usage]##[name]# 
#[mimetype]# 
-   Enable all parsers
Changes take effect immediately Changes take effect immediately
diff --git a/htroot/Settings_p.java b/htroot/Settings_p.java index 2dbf77d95..4631c6aee 100644 --- a/htroot/Settings_p.java +++ b/htroot/Settings_p.java @@ -26,11 +26,10 @@ import java.util.Enumeration; import java.util.HashMap; -import java.util.HashSet; import java.util.Iterator; +import de.anomic.document.Parser; import de.anomic.document.ParserDispatcher; -import de.anomic.document.ParserConfig; import de.anomic.http.httpHeader; import de.anomic.http.httpRequestHeader; import de.anomic.plasma.plasmaSwitchboard; @@ -217,47 +216,19 @@ public final class Settings_p { /* * Parser Configuration */ - /* - final HashMap configList = ParserDispatcher.getParserConfigList(); - final plasmaParserConfig[] configArray = configList.values().toArray(new plasmaParserConfig[configList.size()]); - - final HashSet parserInfos = new HashSet(ParserDispatcher.getAvailableParserList().values()); - -// // fetching a list of all available mimetypes -// List availableParserKeys = Arrays.asList(availableParsers.entrySet().toArray(new ParserInfo[availableParsers.size()])); -// -// // sort it -// Collections.sort(availableParserKeys); - - // loop through the mimeTypes and add it to the properties - final boolean[] allParsersEnabled = new boolean[configList.size()]; - for (int i=0; i availableParserIter = parserInfos.iterator(); + final Iterator availableParserIter = ParserDispatcher.availableParserList.values().iterator(); while (availableParserIter.hasNext()) { - final ParserInfo parserInfo = availableParserIter.next(); - prop.put("parser_" + parserIdx + "_name", parserInfo.parserName); - prop.putXML("parser_" + parserIdx + "_version", parserInfo.parserVersionNr); - prop.put("parser_" + parserIdx + "_usage", parserInfo.usageCount); - prop.put("parser_" + parserIdx + "_colspan", configArray.length); + final Parser parserInfo = availableParserIter.next(); + prop.put("parser_" + parserIdx + "_name", parserInfo.getName()); int mimeIdx = 0; - final Enumeration mimeTypeIter = 
parserInfo.supportedMimeTypes.keys(); + final Enumeration mimeTypeIter = parserInfo.getSupportedMimeTypes().keys(); while (mimeTypeIter.hasMoreElements()) { final String mimeType = mimeTypeIter.nextElement(); - prop.put("parser_" + parserIdx + "_mime_" + mimeIdx + "_mimetype", mimeType); - //prop.put("parser_" + parserIdx + "_name", parserName); - //prop.put("parser_" + parserIdx + "_shortname", parserName.substring(parserName.lastIndexOf(".")+1)); - for (int i=0; i enabledParsers = configArray[i].getEnabledParserList(); - prop.put("parser_" + parserIdx + "_mime_" + mimeIdx + "_parserMode_" + i + "_optionName", configArray[i].parserMode + "." + mimeType); - prop.put("parser_" + parserIdx + "_mime_" + mimeIdx + "_parserMode_" + i + "_status", enabledParsers.contains(mimeType) ? "1" : "0"); - allParsersEnabled[i] &= enabledParsers.contains(mimeType); - } - prop.put("parser_" + parserIdx + "_mime_" + mimeIdx + "_parserMode", configArray.length); + prop.put("parser_" + parserIdx + "_mime_" + mimeIdx + "_status", (ParserDispatcher.supportedMimeTypesContains(mimeType)) ? 1 : 0); mimeIdx++; } prop.put("parser_" + parserIdx + "_mime", mimeIdx); @@ -265,14 +236,8 @@ public final class Settings_p { parserIdx++; } - for (int i=0; i videoExtSet = new HashSet(); private static final HashSet appsExtSet = new HashSet(); - /** - * This {@link FilenameFilter} is used to find all classes based on there filenames - * which seems to be additional content parsers. - * Currently the filenames of all content parser classes must end with Parser.class - */ - /* - private static final FilenameFilter parserFileNameFilter = new FilenameFilter() { - public boolean accept(File dir, String name) { - return name.endsWith("Parser.class"); - } - }; - */ - - /** - * This {@link FileFilter} is used to get all subpackages - * of the parser package. 
- */ - /* - private static final FileFilter parserDirectoryFilter = new FileFilter() { - public boolean accept(File file) { - return file.isDirectory(); - } - }; - */ /** * Initializing the @@ -146,9 +115,6 @@ public final class ParserDispatcher { private static final Log theLogger = new Log("PARSER"); - public Log getLogger() { - return theLogger; - } /** * This function is used to initialize the HTMLParsableMimeTypes List. @@ -187,59 +153,42 @@ public final class ParserDispatcher { } } - public static void initImageExt(final List imageExtList) { + private static void initImageExt(final List imageExtList) { synchronized (imageExtSet) { imageExtSet.addAll(imageExtList); } } - public static void initAudioExt(final List audioExtList) { + private static void initAudioExt(final List audioExtList) { synchronized (audioExtSet) { audioExtSet.addAll(audioExtList); } } - public static void initVideoExt(final List videoExtList) { + private static void initVideoExt(final List videoExtList) { synchronized (videoExtSet) { videoExtSet.addAll(videoExtList); } } - public static void initAppsExt(final List appsExtList) { + private static void initAppsExt(final List appsExtList) { synchronized (appsExtSet) { appsExtSet.addAll(appsExtList); } } - public static String getMediaExtList() { - synchronized (mediaExtSet) { - return mediaExtSet.toString(); - } - } - public static void initSupportedHTMLFileExt(final List supportedRealtimeFileExtList) { synchronized (supportedHTMLFileExt) { supportedHTMLFileExt.addAll(supportedRealtimeFileExtList); } } - public static boolean HTMLParsableMimeTypesContains(String mimeType) { + private static boolean HTMLParsableMimeTypesContains(String mimeType) { mimeType = normalizeMimeType(mimeType); synchronized (supportedHTMLMimeTypes) { return supportedHTMLMimeTypes.contains(mimeType); } } - - public static boolean supportedHTMLContent(final yacyURL url, final String mimeType) { - return HTMLParsableMimeTypesContains(mimeType) && 
supportedHTMLFileExtContains(url); - } - - public static boolean supportedHTMLFileExtContains(final yacyURL url) { - final String fileExt = getFileExt(url); - synchronized (supportedHTMLFileExt) { - return supportedHTMLFileExt.contains(fileExt); - } - } public static String getFileExt(final yacyURL url) { // getting the file path @@ -300,81 +249,6 @@ public final class ParserDispatcher { } } - /** - * some html authors use wrong encoding names, either because they don't know exactly what they - * are doing or they produce a type. Many times, the upper/downcase scheme of the name is fuzzy - * This method patches wrong encoding names. The correct names are taken from - * http://www.iana.org/assignments/character-sets - * @param encoding - * @return patched encoding name - */ - public static String patchCharsetEncoding(String encoding) { - - // return the system default encoding - if ((encoding == null) || (encoding.length() < 3)) return Charset.defaultCharset().name(); - - // trim encoding string - encoding = encoding.trim(); - - // fix upper/lowercase - encoding = encoding.toUpperCase(); - if (encoding.startsWith("SHIFT")) return "Shift_JIS"; - if (encoding.startsWith("BIG")) return "Big5"; - // all other names but such with "windows" use uppercase - if (encoding.startsWith("WINDOWS")) encoding = "windows" + encoding.substring(7); - if (encoding.startsWith("MACINTOSH")) encoding = "MacRoman"; - - // fix wrong fill characters - encoding = encoding.replaceAll("_", "-"); - - if (encoding.matches("GB[_-]?2312([-_]80)?")) return "GB2312"; - if (encoding.matches(".*UTF[-_]?8.*")) return "UTF-8"; - if (encoding.startsWith("US")) return "US-ASCII"; - if (encoding.startsWith("KOI")) return "KOI8-R"; - - // patch missing '-' - if (encoding.startsWith("windows") && encoding.length() > 7) { - final char c = encoding.charAt(7); - if ((c >= '0') && (c <= '9')) { - encoding = "windows-" + encoding.substring(7); - } - } - - if (encoding.startsWith("ISO")) { - // patch typos - if 
(encoding.length() > 3) { - final char c = encoding.charAt(3); - if ((c >= '0') && (c <= '9')) { - encoding = "ISO-" + encoding.substring(3); - } - } - if (encoding.length() > 8) { - final char c = encoding.charAt(8); - if ((c >= '0') && (c <= '9')) { - encoding = encoding.substring(0, 8) + "-" + encoding.substring(8); - } - } - } - - // patch wrong name - if (encoding.startsWith("ISO-8559")) { - // popular typo - encoding = "ISO-8859" + encoding.substring(8); - } - - // converting cp\d{4} -> windows-\d{4} - if (encoding.matches("CP([_-])?125[0-8]")) { - final char c = encoding.charAt(2); - if ((c >= '0') && (c <= '9')) { - encoding = "windows-" + encoding.substring(2); - } else { - encoding = "windows" + encoding.substring(2); - } - } - - return encoding; - } - public static String normalizeMimeType(String mimeType) { //if (mimeType == null) doMimeTypeAnalysis if (mimeType == null) mimeType = "application/octet-stream"; @@ -519,7 +393,7 @@ public final class ParserDispatcher { // getting the charset of the document // TODO: do a charset detection here .... - final String documentCharset = patchCharsetEncoding(theDocumentCharset); + final String documentCharset = htmlParser.patchCharsetEncoding(theDocumentCharset); // testing if parsing is supported for this resource if (!supportedContent(location,mimeType)) { @@ -543,7 +417,7 @@ public final class ParserDispatcher { // parse the resource doc = theParser.parse(location, mimeType,documentCharset,sourceStream); } else if (HTMLParsableMimeTypesContains(mimeType)) { - doc = parseHtml(location, mimeType, documentCharset, sourceStream); + doc = new htmlParser().parse(location, mimeType, documentCharset, sourceStream); } else { final String errorMsg = "No parser available to parse mimetype '" + mimeType + "' (2)"; theLogger.logInfo("Unable to parse '" + location + "'. 
" + errorMsg); @@ -558,17 +432,6 @@ public final class ParserDispatcher { } return doc; - } catch (final UnsupportedEncodingException e) { - final String errorMsg = "unsupported charset encoding: " + e.getMessage(); - theLogger.logSevere("Unable to parse '" + location + "'. " + errorMsg, e); - throw new ParserException(errorMsg,location, errorMsg); - } catch (final IOException e) { - // IOExceptions may occur during html parsing when a server closes the connection during reading. - // This may happen here, because the html parser is a streaming parser - // that produces surrogates while the connection is active - final String errorMsg = "IOException - server may have closed the connection. " + e.getMessage(); - theLogger.logWarning("Unable to parse '" + location + "'. " + errorMsg); - throw new ParserException(errorMsg, location, errorMsg); } catch (final Exception e) { // Interrupted- and Parser-Exceptions should pass through if (e instanceof InterruptedException) throw (InterruptedException) e; @@ -586,71 +449,8 @@ public final class ParserDispatcher { } } - private static Document parseHtml( - final yacyURL location, - final String mimeType, - final String documentCharset, - final InputStream sourceStream) throws IOException, ParserException { - - // make a scraper and transformer - final ScraperInputStream htmlFilter = new ScraperInputStream(sourceStream,documentCharset,location,null,false); - String charset = htmlFilter.detectCharset(); - if (charset == null) { - charset = documentCharset; - } else { - charset = patchCharsetEncoding(charset); - } - - if (!documentCharset.equalsIgnoreCase(charset)) { - theLogger.logInfo("Charset transformation needed from '" + documentCharset + "' to '" + charset + "' for URL = " + location.toNormalform(true, true)); - } - - Charset c; - try { - c = Charset.forName(charset); - } catch (IllegalCharsetNameException e) { - c = Charset.defaultCharset(); - } catch (UnsupportedCharsetException e) { - c = Charset.defaultCharset(); - } 
- - // parsing the content - final ContentScraper scraper = new ContentScraper(location); - final TransformerWriter writer = new TransformerWriter(null,null,scraper,null,false); - FileUtils.copy(htmlFilter, writer, c); - writer.close(); - //OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false); - //serverFileUtils.copy(sourceFile, hfos); - //hfos.close(); - if (writer.binarySuspect()) { - final String errorMsg = "Binary data found in resource"; - theLogger.logSevere("Unable to parse '" + location + "'. " + errorMsg); - throw new ParserException(errorMsg,location); - } - return transformScraper(location, mimeType, documentCharset, scraper); - } - public static Document transformScraper(final yacyURL location, final String mimeType, final String charSet, final ContentScraper scraper) { - final String[] sections = new String[scraper.getHeadlines(1).length + scraper.getHeadlines(2).length + scraper.getHeadlines(3).length + scraper.getHeadlines(4).length]; - int p = 0; - for (int i = 1; i <= 4; i++) for (int j = 0; j < scraper.getHeadlines(i).length; j++) sections[p++] = scraper.getHeadlines(i)[j]; - final Document ppd = new Document( - location, - mimeType, - charSet, - scraper.getContentLanguages(), - scraper.getKeywords(), - scraper.getTitle(), - scraper.getAuthor(), - sections, - scraper.getDescription(), - scraper.getText(), - scraper.getAnchors(), - scraper.getImages()); - //scraper.close(); - ppd.setFavicon(scraper.getFavicon()); - return ppd; - } + /** * This function is used to determine the parser class that should be used for a given diff --git a/source/de/anomic/document/parser/bzipParser.java b/source/de/anomic/document/parser/bzipParser.java index baefa1316..33de51192 100644 --- a/source/de/anomic/document/parser/bzipParser.java +++ b/source/de/anomic/document/parser/bzipParser.java @@ -53,21 +53,15 @@ public class bzipParser extends AbstractParser implements Parser { static { 
SUPPORTED_MIME_TYPES.put("application/x-bzip2",fileExtensions); SUPPORTED_MIME_TYPES.put("application/bzip2", fileExtensions); - SUPPORTED_MIME_TYPES.put("application/x-bz2", fileExtensions); + SUPPORTED_MIME_TYPES.put("application/x-bz2", fileExtensions); SUPPORTED_MIME_TYPES.put("application/octet-stream",fileExtensions); SUPPORTED_MIME_TYPES.put("application/x-bzip",fileExtensions); SUPPORTED_MIME_TYPES.put("application/x-compressed",fileExtensions); SUPPORTED_MIME_TYPES.put("application/x-stuffit",fileExtensions); - } - - /** - * a list of library names that are needed by this parser - * @see Parser#getLibxDependences() - */ - private static final String[] LIBX_DEPENDENCIES = new String[] {}; + } public bzipParser() { - super(LIBX_DEPENDENCIES); + super(); this.parserName = "Bzip 2 UNIX Compressed File Parser"; } diff --git a/source/de/anomic/document/parser/docParser.java b/source/de/anomic/document/parser/docParser.java index fe7aaf532..c76e96d65 100644 --- a/source/de/anomic/document/parser/docParser.java +++ b/source/de/anomic/document/parser/docParser.java @@ -58,16 +58,8 @@ public class docParser extends AbstractParser implements Parser { SUPPORTED_MIME_TYPES.put("application/x-msword","doc"); } - /** - * a list of library names that are needed by this parser - * @see Parser#getLibxDependences() - */ - private static final String[] LIBX_DEPENDENCIES = new String[] { - "tm-extractors-1.0.jar" - }; - public docParser() { - super(LIBX_DEPENDENCIES); + super(); this.parserName = "Word Document Parser"; } diff --git a/source/de/anomic/document/parser/gzipParser.java b/source/de/anomic/document/parser/gzipParser.java index 195dc95df..730ed7690 100644 --- a/source/de/anomic/document/parser/gzipParser.java +++ b/source/de/anomic/document/parser/gzipParser.java @@ -62,14 +62,8 @@ public class gzipParser extends AbstractParser implements Parser { SUPPORTED_MIME_TYPES.put("application/x-tar",fileExtensions); } - /** - * a list of library names that are needed by 
this parser - * @see Parser#getLibxDependences() - */ - private static final String[] LIBX_DEPENDENCIES = new String[] {}; - public gzipParser() { - super(LIBX_DEPENDENCIES); + super(); this.parserName = "GNU Zip Compressed Archive Parser"; } diff --git a/source/de/anomic/document/parser/html/ContentScraper.java b/source/de/anomic/document/parser/html/ContentScraper.java index ac78fe894..45644f9e0 100644 --- a/source/de/anomic/document/parser/html/ContentScraper.java +++ b/source/de/anomic/document/parser/html/ContentScraper.java @@ -45,7 +45,7 @@ import java.util.Properties; import javax.swing.event.EventListenerList; import de.anomic.crawler.HTTPLoader; -import de.anomic.document.ParserDispatcher; +import de.anomic.document.parser.htmlParser; import de.anomic.http.httpClient; import de.anomic.http.httpHeader; import de.anomic.http.httpRequestHeader; @@ -501,7 +501,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { // scrape document to look up charset final ScraperInputStream htmlFilter = new ScraperInputStream(new ByteArrayInputStream(page),"UTF-8",new yacyURL("http://localhost", null),null,false); - final String charset = ParserDispatcher.patchCharsetEncoding(htmlFilter.detectCharset()); + final String charset = htmlParser.patchCharsetEncoding(htmlFilter.detectCharset()); // scrape content final ContentScraper scraper = new ContentScraper(new yacyURL("http://localhost", null)); diff --git a/source/de/anomic/document/parser/htmlParser.java b/source/de/anomic/document/parser/htmlParser.java new file mode 100644 index 000000000..f441b8e21 --- /dev/null +++ b/source/de/anomic/document/parser/htmlParser.java @@ -0,0 +1,195 @@ +package de.anomic.document.parser; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.Charset; +import java.nio.charset.IllegalCharsetNameException; +import java.nio.charset.UnsupportedCharsetException; +import java.util.Hashtable; + +import de.anomic.document.AbstractParser; +import 
de.anomic.document.Document; +import de.anomic.document.Parser; +import de.anomic.document.ParserException; +import de.anomic.document.parser.html.ContentScraper; +import de.anomic.document.parser.html.ScraperInputStream; +import de.anomic.document.parser.html.TransformerWriter; +import de.anomic.kelondro.util.FileUtils; +import de.anomic.yacy.yacyURL; + +public class htmlParser extends AbstractParser implements Parser { + + /** + * a list of mime types that are supported by this parser class + * @see #getSupportedMimeTypes() + */ + public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable(); + static { + SUPPORTED_MIME_TYPES.put("application/xhtml+xml","htm,html,xhtml,php,asp"); + SUPPORTED_MIME_TYPES.put("text/html","htm,html,xhtml,php,asp"); + SUPPORTED_MIME_TYPES.put("text/plain","htm,html,xhtml,php,asp,txt"); + SUPPORTED_MIME_TYPES.put("text/sgml","htm,html,xhtml,php,asp,xml"); + } + + public htmlParser() { + super(); + this.parserName = "streaming html parser"; + } + + @Override + public Document parse( + final yacyURL location, + final String mimeType, + final String documentCharset, + final InputStream sourceStream) throws ParserException, InterruptedException { + + // make a scraper and transformer + final ScraperInputStream htmlFilter = new ScraperInputStream(sourceStream,documentCharset,location,null,false); + String charset = null; + try { + charset = htmlFilter.detectCharset(); + } catch (IOException e1) { + throw new ParserException("Charset error:" + e1.getMessage(), location); + } + if (charset == null) { + charset = documentCharset; + } else { + charset = patchCharsetEncoding(charset); + } + + if (!documentCharset.equalsIgnoreCase(charset)) { + theLogger.logInfo("Charset transformation needed from '" + documentCharset + "' to '" + charset + "' for URL = " + location.toNormalform(true, true)); + } + + Charset c; + try { + c = Charset.forName(charset); + } catch (IllegalCharsetNameException e) { + c = Charset.defaultCharset(); + } catch 
(UnsupportedCharsetException e) { + c = Charset.defaultCharset(); + } + + // parsing the content + final ContentScraper scraper = new ContentScraper(location); + final TransformerWriter writer = new TransformerWriter(null,null,scraper,null,false); + try { + FileUtils.copy(htmlFilter, writer, c); + writer.close(); + } catch (IOException e) { + throw new ParserException("IO error:" + e.getMessage(), location); + } + //OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false); + //serverFileUtils.copy(sourceFile, hfos); + //hfos.close(); + if (writer.binarySuspect()) { + final String errorMsg = "Binary data found in resource"; + theLogger.logSevere("Unable to parse '" + location + "'. " + errorMsg); + throw new ParserException(errorMsg,location); + } + return transformScraper(location, mimeType, documentCharset, scraper); + } + + private static Document transformScraper(final yacyURL location, final String mimeType, final String charSet, final ContentScraper scraper) { + final String[] sections = new String[scraper.getHeadlines(1).length + scraper.getHeadlines(2).length + scraper.getHeadlines(3).length + scraper.getHeadlines(4).length]; + int p = 0; + for (int i = 1; i <= 4; i++) for (int j = 0; j < scraper.getHeadlines(i).length; j++) sections[p++] = scraper.getHeadlines(i)[j]; + final Document ppd = new Document( + location, + mimeType, + charSet, + scraper.getContentLanguages(), + scraper.getKeywords(), + scraper.getTitle(), + scraper.getAuthor(), + sections, + scraper.getDescription(), + scraper.getText(), + scraper.getAnchors(), + scraper.getImages()); + //scraper.close(); + ppd.setFavicon(scraper.getFavicon()); + return ppd; + } + + + /** + * some html authors use wrong encoding names, either because they don't know exactly what they + * are doing or they produce a typo. Many times, the upper/downcase scheme of the name is fuzzy. + * This method patches wrong encoding names.
The correct names are taken from + * http://www.iana.org/assignments/character-sets + * @param encoding + * @return patched encoding name + */ + public static String patchCharsetEncoding(String encoding) { + + // return the system default encoding + if ((encoding == null) || (encoding.length() < 3)) return Charset.defaultCharset().name(); + + // trim encoding string + encoding = encoding.trim(); + + // fix upper/lowercase + encoding = encoding.toUpperCase(); + if (encoding.startsWith("SHIFT")) return "Shift_JIS"; + if (encoding.startsWith("BIG")) return "Big5"; + // all other names but such with "windows" use uppercase + if (encoding.startsWith("WINDOWS")) encoding = "windows" + encoding.substring(7); + if (encoding.startsWith("MACINTOSH")) encoding = "MacRoman"; + + // fix wrong fill characters + encoding = encoding.replaceAll("_", "-"); + + if (encoding.matches("GB[_-]?2312([-_]80)?")) return "GB2312"; + if (encoding.matches(".*UTF[-_]?8.*")) return "UTF-8"; + if (encoding.startsWith("US")) return "US-ASCII"; + if (encoding.startsWith("KOI")) return "KOI8-R"; + + // patch missing '-' + if (encoding.startsWith("windows") && encoding.length() > 7) { + final char c = encoding.charAt(7); + if ((c >= '0') && (c <= '9')) { + encoding = "windows-" + encoding.substring(7); + } + } + + if (encoding.startsWith("ISO")) { + // patch typos + if (encoding.length() > 3) { + final char c = encoding.charAt(3); + if ((c >= '0') && (c <= '9')) { + encoding = "ISO-" + encoding.substring(3); + } + } + if (encoding.length() > 8) { + final char c = encoding.charAt(8); + if ((c >= '0') && (c <= '9')) { + encoding = encoding.substring(0, 8) + "-" + encoding.substring(8); + } + } + } + + // patch wrong name + if (encoding.startsWith("ISO-8559")) { + // popular typo + encoding = "ISO-8859" + encoding.substring(8); + } + + // converting cp\d{4} -> windows-\d{4} + if (encoding.matches("CP([_-])?125[0-8]")) { + final char c = encoding.charAt(2); + if ((c >= '0') && (c <= '9')) { + encoding = 
"windows-" + encoding.substring(2); + } else { + encoding = "windows" + encoding.substring(2); + } + } + + return encoding; + } + + public Hashtable getSupportedMimeTypes() { + return SUPPORTED_MIME_TYPES; + } + +} diff --git a/source/de/anomic/document/parser/mimeTypeParser.java b/source/de/anomic/document/parser/mimeTypeParser.java index 58baa2f97..387d1cd7d 100644 --- a/source/de/anomic/document/parser/mimeTypeParser.java +++ b/source/de/anomic/document/parser/mimeTypeParser.java @@ -64,17 +64,6 @@ public class mimeTypeParser extends AbstractParser implements Parser { SUPPORTED_MIME_TYPES.put("application/x-compressed",""); } - /** - * a list of library names that are needed by this parser - * @see Parser#getLibxDependences() - */ - private static final String[] LIBX_DEPENDENCIES = new String[] { - "commons-logging-1.1.1.jar", - "jmimemagic-0.1.0.jar", - "jakarta-oro-2.0.7.jar", - "log4j-1.2.9.jar" - }; - /** * Helping structure used to detect loops in the mimeType detection * process @@ -82,7 +71,7 @@ public class mimeTypeParser extends AbstractParser implements Parser { private static Hashtable threadLoopDetection = new Hashtable(); public mimeTypeParser() { - super(LIBX_DEPENDENCIES); + super(); this.parserName = "MimeType Parser"; } diff --git a/source/de/anomic/document/parser/odtParser.java b/source/de/anomic/document/parser/odtParser.java index 1d47f8f38..289e8d397 100644 --- a/source/de/anomic/document/parser/odtParser.java +++ b/source/de/anomic/document/parser/odtParser.java @@ -69,14 +69,8 @@ public class odtParser extends AbstractParser implements Parser { SUPPORTED_MIME_TYPES.put("application/x-vnd.oasis.opendocument.text","odt"); } - /** - * a list of library names that are needed by this parser - * @see Parser#getLibxDependences() - */ - private static final String[] LIBX_DEPENDENCIES = new String[] {"odf_utils_05_11_29.jar"}; - public odtParser() { - super(LIBX_DEPENDENCIES); + super(); this.parserName = "OASIS OpenDocument V2 Text Document 
Parser"; } diff --git a/source/de/anomic/document/parser/pdfParser.java b/source/de/anomic/document/parser/pdfParser.java index 334f311cf..ce54e79da 100644 --- a/source/de/anomic/document/parser/pdfParser.java +++ b/source/de/anomic/document/parser/pdfParser.java @@ -56,7 +56,7 @@ public class pdfParser extends AbstractParser implements Parser { * a list of mime types that are supported by this parser class * @see #getSupportedMimeTypes() */ - public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable(); + public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable(); static { SUPPORTED_MIME_TYPES.put("application/pdf","pdf"); SUPPORTED_MIME_TYPES.put("application/x-pdf","pdf"); @@ -64,18 +64,10 @@ public class pdfParser extends AbstractParser implements Parser { SUPPORTED_MIME_TYPES.put("applications/vnd.pdf","pdf"); SUPPORTED_MIME_TYPES.put("text/pdf","pdf"); SUPPORTED_MIME_TYPES.put("text/x-pdf","pdf"); - } - - /** - * a list of library names that are needed by this parser - * @see Parser#getLibxDependences() - */ - private static final String[] LIBX_DEPENDENCIES = new String[] { - "PDFBox-0.7.3.jar", "FontBox-0.1.0-dev.jar", "bcprov-jdk14-139.jar", "bcmail-jdk14-139.jar" - }; + } public pdfParser() { - super(LIBX_DEPENDENCIES); + super(); this.parserName = "Acrobat Portable Document Parser"; } diff --git a/source/de/anomic/document/parser/pptParser.java b/source/de/anomic/document/parser/pptParser.java index 37e333ae0..59ca84ae1 100644 --- a/source/de/anomic/document/parser/pptParser.java +++ b/source/de/anomic/document/parser/pptParser.java @@ -47,7 +47,7 @@ public class pptParser extends AbstractParser implements Parser { */ public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable(); static final String fileExtensions = "ppt,pps"; - static { + static { SUPPORTED_MIME_TYPES.put("application/mspowerpoint",fileExtensions); SUPPORTED_MIME_TYPES.put("application/powerpoint",fileExtensions); 
SUPPORTED_MIME_TYPES.put("application/vnd.ms-powerpoint",fileExtensions); @@ -56,19 +56,10 @@ public class pptParser extends AbstractParser implements Parser { SUPPORTED_MIME_TYPES.put("application/vnd-mspowerpoint",fileExtensions); SUPPORTED_MIME_TYPES.put("application/x-powerpoint",fileExtensions); SUPPORTED_MIME_TYPES.put("application/x-m",fileExtensions); - } - - /** - * a list of library names that are needed by this parser - * @see Parser#getLibxDependences() - */ - private static final String[] LIBX_DEPENDENCIES = new String[] { - "poi-3.2-FINAL-20081019.jar", - "poi-scratchpad-3.2-FINAL-20081019.jar" - }; + } public pptParser(){ - super(LIBX_DEPENDENCIES); + super(); this.parserName = "Microsoft Powerpoint Parser"; } diff --git a/source/de/anomic/document/parser/psParser.java b/source/de/anomic/document/parser/psParser.java index 0efae22b6..cd84998f3 100644 --- a/source/de/anomic/document/parser/psParser.java +++ b/source/de/anomic/document/parser/psParser.java @@ -55,20 +55,14 @@ public class psParser extends AbstractParser implements Parser { SUPPORTED_MIME_TYPES.put("application/x-postscript","ps"); SUPPORTED_MIME_TYPES.put("application/x-ps","ps"); SUPPORTED_MIME_TYPES.put("application/x-postscript-not-eps","ps"); - } - - /** - * a list of library names that are needed by this parser - * @see Parser#getLibxDependences() - */ - private static final String[] LIBX_DEPENDENCIES = new String[] {}; + } private final static Object modeScan = new Object(); private static boolean modeScanDone = false; private static String parserMode = "java"; public psParser() { - super(LIBX_DEPENDENCIES); + super(); this.parserName = "PostScript Document Parser"; if (!modeScanDone) synchronized (modeScan) { if (testForPs2Ascii()) parserMode = "ps2ascii"; diff --git a/source/de/anomic/document/parser/rpmParser.java b/source/de/anomic/document/parser/rpmParser.java index aa156e8e1..6039cd23e 100644 --- a/source/de/anomic/document/parser/rpmParser.java +++ 
b/source/de/anomic/document/parser/rpmParser.java @@ -62,16 +62,10 @@ public class rpmParser extends AbstractParser implements Parser { SUPPORTED_MIME_TYPES.put("application/x-rpm","rpm"); SUPPORTED_MIME_TYPES.put("application/x-redhat packet manager","rpm"); SUPPORTED_MIME_TYPES.put("application/x-redhat-package-manager","rpm"); - } - - /** - * a list of library names that are needed by this parser - * @see Parser#getLibxDependences() - */ - private static final String[] LIBX_DEPENDENCIES = new String[] {"jrpm-head.jar"}; + } public rpmParser() { - super(LIBX_DEPENDENCIES); + super(); this.parserName = "rpm Parser"; } diff --git a/source/de/anomic/document/parser/rssParser.java b/source/de/anomic/document/parser/rssParser.java index 552e65ca0..9cacdcc40 100644 --- a/source/de/anomic/document/parser/rssParser.java +++ b/source/de/anomic/document/parser/rssParser.java @@ -66,16 +66,10 @@ public class rssParser extends AbstractParser implements Parser { SUPPORTED_MIME_TYPES.put("application/rdf+xml",fileExtensions); SUPPORTED_MIME_TYPES.put("application/rss+xml",fileExtensions); SUPPORTED_MIME_TYPES.put("application/atom+xml",fileExtensions); - } - - /** - * a list of library names that are needed by this parser - * @see Parser#getLibxDependences() - */ - private static final String[] LIBX_DEPENDENCIES = new String[] {}; + } public rssParser() { - super(LIBX_DEPENDENCIES); + super(); this.parserName = "Rich Site Summary/Atom Feed Parser"; } diff --git a/source/de/anomic/document/parser/rtfParser.java b/source/de/anomic/document/parser/rtfParser.java index 35346b7a6..b8d0e83a3 100644 --- a/source/de/anomic/document/parser/rtfParser.java +++ b/source/de/anomic/document/parser/rtfParser.java @@ -55,15 +55,9 @@ public class rtfParser extends AbstractParser implements Parser { SUPPORTED_MIME_TYPES.put("application/doc","rtf"); SUPPORTED_MIME_TYPES.put("application/x-soffice","rtf"); } - - /** - * a list of library names that are needed by this parser - * @see 
Parser#getLibxDependences() - */ - private static final String[] LIBX_DEPENDENCIES = new String[] {}; - + public rtfParser() { - super(LIBX_DEPENDENCIES); + super(); this.parserName = "Rich Text Format Parser"; } diff --git a/source/de/anomic/document/parser/sevenzipParser.java b/source/de/anomic/document/parser/sevenzipParser.java index 2c3b4e711..674a7b63b 100644 --- a/source/de/anomic/document/parser/sevenzipParser.java +++ b/source/de/anomic/document/parser/sevenzipParser.java @@ -59,16 +59,10 @@ public class sevenzipParser extends AbstractParser implements Parser { public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable(); static { SUPPORTED_MIME_TYPES.put("application/x-7z-compressed", "7z"); - } - - /** - * a list of library names that are needed by this parser - * @see Parser#getLibxDependences() - */ - private static final String[] LIBX_DEPENDENCIES = new String[] { "J7Zip-modified.jar" }; + } public sevenzipParser() { - super(LIBX_DEPENDENCIES); + super(); super.parserName = "7zip Archive Parser"; } diff --git a/source/de/anomic/document/parser/swfParser.java b/source/de/anomic/document/parser/swfParser.java index 5b2dfefe0..688c53387 100644 --- a/source/de/anomic/document/parser/swfParser.java +++ b/source/de/anomic/document/parser/swfParser.java @@ -52,14 +52,8 @@ public class swfParser extends AbstractParser implements Parser { SUPPORTED_MIME_TYPES.put("image/vnd.rn-realflash","swf"); } - /** - * a list of library names that are needed by this parser - * @see Parser#getLibxDependences() - */ - private static final String[] LIBX_DEPENDENCIES = new String[] {"webcat-0.1-swf.jar"}; - public swfParser() { - super(LIBX_DEPENDENCIES); + super(); this.parserName = "Adobe Flash Parser"; } diff --git a/source/de/anomic/document/parser/tarParser.java b/source/de/anomic/document/parser/tarParser.java index 6ef630e01..bc06a4066 100644 --- a/source/de/anomic/document/parser/tarParser.java +++ b/source/de/anomic/document/parser/tarParser.java @@ -69,16 
+69,8 @@ public class tarParser extends AbstractParser implements Parser { SUPPORTED_MIME_TYPES.put("application/x-compressed","tar"); } - /** - * a list of library names that are needed by this parser - * @see Parser#getLibxDependences() - */ - private static final String[] LIBX_DEPENDENCIES = new String[] { -// "tar.jar" - }; - public tarParser() { - super(LIBX_DEPENDENCIES); + super(); this.parserName = "Tape Archive File Parser"; } diff --git a/source/de/anomic/document/parser/vcfParser.java b/source/de/anomic/document/parser/vcfParser.java index ef5f054c7..c7f89c978 100644 --- a/source/de/anomic/document/parser/vcfParser.java +++ b/source/de/anomic/document/parser/vcfParser.java @@ -63,23 +63,17 @@ public class vcfParser extends AbstractParser implements Parser { */ public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable(); static { - SUPPORTED_MIME_TYPES.put("text/x-vcard","vcf"); + SUPPORTED_MIME_TYPES.put("text/x-vcard","vcf"); SUPPORTED_MIME_TYPES.put("application/vcard","vcf"); SUPPORTED_MIME_TYPES.put("text/anytext","vcf"); SUPPORTED_MIME_TYPES.put("text/directory","vcf"); SUPPORTED_MIME_TYPES.put("application/x-versit","vcf"); SUPPORTED_MIME_TYPES.put("text/x-versit","vcf"); SUPPORTED_MIME_TYPES.put("text/x-vcalendar","vcf"); - } - - /** - * a list of library names that are needed by this parser - * @see Parser#getLibxDependences() - */ - private static final String[] LIBX_DEPENDENCIES = new String[] {}; + } public vcfParser() { - super(LIBX_DEPENDENCIES); + super(); this.parserName = "vCard Parser"; } diff --git a/source/de/anomic/document/parser/vsdParser.java b/source/de/anomic/document/parser/vsdParser.java index 5072fd305..01ac2c32e 100644 --- a/source/de/anomic/document/parser/vsdParser.java +++ b/source/de/anomic/document/parser/vsdParser.java @@ -56,17 +56,8 @@ public class vsdParser extends AbstractParser implements Parser { SUPPORTED_MIME_TYPES.put("zz-application/zz-winassoc-vsd","vsd"); } - /** - * a list of library names that are 
needed by this parser - * @see Parser#getLibxDependences() - */ - private static final String[] LIBX_DEPENDENCIES = new String[] { - "poi-3.2-FINAL-20081019.jar", - "poi-scratchpad-3.2-FINAL-20081019.jar", - }; - public vsdParser() { - super(LIBX_DEPENDENCIES); + super(); this.parserName = "Microsoft Visio Parser"; } diff --git a/source/de/anomic/document/parser/xlsParser.java b/source/de/anomic/document/parser/xlsParser.java index 62ff94943..afb73c463 100644 --- a/source/de/anomic/document/parser/xlsParser.java +++ b/source/de/anomic/document/parser/xlsParser.java @@ -69,17 +69,8 @@ public class xlsParser extends AbstractParser implements Parser, HSSFListener { SUPPORTED_MIME_TYPES.put("application/xls","xls"); } - /** - * a list of library names that are needed by this parser - * @see Parser#getLibxDependences() - */ - private static final String[] LIBX_DEPENDENCIES = new String[] { - "poi-3.2-FINAL-20081019.jar", - "poi-scratchpad-3.2-FINAL-20081019.jar" - }; - public xlsParser(){ - super(LIBX_DEPENDENCIES); + super(); this.parserName = "Microsoft Excel Parser"; } diff --git a/source/de/anomic/document/parser/zipParser.java b/source/de/anomic/document/parser/zipParser.java index 013abc0dc..59cbe7b8a 100644 --- a/source/de/anomic/document/parser/zipParser.java +++ b/source/de/anomic/document/parser/zipParser.java @@ -69,14 +69,8 @@ public class zipParser extends AbstractParser implements Parser { SUPPORTED_MIME_TYPES.put("application/java-archive","jar"); } - /** - * a list of library names that are needed by this parser - * @see Parser#getLibxDependences() - */ - private static final String[] LIBX_DEPENDENCIES = new String[] {}; - public zipParser() { - super(LIBX_DEPENDENCIES); + super(); this.parserName = "Compressed Archive File Parser"; } diff --git a/source/de/anomic/http/httpdFileHandler.java b/source/de/anomic/http/httpdFileHandler.java index 3de08bd34..cf0d969d6 100644 --- a/source/de/anomic/http/httpdFileHandler.java +++ 
b/source/de/anomic/http/httpdFileHandler.java @@ -81,6 +81,7 @@ import java.util.concurrent.ConcurrentHashMap; import java.util.zip.GZIPOutputStream; import de.anomic.document.ParserDispatcher; +import de.anomic.document.parser.htmlParser; import de.anomic.document.parser.html.ContentScraper; import de.anomic.document.parser.html.ScraperInputStream; import de.anomic.kelondro.util.ByteBuffer; @@ -860,7 +861,7 @@ public final class httpdFileHandler { fis.mark(1000); // scrape document to look up charset final ScraperInputStream htmlFilter = new ScraperInputStream(fis,"UTF-8",new yacyURL("http://localhost", null),null,false); - final String charset = ParserDispatcher.patchCharsetEncoding(htmlFilter.detectCharset()); + final String charset = htmlParser.patchCharsetEncoding(htmlFilter.detectCharset()); if(charset != null) mimeType = mimeType + "; charset="+charset; // reset position diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 0b43a0265..1a01ff4e5 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -523,7 +523,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch 0) && (sb.getConfig("parseableMimeTypes.CRAWLER", "").length() == 0)) { - sb.setConfig("parseableMimeTypes.CRAWLER", value); - sb.setConfig("parseableMimeTypes.PROXY", value); - sb.setConfig("parseableMimeTypes.URLREDIRECTOR", value); - sb.setConfig("parseableMimeTypes.ICAP", value); - } //Locales in DATA, because DATA must be writable, htroot not. if(sb.getConfig("locale.translated_html", "DATA/LOCALE/htroot").equals("htroot/locale")){ sb.setConfig("locale.translated_html", "DATA/LOCALE/htroot");