From af10f729df9920961e40d4304280c8f900e1192a Mon Sep 17 00:00:00 2001 From: orbiter Date: Thu, 22 Nov 2007 01:34:29 +0000 Subject: [PATCH] fixed image search and favicon loading git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4225 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- source/de/anomic/http/httpdProxyHandler.java | 6 +- .../plasma/crawler/plasmaCrawlQueues.java | 5 +- .../plasma/crawler/plasmaHTTPLoader.java | 10 +- .../plasma/crawler/plasmaProtocolLoader.java | 8 +- source/de/anomic/plasma/plasmaCrawlEntry.java | 6 +- source/de/anomic/plasma/plasmaParser.java | 162 ++++++------------ .../de/anomic/plasma/plasmaParserConfig.java | 12 +- .../de/anomic/plasma/plasmaSwitchboard.java | 16 +- yacy.init | 5 +- 9 files changed, 89 insertions(+), 141 deletions(-) diff --git a/source/de/anomic/http/httpdProxyHandler.java b/source/de/anomic/http/httpdProxyHandler.java index 1fe76fb47..c51f8bfcf 100644 --- a/source/de/anomic/http/httpdProxyHandler.java +++ b/source/de/anomic/http/httpdProxyHandler.java @@ -581,7 +581,7 @@ public final class httpdProxyHandler { // handle file types and make (possibly transforming) output stream if ( (!transformer.isIdentityTransformer()) && - (plasmaParser.supportedRealTimeContent(url,res.responseHeader.mime())) + (plasmaParser.supportedHTMLContent(url,res.responseHeader.mime())) ) { // make a transformer theLogger.logFine("create transformer for URL " + url); @@ -794,8 +794,8 @@ public final class httpdProxyHandler { // make a transformer if (( !transformer.isIdentityTransformer()) && - (ext == null || !plasmaParser.supportedRealtimeFileExtContains(url)) && - (plasmaParser.realtimeParsableMimeTypesContains(cachedResponseHeader.mime()))) { + (ext == null || !plasmaParser.supportedHTMLFileExtContains(url)) && + (plasmaParser.HTMLParsableMimeTypesContains(cachedResponseHeader.mime()))) { hfos = new htmlFilterWriter((chunkedOut != null) ? chunkedOut : respond, charSet, null, transformer, (ext.length() == 0)); } else { hfos = (gzippedOut != null) ? gzippedOut : ((chunkedOut != null)? chunkedOut : respond); diff --git a/source/de/anomic/plasma/crawler/plasmaCrawlQueues.java b/source/de/anomic/plasma/crawler/plasmaCrawlQueues.java index f72d3bce8..acc3128e0 100644 --- a/source/de/anomic/plasma/crawler/plasmaCrawlQueues.java +++ b/source/de/anomic/plasma/crawler/plasmaCrawlQueues.java @@ -39,6 +39,7 @@ import de.anomic.plasma.plasmaCrawlNURL; import de.anomic.plasma.plasmaCrawlProfile; import de.anomic.plasma.plasmaCrawlZURL; import de.anomic.plasma.plasmaHTCache; +import de.anomic.plasma.plasmaParser; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.server.logging.serverLog; import de.anomic.tools.crypt; @@ -518,7 +519,7 @@ public class plasmaCrawlQueues { 0, 0); - return loader.load(centry); + return loader.load(centry, (forText) ? plasmaParser.PARSER_MODE_CRAWLER : plasmaParser.PARSER_MODE_IMAGE); } public int size() { @@ -547,7 +548,7 @@ public class plasmaCrawlQueues { } else { // starting a load from the internet this.entry.setStatus("worker-loading"); - String result = loader.process(this.entry); + String result = loader.process(this.entry, plasmaParser.PARSER_MODE_CRAWLER); if (result != null) { plasmaCrawlZURL.Entry eentry = errorURL.newEntry(this.entry.url(), "cannot load: " + result); eentry.store(); diff --git a/source/de/anomic/plasma/crawler/plasmaHTTPLoader.java b/source/de/anomic/plasma/crawler/plasmaHTTPLoader.java index 4c0309bf4..078051f96 100644 --- a/source/de/anomic/plasma/crawler/plasmaHTTPLoader.java +++ b/source/de/anomic/plasma/crawler/plasmaHTTPLoader.java @@ -130,11 +130,11 @@ public final class plasmaHTTPLoader { ); } - public plasmaHTCache.Entry load(plasmaCrawlEntry entry) { - return load(entry, DEFAULT_CRAWLING_RETRY_COUNT); + public plasmaHTCache.Entry load(plasmaCrawlEntry entry, String parserMode) { + return load(entry, parserMode, DEFAULT_CRAWLING_RETRY_COUNT); } - private plasmaHTCache.Entry load(plasmaCrawlEntry entry, int retryCount) { + private plasmaHTCache.Entry load(plasmaCrawlEntry entry, String parserMode, int retryCount) { if (retryCount < 0) { this.log.logInfo("Redirection counter exceeded for URL " + entry.url().toString() + ". Processing aborted."); @@ -212,7 +212,7 @@ public final class plasmaHTTPLoader { // request has been placed and result has been returned. work off response File cacheFile = plasmaHTCache.getCachePath(entry.url()); try { - if (plasmaParser.supportedContent(plasmaParser.PARSER_MODE_CRAWLER,entry.url(),res.responseHeader.mime())) { + if (plasmaParser.supportedContent(parserMode, entry.url(), res.responseHeader.mime())) { // delete old content if (cacheFile.isFile()) { plasmaHTCache.deleteURLfromCache(entry.url()); @@ -310,7 +310,7 @@ public final class plasmaHTTPLoader { // retry crawling with new url entry.redirectURL(redirectionUrl); - return load(entry, retryCount - 1); + return load(entry, plasmaParser.PARSER_MODE_URLREDIRECTOR, retryCount - 1); } } else { diff --git a/source/de/anomic/plasma/crawler/plasmaProtocolLoader.java b/source/de/anomic/plasma/crawler/plasmaProtocolLoader.java index 1cbb3645c..a184213f2 100644 --- a/source/de/anomic/plasma/crawler/plasmaProtocolLoader.java +++ b/source/de/anomic/plasma/crawler/plasmaProtocolLoader.java @@ -61,23 +61,23 @@ public final class plasmaProtocolLoader { return (HashSet) this.supportedProtocols.clone(); } - public plasmaHTCache.Entry load(plasmaCrawlEntry entry) { + public plasmaHTCache.Entry load(plasmaCrawlEntry entry, String parserMode) { // getting the protocol of the next URL String protocol = entry.url().getProtocol(); - if ((protocol.equals("http") || (protocol.equals("https")))) return httpLoader.load(entry); + if ((protocol.equals("http") || (protocol.equals("https")))) return httpLoader.load(entry, parserMode); if (protocol.equals("ftp")) return ftpLoader.load(entry); this.log.logWarning("Unsupported protocol '" + protocol + "' in url " + entry.url()); return null; } - public String process(plasmaCrawlEntry entry) { + public String process(plasmaCrawlEntry entry, String parserMode) { // load a resource, store it to htcache and push queue entry to switchboard queue // returns null if everything went fine, a fail reason string if a problem occurred plasmaHTCache.Entry h; try { - h = load(entry); + h = load(entry, parserMode); entry.setStatus("loaded"); if (h == null) return "load failed"; boolean stored = sb.htEntryStoreProcess(h); diff --git a/source/de/anomic/plasma/plasmaCrawlEntry.java b/source/de/anomic/plasma/plasmaCrawlEntry.java index 712281286..efc53bf79 100644 --- a/source/de/anomic/plasma/plasmaCrawlEntry.java +++ b/source/de/anomic/plasma/plasmaCrawlEntry.java @@ -1,6 +1,6 @@ -// plasmaCrawlBalancerEntry.java -// (C) 2007 by Michael Peter Christen; mc@anomic.de, Frankfurt a. M., Germany -// first published 14.03.2007 on http://www.anomic.de +// plasmaCrawlEntry.java +// (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany +// first published 14.03.2007 on http://yacy.net // // This is a part of YaCy, a peer-to-peer based web search engine // diff --git a/source/de/anomic/plasma/plasmaParser.java b/source/de/anomic/plasma/plasmaParser.java index 6efd711dc..8aa48d51d 100644 --- a/source/de/anomic/plasma/plasmaParser.java +++ b/source/de/anomic/plasma/plasmaParser.java @@ -1,12 +1,16 @@ // plasmaParser.java -// ------------------------ -// part of YaCy -// (C) by Michael Peter Christen; mc@anomic.de -// first published on http://www.anomic.de -// Frankfurt, Germany, 2005 +// (C) 2005 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany +// first published in january 2005 on http://yacy.net +// with contributions 02.05.2005 by Martin Thelian // -// last major change: 02.05.2005 by Martin Thelian +// This is a part of YaCy, a peer-to-peer based web search engine // +// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ +// $LastChangedRevision: 1986 $ +// $LastChangedBy: orbiter $ +// +// LICENSE +// // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation; either version 2 of the License, or @@ -20,27 +24,6 @@ // You should have received a copy of the GNU General Public License // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -// -// Using this software in any meaning (reading, learning, copying, compiling, -// running) means that you agree that the Author(s) is (are) not responsible -// for cost, loss of data or any harm that may be caused directly or indirectly -// by usage of this softare or this documentation. The usage of this software -// is on your own risk. The installation and usage (starting/running) of this -// software may allow other people or application to access your computer and -// any attached devices and is highly dependent on the configuration of the -// software which must be done by the user of the software; the author(s) is -// (are) also not responsible for proper configuration and usage of the -// software, even if provoked by documentation provided together with -// the software. -// -// Any changes to this file according to the GPL as documented in the file -// gpl.txt aside this file in the shipment you received can be done to the -// lines that follows this copyright notice here, but changes must not be -// done inside the copyright notive above. A re-distribution must contain -// the intact and unchanged copyright notice. -// Contributions and changes to the program code must be marked as such. - -// compile: javac -classpath lib/commons-collections.jar:lib/commons-pool.jar -sourcepath source source/de/anomic/plasma/plasmaParser.java package de.anomic.plasma; @@ -78,15 +61,17 @@ import de.anomic.server.logging.serverLog; import de.anomic.yacy.yacyURL; public final class plasmaParser { - public static final String PARSER_MODE_PROXY = "PROXY"; - public static final String PARSER_MODE_CRAWLER = "CRAWLER"; + public static final String PARSER_MODE_PROXY = "PROXY"; + public static final String PARSER_MODE_CRAWLER = "CRAWLER"; public static final String PARSER_MODE_URLREDIRECTOR = "URLREDIRECTOR"; - public static final String PARSER_MODE_ICAP = "ICAP"; + public static final String PARSER_MODE_ICAP = "ICAP"; + public static final String PARSER_MODE_IMAGE = "IMAGE"; public static final HashSet PARSER_MODE = new HashSet(Arrays.asList(new String[]{ PARSER_MODE_PROXY, PARSER_MODE_CRAWLER, PARSER_MODE_ICAP, - PARSER_MODE_URLREDIRECTOR + PARSER_MODE_URLREDIRECTOR, + PARSER_MODE_IMAGE })); private static final HashMap parserConfigList = new HashMap(); @@ -98,15 +83,10 @@ public final class plasmaParser { public static final Properties availableParserList = new Properties(); /** - * A list of file extensions that are supported by the html-parser and can - * be parsed in realtime. + * A list of file extensions and mime types that are supported by the html-parser */ - public static final HashSet supportedRealtimeFileExt = new HashSet(); - - /** - * A list of mimeTypes that can be parsed in Realtime (on the fly) - */ - public static final HashSet realtimeParsableMimeTypes = new HashSet(); + public static final HashSet supportedHTMLFileExt = new HashSet(); + public static final HashSet supportedHTMLMimeTypes = new HashSet(); private static final Properties mimeTypeLookupByFileExt = new Properties(); static { @@ -194,34 +174,24 @@ public final class plasmaParser { } /** - * This function is used to initialize the realtimeParsableMimeTypes List. + * This function is used to initialize the HTMLParsableMimeTypes List. * This list contains a list of mimeTypes that can be parsed in realtime by * the yacy html-Parser - * @param realtimeParsableMimeTypes a list of mimetypes that can be parsed by the + * @param htmlParsableMimeTypes a list of mimetypes that can be parsed by the * yacy html parser */ - public static void initRealtimeParsableMimeTypes(String realtimeParsableMimeTypes) { + public static void initHTMLParsableMimeTypes(String htmlParsableMimeTypes) { LinkedList mimeTypes = new LinkedList(); - if ((realtimeParsableMimeTypes == null) || (realtimeParsableMimeTypes.length() == 0)) { - // Nothing todo here - } else { - String[] realtimeParsableMimeTypeList = realtimeParsableMimeTypes.split(","); - for (int i = 0; i < realtimeParsableMimeTypeList.length; i++) mimeTypes.add(realtimeParsableMimeTypeList[i].toLowerCase().trim()); + if ((htmlParsableMimeTypes == null) || (htmlParsableMimeTypes.length() == 0)) { + return; } - initRealtimeParsableMimeTypes(mimeTypes); - } - - /** - * This function is used to initialize the realtimeParsableMimeTypes List. - * This list contains a list of mimeTypes that can be parsed in realtime by - * the yacy html-Parser - * @param realtimeParsableMimeTypes a list of mimetypes that can be parsed by the - * yacy html parser - */ - public static void initRealtimeParsableMimeTypes(List mimeTypesList) { - synchronized (realtimeParsableMimeTypes) { - realtimeParsableMimeTypes.clear(); - realtimeParsableMimeTypes.addAll(mimeTypesList); + String[] realtimeParsableMimeTypeList = htmlParsableMimeTypes.split(","); + for (int i = 0; i < realtimeParsableMimeTypeList.length; i++) { + mimeTypes.add(realtimeParsableMimeTypeList[i].toLowerCase().trim()); + } + synchronized (supportedHTMLMimeTypes) { + supportedHTMLMimeTypes.clear(); + supportedHTMLMimeTypes.addAll(mimeTypes); } } @@ -277,32 +247,31 @@ public final class plasmaParser { } } - public static void initSupportedRealtimeFileExt(List supportedRealtimeFileExtList) { - synchronized (supportedRealtimeFileExt) { - supportedRealtimeFileExt.clear(); - supportedRealtimeFileExt.addAll(supportedRealtimeFileExtList); + public static void initSupportedHTMLFileExt(List supportedRealtimeFileExtList) { + synchronized (supportedHTMLFileExt) { + supportedHTMLFileExt.clear(); + supportedHTMLFileExt.addAll(supportedRealtimeFileExtList); } } - public static boolean realtimeParsableMimeTypesContains(String mimeType) { - mimeType = getRealMimeType(mimeType); - synchronized (realtimeParsableMimeTypes) { - return realtimeParsableMimeTypes.contains(mimeType); + public static boolean HTMLParsableMimeTypesContains(String mimeType) { + mimeType = normalizeMimeType(mimeType); + synchronized (supportedHTMLMimeTypes) { + return supportedHTMLMimeTypes.contains(mimeType); } } - public static boolean supportedRealTimeContent(yacyURL url, String mimeType) { - return realtimeParsableMimeTypesContains(mimeType) && supportedRealtimeFileExtContains(url); + public static boolean supportedHTMLContent(yacyURL url, String mimeType) { + return HTMLParsableMimeTypesContains(mimeType) && supportedHTMLFileExtContains(url); } - public static boolean supportedRealtimeFileExtContains(yacyURL url) { + public static boolean supportedHTMLFileExtContains(yacyURL url) { String fileExt = getFileExt(url); - synchronized (supportedRealtimeFileExt) { - return supportedRealtimeFileExt.contains(fileExt); + synchronized (supportedHTMLFileExt) { + return supportedHTMLFileExt.contains(fileExt); } } - public static String getFileExt(yacyURL url) { // getting the file path String name = url.getPath(); @@ -319,13 +288,12 @@ public final class plasmaParser { return name.substring(p + 1); } - public static boolean mediaExtContains(String mediaExt) { if (mediaExt == null) return false; mediaExt = mediaExt.trim().toLowerCase(); - synchronized (supportedRealtimeFileExt) { - if (supportedRealtimeFileExt.contains(mediaExt)) return false; + synchronized (supportedHTMLFileExt) { + if (supportedHTMLFileExt.contains(mediaExt)) return false; } if (supportedFileExtContains(mediaExt)) return false; @@ -407,7 +375,7 @@ public final class plasmaParser { return encoding; } - public static String getRealMimeType(String mimeType) { + public static String normalizeMimeType(String mimeType) { //if (mimeType == null) doMimeTypeAnalysis if (mimeType == null) mimeType = "application/octet-stream"; mimeType = mimeType.trim().toLowerCase(); @@ -616,7 +584,7 @@ public final class plasmaParser { this.theLogger.logFine("Parsing '" + location + "' from stream"); // getting the mimetype of the document - mimeType = getRealMimeType(theMimeType); + mimeType = normalizeMimeType(theMimeType); // getting the file extension of the document String fileExt = getFileExt(location); @@ -646,7 +614,7 @@ public final class plasmaParser { theParser.setContentLength(contentLength); // parse the resource doc = theParser.parse(location, mimeType,documentCharset,sourceStream); - } else if (realtimeParsableMimeTypesContains(mimeType)) { + } else if (HTMLParsableMimeTypesContains(mimeType)) { doc = parseHtml(location, mimeType, documentCharset, sourceStream); } else { String errorMsg = "No parser available to parse mimetype '" + mimeType + "'"; @@ -749,7 +717,7 @@ public final class plasmaParser { */ private Parser getParser(String mimeType) { - mimeType = getRealMimeType(mimeType); + mimeType = normalizeMimeType(mimeType); try { // determining the proper parser class name for the mimeType @@ -782,17 +750,6 @@ public final class plasmaParser { } - /* - public static String urlNormalform(URL url) { - if (url == null) return null; - return urlNormalform(url.toString()); - } - - public static String urlNormalform(String us) { - return htmlFilterContentScraper.urlNormalform(us); - } - */ - static Map allReflinks(Set links) { // links is either a Set of Strings (with urls) or htmlFilterImageEntries // we find all links that are part of a reference inside a url @@ -909,11 +866,8 @@ public final class plasmaParser { // creating a plasma parser plasmaParser theParser = new plasmaParser(); - // configuring the realtime parsable mimeTypes - plasmaParser.initRealtimeParsableMimeTypes("application/xhtml+xml,text/html,text/plain,text/sgml"); - - // configure all other supported mimeTypes - plasmaParser.enableAllParsers(PARSER_MODE_PROXY); + // configuring the html parsable mimeTypes + plasmaParser.initHTMLParsableMimeTypes("application/xhtml+xml,text/html,text/plain,text/sgml"); // parsing the content plasmaParserDocument document = null; @@ -955,17 +909,6 @@ public final class plasmaParser { } } - private static void enableAllParsers(String parserMode) { - if (!PARSER_MODE.contains(parserMode)) throw new IllegalArgumentException(); - - plasmaParserConfig config = (plasmaParserConfig) parserConfigList.get(parserMode); - if (config == null) { - config = new plasmaParserConfig(parserMode); - parserConfigList.put(parserMode, config); - } - config.enableAllParsers(); - } - public static boolean supportedContent(yacyURL url, String mimeType) { if (url == null) throw new NullPointerException(); @@ -984,6 +927,7 @@ public final class plasmaParser { if (!PARSER_MODE.contains(parserMode)) throw new IllegalArgumentException(); if (url == null) throw new NullPointerException(); + if (parserMode.equals(PARSER_MODE_IMAGE)) return true; plasmaParserConfig config = (plasmaParserConfig) parserConfigList.get(parserMode); return (config == null)?false:config.supportedContent(url, mimeType); } diff --git a/source/de/anomic/plasma/plasmaParserConfig.java b/source/de/anomic/plasma/plasmaParserConfig.java index da96bb52b..0b3ffad27 100644 --- a/source/de/anomic/plasma/plasmaParserConfig.java +++ b/source/de/anomic/plasma/plasmaParserConfig.java @@ -88,7 +88,7 @@ public class plasmaParserConfig { public boolean supportedContent(yacyURL url, String mimeType) { // TODO: we need some exceptions here to index URLs like this // http://www.musicabona.com/respighi/12668/cd/index.html.fr - mimeType = plasmaParser.getRealMimeType(mimeType); + mimeType = plasmaParser.normalizeMimeType(mimeType); if ( mimeType.equals("text/html") || mimeType.equals("application/xhtml+xml") || @@ -100,10 +100,10 @@ public class plasmaParserConfig { } public boolean supportedMimeTypesContains(String mimeType) { - mimeType = plasmaParser.getRealMimeType(mimeType); + mimeType = plasmaParser.normalizeMimeType(mimeType); - synchronized (plasmaParser.realtimeParsableMimeTypes) { - if (plasmaParser.realtimeParsableMimeTypes.contains(mimeType)) return true; + synchronized (plasmaParser.supportedHTMLMimeTypes) { + if (plasmaParser.supportedHTMLMimeTypes.contains(mimeType)) return true; } synchronized (this.enabledParserList) { @@ -124,8 +124,8 @@ public class plasmaParserConfig { if (fileExt == null) return false; fileExt = fileExt.trim().toLowerCase(); - synchronized (plasmaParser.supportedRealtimeFileExt) { - if (plasmaParser.supportedRealtimeFileExt.contains(fileExt)) return true; + synchronized (plasmaParser.supportedHTMLFileExt) { + if (plasmaParser.supportedHTMLFileExt.contains(fileExt)) return true; } synchronized(this.supportedFileExt) { diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 688737558..0f9aef309 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -563,11 +563,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // Parser settings ////////////////////////////////////////////////////////////////////////////////////////////// - public static final String PARSER_MIMETYPES_REALTIME = "parseableRealtimeMimeTypes"; + public static final String PARSER_MIMETYPES_HTML = "parseableMimeTypes.HTML"; public static final String PARSER_MIMETYPES_PROXY = "parseableMimeTypes.PROXY"; public static final String PARSER_MIMETYPES_CRAWLER = "parseableMimeTypes.CRAWLER"; public static final String PARSER_MIMETYPES_ICAP = "parseableMimeTypes.ICAP"; public static final String PARSER_MIMETYPES_URLREDIRECTOR = "parseableMimeTypes.URLREDIRECTOR"; + public static final String PARSER_MIMETYPES_IMAGE = "parseableMimeTypes.IMAGE"; public static final String PARSER_MEDIA_EXT = "mediaExt"; public static final String PARSER_MEDIA_EXT_PARSEABLE = "parseableExt"; @@ -1180,15 +1181,16 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // define an extension-blacklist log.logConfig("Parser: Initializing Extension Mappings for Media/Parser"); plasmaParser.initMediaExt(plasmaParser.extString2extList(getConfig(PARSER_MEDIA_EXT,""))); - plasmaParser.initSupportedRealtimeFileExt(plasmaParser.extString2extList(getConfig(PARSER_MEDIA_EXT_PARSEABLE,""))); + plasmaParser.initSupportedHTMLFileExt(plasmaParser.extString2extList(getConfig(PARSER_MEDIA_EXT_PARSEABLE,""))); // define a realtime parsable mimetype list log.logConfig("Parser: Initializing Mime Types"); - plasmaParser.initRealtimeParsableMimeTypes(getConfig(PARSER_MIMETYPES_REALTIME,"application/xhtml+xml,text/html,text/plain")); - plasmaParser.initParseableMimeTypes(plasmaParser.PARSER_MODE_PROXY,getConfig(PARSER_MIMETYPES_PROXY,null)); - plasmaParser.initParseableMimeTypes(plasmaParser.PARSER_MODE_CRAWLER,getConfig(PARSER_MIMETYPES_CRAWLER,null)); - plasmaParser.initParseableMimeTypes(plasmaParser.PARSER_MODE_ICAP,getConfig(PARSER_MIMETYPES_ICAP,null)); - plasmaParser.initParseableMimeTypes(plasmaParser.PARSER_MODE_URLREDIRECTOR,getConfig(PARSER_MIMETYPES_URLREDIRECTOR,null)); + plasmaParser.initHTMLParsableMimeTypes(getConfig(PARSER_MIMETYPES_HTML, "application/xhtml+xml,text/html,text/plain")); + plasmaParser.initParseableMimeTypes(plasmaParser.PARSER_MODE_PROXY, getConfig(PARSER_MIMETYPES_PROXY, null)); + plasmaParser.initParseableMimeTypes(plasmaParser.PARSER_MODE_CRAWLER, getConfig(PARSER_MIMETYPES_CRAWLER, null)); + plasmaParser.initParseableMimeTypes(plasmaParser.PARSER_MODE_ICAP, getConfig(PARSER_MIMETYPES_ICAP, null)); + plasmaParser.initParseableMimeTypes(plasmaParser.PARSER_MODE_URLREDIRECTOR, getConfig(PARSER_MIMETYPES_URLREDIRECTOR, null)); + plasmaParser.initParseableMimeTypes(plasmaParser.PARSER_MODE_IMAGE, getConfig(PARSER_MIMETYPES_IMAGE, null)); // start a loader log.logConfig("Starting Crawl Loader"); diff --git a/yacy.init b/yacy.init index 5cb9c397b..55264ca16 100644 --- a/yacy.init +++ b/yacy.init @@ -224,9 +224,9 @@ proxyCacheMigration = true # the following mime-types are the whitelist for indexing # -# parseableRealtimeMimeTypes: specifies mime-types that can be indexed on the fly +# parseableMimeTypes.HTML: specifies mime-types that can be indexed with built-in html parser # parseableMime: specifies mime-types that can be indexed but not on the fly -parseableRealtimeMimeTypes=application/xhtml+xml,text/html,text/plain,text/sgml +parseableMimeTypes.HTML=application/xhtml+xml,text/html,text/plain,text/sgml parseableMimeTypes= parseableMimeTypes__pro=application/atom+xml,application/bzip2,application/excel,application/gzip,application/java-archive,application/msexcel,application/mspowerpoint,application/msword,application/octet-stream,application/pdf,application/postscript,application/powerpoint,application/rdf+xml,application/rss+xml,application/rtf,application/tar,application/vcard,application/vnd.ms-excel,application/vnd.ms-powerpoint,application/vnd.oasis.opendocument.text,application/x-7z-compressed,application/x-bz2,application/x-bzip2,application/x-compress,application/x-compressed,application/x-excel,application/x-gzip,application/x-msexcel,application/x-redhat packet manager,application/x-redhat-package-manager,application/x-rpm,application/x-shockwave-flash,application/x-shockwave-flash2-preview,application/x-tar,application/x-vnd.oasis.opendocument.text,application/x-xml,application/x-zip,application/x-zip-compressed,application/xml,application/zip,text/postscript,text/rss,text/rtf,text/x-vcard,text/xml parseableMimeTypes.CRAWLER= @@ -237,6 +237,7 @@ parseableMimeTypes.ICAP= parseableMimeTypes.ICAP__pro=application/atom+xml,application/bzip2,application/excel,application/gzip,application/java-archive,application/msexcel,application/mspowerpoint,application/msword,application/octet-stream,application/pdf,application/postscript,application/powerpoint,application/rdf+xml,application/rss+xml,application/rtf,application/tar,application/vcard,application/vnd.ms-excel,application/vnd.ms-powerpoint,application/vnd.oasis.opendocument.text,application/x-7z-compressed,application/x-bz2,application/x-bzip2,application/x-compress,application/x-compressed,application/x-excel,application/x-gzip,application/x-msexcel,application/x-redhat packet manager,application/x-redhat-package-manager,application/x-rpm,application/x-shockwave-flash,application/x-shockwave-flash2-preview,application/x-tar,application/x-vnd.oasis.opendocument.text,application/x-xml,application/x-zip,application/x-zip-compressed,application/xml,application/zip,text/postscript,text/rss,text/rtf,text/x-vcard,text/xml parseableMimeTypes.URLREDIRECTOR= parseableMimeTypes.URLREDIRECTOR__pro=application/atom+xml,application/bzip2,application/excel,application/gzip,application/java-archive,application/msexcel,application/mspowerpoint,application/msword,application/octet-stream,application/pdf,application/postscript,application/powerpoint,application/rdf+xml,application/rss+xml,application/rtf,application/tar,application/vcard,application/vnd.ms-excel,application/vnd.ms-powerpoint,application/vnd.oasis.opendocument.text,application/x-7z-compressed,application/x-bz2,application/x-bzip2,application/x-compress,application/x-compressed,application/x-excel,application/x-gzip,application/x-msexcel,application/x-redhat packet manager,application/x-redhat-package-manager,application/x-rpm,application/x-shockwave-flash,application/x-shockwave-flash2-preview,application/x-tar,application/x-vnd.oasis.opendocument.text,application/x-xml,application/x-zip,application/x-zip-compressed,application/xml,application/zip,text/postscript,text/rss,text/rtf,text/x-vcard,text/xml +parseableMimeTypes.IMAGE=image/gif,image/jpeg,image/png,image/tiff,image/vnd.wap.wbmp,image/x-icon,image/bmp # media extension string # a comma-separated list of extensions that denote media file formats