From 44fa94ac5205c088eed17199efd0b951f5d4d89b Mon Sep 17 00:00:00 2001
From: theli
Date: Tue, 6 Dec 2005 10:41:19 +0000
Subject: [PATCH] *) Modifications for dbImport functionality - dbImporter
threads are now shut down by the switchboard on server shutdown - adding
possibility to pause an importer thread via GUI - Bugfix for abort function
See: http://www.yacy-forum.de/viewtopic.php?p=13363#13363
*) Modification of content parser configuration
- now it's possible to configure which parsers should be enabled for the proxy,
crawler, icap, etc. separately
-
*) htmlFilterContentScraper.java
- adding regular expression to normalize URLs containing /../ and /./ parts
*) httpc.java
- adding functionality to unzip gzipped content
- requested by roland: should be used later to allow gzipped seed lists
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1170 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
htroot/IndexImport_p.html | 25 +-
htroot/IndexImport_p.java | 69 +++--
htroot/QuickCrawlLink_p.html | 3 +-
htroot/SettingsAck_p.html | 10 +-
htroot/SettingsAck_p.java | 94 ++++--
htroot/Settings_Parser.inc | 18 +-
htroot/Settings_p.java | 29 +-
.../htmlFilter/htmlFilterContentScraper.java | 11 +
source/de/anomic/http/httpc.java | 42 ++-
source/de/anomic/http/httpdProxyHandler.java | 2 +-
source/de/anomic/icap/icapd.java | 3 +-
.../parser/mimeType/mimeTypeParser.java | 1 +
.../de/anomic/plasma/plasmaCrawlWorker.java | 4 +-
source/de/anomic/plasma/plasmaDbImporter.java | 85 +++++-
source/de/anomic/plasma/plasmaParser.java | 284 +++++++-----------
.../anomic/plasma/plasmaParserDocument.java | 2 +-
.../de/anomic/plasma/plasmaSwitchboard.java | 14 +-
.../anomic/urlRedirector/urlRedirectord.java | 6 +-
source/de/anomic/yacy/yacyVersion.java | 8 +
yacy.init | 6 +-
20 files changed, 442 insertions(+), 274 deletions(-)
diff --git a/htroot/IndexImport_p.html b/htroot/IndexImport_p.html
index acf5435c0..42f4fb415 100644
--- a/htroot/IndexImport_p.html
+++ b/htroot/IndexImport_p.html
@@ -30,7 +30,6 @@
-
+
@@ -86,7 +95,7 @@
#{finished.jobs}#
#[path]#
- #(stopped)#Finished::Error: #[errorMsg]##(/stopped)#
+ #(status)#Finished::Error: #[errorMsg]#::Paused#(/status)#
#[percent]#
#[elapsed]#
#[wordHash]#
diff --git a/htroot/IndexImport_p.java b/htroot/IndexImport_p.java
index 7dfd44920..6ce30b34c 100644
--- a/htroot/IndexImport_p.java
+++ b/htroot/IndexImport_p.java
@@ -102,8 +102,12 @@ public final class IndexImport_p {
plasmaDbImporter.finishedJobs.clear();
prop.put("LOCATION","");
return prop;
- } else if (post.containsKey("stopIndexDbImport")) {
- // getting the job nr of the thread that should be stopped
+ } else if (
+ (post.containsKey("stopIndexDbImport")) ||
+ (post.containsKey("pauseIndexDbImport")) ||
+ (post.containsKey("continueIndexDbImport"))
+ ) {
+ // getting the job nr of the thread
String jobNr = (String) post.get("jobNr");
Thread[] importThreads = new Thread[plasmaDbImporter.runningJobs.activeCount()*2];
@@ -112,12 +116,13 @@ public final class IndexImport_p {
for (int i=0; i < activeCount; i++) {
plasmaDbImporter currThread = (plasmaDbImporter) importThreads[i];
if (currThread.getJobNr() == Integer.valueOf(jobNr).intValue()) {
- currThread.stoppIt();
- try {
- currThread.join();
- } catch (InterruptedException e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
+ if (post.containsKey("stopIndexDbImport")) {
+ currThread.stoppIt();
+ try { currThread.join(); } catch (InterruptedException e) {e.printStackTrace();}
+ } else if (post.containsKey("pauseIndexDbImport")) {
+ currThread.pauseIt();
+ } else if (post.containsKey("continueIndexDbImport")) {
+ currThread.continueIt();
}
break;
}
@@ -138,30 +143,30 @@ public final class IndexImport_p {
for (int i=0; i < activeCount; i++) {
plasmaDbImporter currThread = (plasmaDbImporter) importThreads[i];
+
+ // root path of the source db
+ prop.put("running.jobs_" + i + "_path", currThread.getImportRoot().toString());
+
+ // specifies if the importer is still running
+ prop.put("running.jobs_" + i + "_stopped", currThread.isAlive() ? 1:0);
+
+ // specifies if the importer was paused
+ prop.put("running.jobs_" + i + "_paused", currThread.isPaused() ? 1:0);
- File importPath = currThread.getImportRoot();
- String currWordHash = currThread.getCurrentWordhash();
- long currWordEntryCount = currThread.getWordEntryCounter();
- long currWordEntityCounter = currThread.getWordEntityCounter();
- long currUrlCounter = currThread.getUrlCounter();
- //long currImportDbSize = currThread.getImportWordDbSize();
- long estimatedTime = currThread.getEstimatedTime();
- long elapsedTime = currThread.getElapsedTime();
- int jobNr = currThread.getJobNr();
- int percent = currThread.getProcessingStatus();
+ // setting the status
+ prop.put("running.jobs_" + i + "_status", currThread.isPaused() ? 2 : currThread.isAlive() ? 1 : 0);
- boolean isRunning = currThread.isAlive();
+ // other information
+ prop.put("running.jobs_" + i + "_percent", Integer.toString(currThread.getProcessingStatus()));
+ prop.put("running.jobs_" + i + "_elapsed", serverDate.intervalToString(currThread.getElapsedTime()));
+ prop.put("running.jobs_" + i + "_estimated", serverDate.intervalToString(currThread.getEstimatedTime()));
+ prop.put("running.jobs_" + i + "_wordHash", currThread.getCurrentWordhash());
+ prop.put("running.jobs_" + i + "_url_num", Long.toString(currThread.getUrlCounter()));
+ prop.put("running.jobs_" + i + "_word_entity_num", Long.toString(currThread.getWordEntityCounter()));
+ prop.put("running.jobs_" + i + "_word_entry_num", Long.toString(currThread.getWordEntryCounter()));
- prop.put("running.jobs_" + i + "_path", importPath.toString());
- prop.put("running.jobs_" + i + "_stopped", isRunning ? 1:0);
- prop.put("running.jobs_" + i + "_percent", Integer.toString(percent));
- prop.put("running.jobs_" + i + "_elapsed", serverDate.intervalToString(elapsedTime));
- prop.put("running.jobs_" + i + "_estimated", serverDate.intervalToString(estimatedTime));
- prop.put("running.jobs_" + i + "_wordHash", currWordHash);
- prop.put("running.jobs_" + i + "_url_num", Long.toString(currUrlCounter));
- prop.put("running.jobs_" + i + "_word_entity_num", Long.toString(currWordEntityCounter));
- prop.put("running.jobs_" + i + "_word_entry_num", Long.toString(currWordEntryCount));
- prop.put("running.jobs_" + i + "_stopped_job_nr", Integer.toString(jobNr));
+ // job number of the importer thread
+ prop.put("running.jobs_" + i + "_job_nr", Integer.toString(currThread.getJobNr()));
}
prop.put("running.jobs",activeCount);
@@ -174,10 +179,10 @@ public final class IndexImport_p {
String error = currThread.getError();
prop.put("finished.jobs_" + i + "_path", currThread.getImportRoot().toString());
if (error != null) {
- prop.put("finished.jobs_" + i + "_stopped", 2);
- prop.put("finished.jobs_" + i + "_stopped_errorMsg", error);
+ prop.put("finished.jobs_" + i + "_status", 2);
+ prop.put("finished.jobs_" + i + "_status_errorMsg", error);
} else {
- prop.put("finished.jobs_" + i + "_stopped", 0);
+ prop.put("finished.jobs_" + i + "_status", 0);
}
prop.put("finished.jobs_" + i + "_percent", Integer.toString(currThread.getProcessingStatus()));
prop.put("finished.jobs_" + i + "_elapsed", serverDate.intervalToString(currThread.getElapsedTime()));
diff --git a/htroot/QuickCrawlLink_p.html b/htroot/QuickCrawlLink_p.html
index 1dc541f2d..da1c8be4b 100644
--- a/htroot/QuickCrawlLink_p.html
+++ b/htroot/QuickCrawlLink_p.html
@@ -35,7 +35,8 @@ If you click on it while browsing, the currently viewed website will be inserted
- Crawl with YaCy
+ Crawl with YaCy
+
::
diff --git a/htroot/SettingsAck_p.html b/htroot/SettingsAck_p.html
index cf4a1b55b..43e5f7a8d 100644
--- a/htroot/SettingsAck_p.html
+++ b/htroot/SettingsAck_p.html
@@ -79,13 +79,15 @@ Your Peer Language is: #[peerLang]#
Peer names must not contain characters other than (a-z, A-Z, 0-9, '-', '_') and must not be longer than 80 characters.
Your Peer Language is: #[peerLang]#
::
+
The new parser settings where changed successfully.
-Parsing of the following mime-types was enabled:
-
+Parsing of the following mime-types was enabled:
+
+
#{parser}#
-#[enabledMime]#
+#[parserMode]# #[enabledMime]#
#{/parser}#
-
+
::
Seed Upload method was changed successfully.
#(success)#:: You are now a principal peer.#(/success)#
diff --git a/htroot/SettingsAck_p.java b/htroot/SettingsAck_p.java
index 5e40e04dc..3fc1414b9 100644
--- a/htroot/SettingsAck_p.java
+++ b/htroot/SettingsAck_p.java
@@ -49,14 +49,18 @@
import java.net.InetSocketAddress;
import java.net.SocketException;
import java.util.Arrays;
+import java.util.Enumeration;
import java.util.HashMap;
+import java.util.HashSet;
import java.util.Iterator;
+import java.util.Set;
import de.anomic.http.httpHeader;
import de.anomic.http.httpRemoteProxyConfig;
import de.anomic.http.httpd;
import de.anomic.http.httpdProxyHandler;
import de.anomic.plasma.plasmaParser;
+import de.anomic.plasma.plasmaParserConfig;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverCodings;
import de.anomic.server.serverCore;
@@ -537,32 +541,82 @@ public class SettingsAck_p {
/*
* Parser configuration
*/
- if (post.containsKey("parserSettings")) {
- plasmaSwitchboard sb = (plasmaSwitchboard)env;
- post.remove("parserSettings");
+ if (post.containsKey("parserSettings")) {
+ post.remove("parserSettings");
- String[] enabledMimes = null;
- if (post.containsKey("allParserEnabled")) {
- // enable all available parsers
- enabledMimes = plasmaParser.setEnabledParserList(sb.parser.getAvailableParserList().keySet());
- } else {
- // activate all received parsers
- enabledMimes = plasmaParser.setEnabledParserList(post.keySet());
- }
- Arrays.sort(enabledMimes);
+ HashMap newConfigList = new HashMap();
+ Set parserModes = plasmaParser.getParserConfigList().keySet();
- StringBuffer enabledMimesTxt = new StringBuffer();
- for (int i=0; i < enabledMimes.length; i++) {
- enabledMimesTxt.append(enabledMimes[i]).append(",");
- prop.put("info_parser_" + i + "_enabledMime",enabledMimes[i]);
+ // looping through all received settings
+ int pos;
+ Enumeration keyEnum = post.keys();
+ while (keyEnum.hasMoreElements()) {
+ String key = (String) keyEnum.nextElement();
+ if ((pos = key.indexOf(".")) != -1) {
+ String currParserMode = key.substring(0,pos).trim().toUpperCase();
+ String currMimeType = key.substring(pos+1).replaceAll("\n", "");
+ if (parserModes.contains(currParserMode)) {
+ HashSet currEnabledMimeTypes;
+ if (newConfigList.containsKey(currParserMode)) {
+ currEnabledMimeTypes = (HashSet) newConfigList.get(currParserMode);
+ } else {
+ currEnabledMimeTypes = new HashSet();
+ newConfigList.put(currParserMode, currEnabledMimeTypes);
+ }
+ currEnabledMimeTypes.add(currMimeType);
+ }
+ }
}
- prop.put("info_parser",enabledMimes.length);
- if (enabledMimesTxt.length() > 0) enabledMimesTxt.deleteCharAt(enabledMimesTxt.length()-1);
-
- env.setConfig("parseableMimeTypes",enabledMimesTxt.toString());
+ int enabledMimesCount = 0;
+ StringBuffer currEnabledMimesTxt = new StringBuffer();
+ Iterator parserModeIter = newConfigList.keySet().iterator();
+ while (parserModeIter.hasNext()) {
+ String currParserMode = (String)parserModeIter.next();
+ String[] enabledMimes = plasmaParser.setEnabledParserList(currParserMode, (Set)newConfigList.get(currParserMode));
+ Arrays.sort(enabledMimes);
+
+ currEnabledMimesTxt.setLength(0);
+ for (int i=0; i < enabledMimes.length; i++) {
+ currEnabledMimesTxt.append(enabledMimes[i]).append(",");
+ prop.put("info_parser_" + enabledMimesCount + "_parserMode",currParserMode);
+ prop.put("info_parser_" + enabledMimesCount + "_enabledMime",enabledMimes[i]);
+ enabledMimesCount++;
+ }
+ if (currEnabledMimesTxt.length() > 0) currEnabledMimesTxt.deleteCharAt(currEnabledMimesTxt.length()-1);
+ env.setConfig("parseableMimeTypes." + currParserMode,currEnabledMimesTxt.toString());
+ }
+ prop.put("info_parser",enabledMimesCount);
prop.put("info", 18);
return prop;
+
+// plasmaSwitchboard sb = (plasmaSwitchboard)env;
+//
+// HashMap configList = plasmaParser.getParserConfigList();
+// Iterator parserModeIter = configList.keySet().iterator();
+//
+// String[] enabledMimes = null;
+// if (post.containsKey("allParserEnabled")) {
+// // enable all available parsers
+// enabledMimes = plasmaParser.setEnabledParserList(sb.parser.getAvailableParserList().keySet());
+// } else {
+// // activate all received parsers
+// enabledMimes = plasmaParser.setEnabledParserList(post.keySet());
+// }
+// Arrays.sort(enabledMimes);
+//
+// StringBuffer enabledMimesTxt = new StringBuffer();
+// for (int i=0; i < enabledMimes.length; i++) {
+// enabledMimesTxt.append(enabledMimes[i]).append(",");
+// prop.put("info_parser_" + i + "_enabledMime",enabledMimes[i]);
+// }
+// prop.put("info_parser",enabledMimes.length);
+// if (enabledMimesTxt.length() > 0) enabledMimesTxt.deleteCharAt(enabledMimesTxt.length()-1);
+//
+// env.setConfig("parseableMimeTypes",enabledMimesTxt.toString());
+//
+// prop.put("info", 18);
+// return prop;
}
diff --git a/htroot/Settings_Parser.inc b/htroot/Settings_Parser.inc
index 2953b68cd..a2f53cec7 100644
--- a/htroot/Settings_Parser.inc
+++ b/htroot/Settings_Parser.inc
@@ -5,33 +5,41 @@ For a detailed description of the various MIME-types take a look at
#{parser}#
- #[name]# V#[version]#
+ #[name]# V#[version]#
#[usage]#
#{mime}#
-
+#{parserMode}#
+
+#{/parserMode}#
#[mimetype]#
#{/mime}#
#{/parser}#
+
- Changes take effect immediately
+ Changes take effect immediately
diff --git a/htroot/Settings_p.java b/htroot/Settings_p.java
index c7bc14430..52121b83f 100644
--- a/htroot/Settings_p.java
+++ b/htroot/Settings_p.java
@@ -49,6 +49,8 @@ import java.util.HashSet;
import java.util.Iterator;
import de.anomic.http.httpHeader;
+import de.anomic.plasma.plasmaParser;
+import de.anomic.plasma.plasmaParserConfig;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.parser.ParserInfo;
import de.anomic.server.serverObjects;
@@ -248,7 +250,10 @@ public final class Settings_p {
* Parser Configuration
*/
plasmaSwitchboard sb = (plasmaSwitchboard)env;
- HashSet enabledParsers = sb.parser.getEnabledParserList();
+
+ HashMap configList = plasmaParser.getParserConfigList();
+ plasmaParserConfig[] configArray = (plasmaParserConfig[]) configList.values().toArray(new plasmaParserConfig[configList.size()]);
+
HashSet parserInfos = new HashSet(sb.parser.getAvailableParserList().values());
// // fetching a list of all available mimetypes
@@ -258,7 +263,7 @@ public final class Settings_p {
// Collections.sort(availableParserKeys);
// loop through the mimeTypes and add it to the properties
- boolean allParsersEnabled = true;
+ boolean[] allParsersEnabled = new boolean[configList.size()];
int parserIdx = 0;
Iterator availableParserIter = parserInfos.iterator();
@@ -267,20 +272,23 @@ public final class Settings_p {
prop.put("parser_" + parserIdx + "_name", parserInfo.parserName);
prop.put("parser_" + parserIdx + "_version", parserInfo.parserVersionNr);
prop.put("parser_" + parserIdx + "_usage", Integer.toString(parserInfo.usageCount));
+ prop.put("parser_" + parserIdx + "_colspan",Integer.toString(configArray.length+1));
int mimeIdx = 0;
Enumeration mimeTypeIter = parserInfo.supportedMimeTypes.keys();
while (mimeTypeIter.hasMoreElements()) {
String mimeType = (String)mimeTypeIter.nextElement();
- boolean parserIsEnabled = enabledParsers.contains(mimeType);
-
prop.put("parser_" + parserIdx + "_mime_" + mimeIdx + "_mimetype", mimeType);
//prop.put("parser_" + parserIdx + "_name", parserName);
//prop.put("parser_" + parserIdx + "_shortname", parserName.substring(parserName.lastIndexOf(".")+1));
- prop.put("parser_" + parserIdx + "_mime_" + mimeIdx + "_status", enabledParsers.contains(mimeType) ? 1:0);
- allParsersEnabled &= parserIsEnabled;
-
+ for (int i=0; i= 0) path = path.substring(0, cpos);
+
+ Pattern pathPattern = Pattern.compile("(/[^/\\.]+/)(? 1) && (((a[1] << 8) | a[0]) == GZIPInputStream.GZIP_MAGIC)) {
+ try {
+ ByteArrayInputStream byteInput = new ByteArrayInputStream(a);
+ ByteArrayOutputStream byteOutput = new ByteArrayOutputStream();
+ GZIPInputStream zippedContent = new GZIPInputStream(byteInput);
+ byte[] data = new byte[1024];
+ int read = 0;
+
+ // reading gzip file and store it uncompressed
+ while((read = zippedContent.read(data, 0, 1024)) != -1) {
+ byteOutput.write(data, 0, read);
+ }
+ zippedContent.close();
+ byteOutput.close();
+
+ a = byteOutput.toByteArray();
+ } catch (Exception e) {
+ if (!e.getMessage().equals("Not in GZIP format")) {
+ throw new IOException(e.getMessage());
+ }
+ }
+ }
+
int s = 0;
int e;
ArrayList v = new ArrayList();
@@ -1872,6 +1899,7 @@ final class httpcFactory implements org.apache.commons.pool.PoolableObjectFactor
* @see org.apache.commons.pool.PoolableObjectFactory#destroyObject(java.lang.Object)
*/
public void destroyObject(Object obj) {
+ assert(obj instanceof httpc): "Invalid object type added to pool.";
if (obj instanceof httpc) {
httpc theHttpc = (httpc) obj;
@@ -1883,12 +1911,7 @@ final class httpcFactory implements org.apache.commons.pool.PoolableObjectFactor
* @see org.apache.commons.pool.PoolableObjectFactory#validateObject(java.lang.Object)
*/
public boolean validateObject(Object obj) {
- /*
- if (obj instanceof httpc) {
- httpc theHttpc = (httpc) obj;
- return true;
- }
- */
+ assert(obj instanceof httpc): "Invalid object type in pool.";
return true;
}
@@ -1905,12 +1928,7 @@ final class httpcFactory implements org.apache.commons.pool.PoolableObjectFactor
*
*/
public void passivateObject(Object obj) {
- //log.debug(" passivateObject..." + obj);
- /*
- if (obj instanceof Session) {
- httpc theHttpc = (httpc) obj;
- }
- */
+ assert(obj instanceof httpc): "Invalid object type returned to pool.";
}
}
diff --git a/source/de/anomic/http/httpdProxyHandler.java b/source/de/anomic/http/httpdProxyHandler.java
index 28ceeac98..45cc91240 100644
--- a/source/de/anomic/http/httpdProxyHandler.java
+++ b/source/de/anomic/http/httpdProxyHandler.java
@@ -763,7 +763,7 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt
// make a transformer
if ((!(transformer.isIdentityTransformer())) &&
- ((ext == null) || (!(plasmaParser.mediaExtContains(ext)))) &&
+ ((ext == null) || (!(plasmaParser.supportedRealtimeFileExtContains(url)))) &&
((cachedResponseHeader == null) || (plasmaParser.realtimeParsableMimeTypesContains(cachedResponseHeader.mime())))) {
hfos = new htmlFilterOutputStream((chunkedOut != null) ? chunkedOut : respond, null, transformer, (ext.length() == 0));
} else {
diff --git a/source/de/anomic/icap/icapd.java b/source/de/anomic/icap/icapd.java
index f4e7330dc..836d540dc 100644
--- a/source/de/anomic/icap/icapd.java
+++ b/source/de/anomic/icap/icapd.java
@@ -372,8 +372,7 @@ public class icapd implements serverHandler {
reader.close();
resHdrStream.close();
- if ((!(plasmaParser.supportedMimeTypesContains(httpResHeader.mime()))) &&
- (!(plasmaParser.supportedFileExt(httpRequestURL)))) {
+ if (!plasmaParser.supportedContent(plasmaParser.PARSER_MODE_ICAP, httpRequestURL, httpResHeader.mime())) {
this.log.logInfo("Wrong mimeType or fileExtension for indexing:" +
"\nMimeType: " + httpResHeader.mime() +
"\nRequest Line:" + httpRequestLine);
diff --git a/source/de/anomic/plasma/parser/mimeType/mimeTypeParser.java b/source/de/anomic/plasma/parser/mimeType/mimeTypeParser.java
index ff961261b..848741032 100644
--- a/source/de/anomic/plasma/parser/mimeType/mimeTypeParser.java
+++ b/source/de/anomic/plasma/parser/mimeType/mimeTypeParser.java
@@ -154,6 +154,7 @@ implements Parser {
Collection subMatches = match.getSubMatches();
if ((subMatches != null) && (!subMatches.isEmpty())) {
mimeType = ((MagicMatch) subMatches.iterator().next()).getMimeType();
+ if ((mimeType == null)||(mimeType.length() == 0)) mimeType = match.getMimeType();
} else {
mimeType = match.getMimeType();
}
diff --git a/source/de/anomic/plasma/plasmaCrawlWorker.java b/source/de/anomic/plasma/plasmaCrawlWorker.java
index 4105e1154..15ecb79a1 100644
--- a/source/de/anomic/plasma/plasmaCrawlWorker.java
+++ b/source/de/anomic/plasma/plasmaCrawlWorker.java
@@ -375,7 +375,7 @@ public final class plasmaCrawlWorker extends Thread {
// request has been placed and result has been returned. work off response
File cacheFile = cacheManager.getCachePath(url);
try {
- if (plasmaParser.supportedContent(url,res.responseHeader.mime())) {
+ if (plasmaParser.supportedContent(plasmaParser.PARSER_MODE_CRAWLER,url,res.responseHeader.mime())) {
if (cacheFile.isFile()) {
cacheManager.deleteFile(url);
}
@@ -521,7 +521,7 @@ public final class plasmaCrawlWorker extends Thread {
}
// returning the used httpc
- httpc.returnInstance(remote);
+ if (remote != null) httpc.returnInstance(remote);
remote = null;
// setting the retry counter to 1
diff --git a/source/de/anomic/plasma/plasmaDbImporter.java b/source/de/anomic/plasma/plasmaDbImporter.java
index ffdac04b6..8c952d802 100644
--- a/source/de/anomic/plasma/plasmaDbImporter.java
+++ b/source/de/anomic/plasma/plasmaDbImporter.java
@@ -26,10 +26,10 @@ public class plasmaDbImporter extends Thread {
private final serverLog log;
private boolean stopped = false;
- //private boolean paused = false;
+ private boolean paused = false;
private String wordHash = "------------";
- long wordChunkStart = System.currentTimeMillis(), wordChunkEnd = wordChunkStart;
+ long wordChunkStart = System.currentTimeMillis(), wordChunkEnd = this.wordChunkStart;
String wordChunkStartHash = "------------", wordChunkEndHash;
private long urlCounter = 0, wordCounter = 0, entryCounter = 0;
@@ -40,6 +40,74 @@ public class plasmaDbImporter extends Thread {
public void stoppIt() {
this.stopped = true;
+ this.continueIt();
+ }
+
+ public void pauseIt() {
+ synchronized(this) {
+ this.paused = true;
+ }
+ }
+
+ public void continueIt() {
+ synchronized(this) {
+ if (this.paused) {
+ this.paused = false;
+ this.notifyAll();
+ }
+ }
+ }
+
+ public boolean isPaused() {
+ synchronized(this) {
+ return this.paused;
+ }
+ }
+
+ /**
+ * Can be used to close all still running importer threads
+ * e.g. on server shutdown
+ */
+ public static void close() {
+ /* waiting for all threads to finish */
+ int threadCount = runningJobs.activeCount();
+ Thread[] threadList = new Thread[threadCount];
+ threadCount = plasmaDbImporter.runningJobs.enumerate(threadList);
+
+ if (threadCount == 0) return;
+
+ serverLog log = new serverLog("DB-IMPORT");
+ try {
+ // trying to gracefull stop all still running sessions ...
+ log.logInfo("Signaling shutdown to " + threadCount + " remaining dbImporter threads ...");
+ for ( int currentThreadIdx = 0; currentThreadIdx < threadCount; currentThreadIdx++ ) {
+ Thread currentThread = threadList[currentThreadIdx];
+ if (currentThread.isAlive()) {
+ ((plasmaDbImporter)currentThread).stoppIt();
+ }
+ }
+
+ // waiting a few ms for the session objects to continue processing
+ try { Thread.sleep(500); } catch (InterruptedException ex) {}
+
+ // interrupting all still running or pooled threads ...
+ log.logInfo("Sending interruption signal to " + runningJobs.activeCount() + " remaining dbImporter threads ...");
+ plasmaDbImporter.runningJobs.interrupt();
+
+ // we need to use a timeout here because of missing interruptable session threads ...
+ log.logFine("Waiting for " + runningJobs.activeCount() + " remaining dbImporter threads to finish shutdown ...");
+ for ( int currentThreadIdx = 0; currentThreadIdx < threadCount; currentThreadIdx++ ) {
+ Thread currentThread = threadList[currentThreadIdx];
+ if (currentThread.isAlive()) {
+ log.logFine("Waiting for dbImporter thread '" + currentThread.getName() + "' [" + currentThreadIdx + "] to finish shutdown.");
+ try { currentThread.join(500); } catch (InterruptedException ex) {}
+ }
+ }
+
+ log.logInfo("Shutdown of remaining dbImporter threads finished.");
+ } catch (Exception e) {
+ log.logSevere("Unexpected error while trying to shutdown all remaining dbImporter threads.",e);
+ }
}
public String getError() {
@@ -94,7 +162,7 @@ public class plasmaDbImporter extends Thread {
if (theHomeUrlDB == null) throw new NullPointerException();
this.homeUrlDB = theHomeUrlDB;
- if (this.homeWordIndex.getRoot().equals(importRoot)) {
+ if (this.homeWordIndex.getRoot().equals(this.importRoot)) {
throw new IllegalArgumentException("Import and home DB directory must not be equal");
}
@@ -120,7 +188,7 @@ public class plasmaDbImporter extends Thread {
try {
importWordsDB();
} finally {
- globalEnd = System.currentTimeMillis();
+ this.globalEnd = System.currentTimeMillis();
finishedJobs.add(this);
}
}
@@ -249,6 +317,15 @@ public class plasmaDbImporter extends Thread {
}
private boolean isAborted() {
+ synchronized(this) {
+ if (this.paused) {
+ try {
+ this.wait();
+ }
+ catch (InterruptedException e){}
+ }
+ }
+
return (this.stopped) || Thread.currentThread().isInterrupted();
}
diff --git a/source/de/anomic/plasma/plasmaParser.java b/source/de/anomic/plasma/plasmaParser.java
index f5d89241e..37817e8b8 100644
--- a/source/de/anomic/plasma/plasmaParser.java
+++ b/source/de/anomic/plasma/plasmaParser.java
@@ -42,7 +42,6 @@
// compile: javac -classpath lib/commons-collections.jar:lib/commons-pool-1.2.jar -sourcepath source source/de/anomic/plasma/plasmaParser.java
-
package de.anomic.plasma;
import java.io.BufferedInputStream;
@@ -56,7 +55,6 @@ import java.net.MalformedURLException;
import java.net.URI;
import java.net.URL;
import java.util.Arrays;
-import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Hashtable;
@@ -81,53 +79,35 @@ import org.apache.commons.pool.KeyedPoolableObjectFactory;
import org.apache.commons.pool.impl.GenericObjectPool;
public final class plasmaParser {
-
+ public static final String PARSER_MODE_PROXY = "PROXY";
+ public static final String PARSER_MODE_CRAWLER = "CRAWLER";
+ public static final String PARSER_MODE_URLREDIRECTOR = "URLREDIRECTOR";
+ public static final String PARSER_MODE_ICAP = "ICAP";
+ public static final HashSet PARSER_MODE = new HashSet(Arrays.asList(new String[]{
+ PARSER_MODE_PROXY,
+ PARSER_MODE_CRAWLER,
+ PARSER_MODE_ICAP,
+ PARSER_MODE_URLREDIRECTOR
+ }));
+
+ private static final HashMap parserConfigList = new HashMap();
+
/**
* A list containing all installed parsers and the mimeType that they support
* @see #loadAvailableParserList()
*/
- private static final Properties availableParserList = new Properties();
-
- /**
- * A list containing all enabled parsers and the mimeType that they can handle
- * @see #loadEnabledParserList()
- * @see #setEnabledParserList(Enumeration)
- */
- private static final HashSet enabledParserList = new HashSet();
-
- /**
- * A list of file extensions that are supported by all enabled parsers
- */
- private static final HashSet supportedFileExt = new HashSet();
+ static final Properties availableParserList = new Properties();
/**
* A list of file extensions that are supported by the html-parser and can
* be parsed in realtime.
*/
- private static final HashSet supportedRealtimeFileExt = new HashSet();
-
- /**
- * A list of mimeTypes that are generic
- */
- private static final HashSet genericMimeTypes = new HashSet();
- static {
- genericMimeTypes.add("text/plain");
- genericMimeTypes.add("text/text");
- genericMimeTypes.add("text/xml");
- genericMimeTypes.add("application/xml");
- genericMimeTypes.add("application/x-xml");
- genericMimeTypes.add("application/octet-stream");
- genericMimeTypes.add("application/zip");
- genericMimeTypes.add("application/x-zip");
- genericMimeTypes.add("application/x-zip-compressed");
- genericMimeTypes.add("application/x-compress");
- genericMimeTypes.add("application/x-compressed");
- }
+ static final HashSet supportedRealtimeFileExt = new HashSet();
/**
* A list of mimeTypes that can be parsed in Realtime (on the fly)
*/
- private static final HashSet realtimeParsableMimeTypes = new HashSet();
+ static final HashSet realtimeParsableMimeTypes = new HashSet();
private static final Properties mimeTypeLookupByFileExt = new Properties();
static {
@@ -147,7 +127,7 @@ public final class plasmaParser {
* @see plasmaParserPool
* @see plasmaParserFactory
*/
- private static plasmaParserPool theParserPool;
+ static plasmaParserPool theParserPool;
/**
* A list of media extensions that should not be handled by the plasmaParser
@@ -209,6 +189,10 @@ public final class plasmaParser {
private serverLog theLogger = new serverLog("PARSER");
+ public static HashMap getParserConfigList() {
+ return parserConfigList;
+ }
+
/**
* This function is used to initialize the realtimeParsableMimeTypes List.
* This list contains a list of mimeTypes that can be parsed in realtime by
@@ -241,17 +225,7 @@ public final class plasmaParser {
}
}
- public static void initParseableMimeTypes(String enabledMimeTypes) {
- HashSet mimeTypes = null;
- if ((enabledMimeTypes == null) || (enabledMimeTypes.length() == 0)) {
- mimeTypes = new HashSet();
- } else {
- String[] enabledMimeTypeList = enabledMimeTypes.split(",");
- mimeTypes = new HashSet(enabledMimeTypeList.length);
- for (int i = 0; i < enabledMimeTypeList.length; i++) mimeTypes.add(enabledMimeTypeList[i].toLowerCase().trim());
- }
- setEnabledParserList(mimeTypes);
- }
+
public static List extString2extList(String extString) {
LinkedList extensions = new LinkedList();
@@ -291,32 +265,17 @@ public final class plasmaParser {
}
}
- public static boolean supportedContent(URL url, String mimeType) {
- // TODO: we need some exceptions here to index URLs like this
- // http://www.musicabona.com/respighi/12668/cd/index.html.fr
- mimeType = getRealMimeType(mimeType);
- if (mimeType.equals("text/html")) {
- return supportedMimeTypesContains(mimeType);
- } else {
- return supportedMimeTypesContains(mimeType) && supportedFileExt(url);
- }
- }
-
public static boolean supportedRealTimeContent(URL url, String mimeType) {
- return realtimeParsableMimeTypesContains(mimeType) && supportedFileExt(url);
+ return realtimeParsableMimeTypesContains(mimeType) && supportedRealtimeFileExtContains(url);
}
- public static boolean supportedMimeTypesContains(String mimeType) {
- mimeType = getRealMimeType(mimeType);
-
- synchronized (realtimeParsableMimeTypes) {
- if (realtimeParsableMimeTypes.contains(mimeType)) return true;
- }
-
- synchronized (enabledParserList) {
- return enabledParserList.contains(mimeType);
- }
+ public static boolean supportedRealtimeFileExtContains(URL url) {
+ String fileExt = getFileExt(url);
+ synchronized (supportedRealtimeFileExt) {
+ return supportedRealtimeFileExt.contains(fileExt);
+ }
}
+
public static String getFileExt(URL url) {
// getting the file path
@@ -339,35 +298,12 @@ public final class plasmaParser {
if (p < 0) return "";
return name.substring(p + 1);
}
-
- public static boolean supportedFileExt(URL url) {
- // getting the file path
- String name = getFileExt(url);
- return supportedFileExtContains(name);
- }
-
- public static boolean supportedFileExtContains(String fileExt) {
- if (supportedFileExt == null) return false;
- if (fileExt == null) return false;
- fileExt = fileExt.trim().toLowerCase();
-
- synchronized(supportedFileExt) {
- if (supportedFileExt.contains(fileExt)) return true;
- }
-
- synchronized (supportedRealtimeFileExt) {
- return supportedRealtimeFileExt.contains(fileExt);
- }
- }
+
public static boolean mediaExtContains(String mediaExt) {
if (mediaExt == null) return false;
mediaExt = mediaExt.trim().toLowerCase();
- synchronized (supportedFileExt) {
- if (supportedFileExt.contains(mediaExt)) return false;
- }
-
synchronized (supportedRealtimeFileExt) {
if (supportedRealtimeFileExt.contains(mediaExt)) return false;
}
@@ -389,74 +325,7 @@ public final class plasmaParser {
public static String getMimeTypeByFileExt(String fileExt) {
return mimeTypeLookupByFileExt.getProperty(fileExt,"application/octet-stream");
}
-
- public plasmaParser() {
- // nothing todo here at the moment
- }
-
- public static void enableAllParsers() {
- Set availableMimeTypes = availableParserList.keySet();
- setEnabledParserList(availableMimeTypes);
- }
-
- public static String[] setEnabledParserList(Set mimeTypeSet) {
-
- HashSet newEnabledParsers = new HashSet();
- HashSet newSupportedFileExt = new HashSet();
-
- if (mimeTypeSet != null) {
- Iterator mimeTypes = mimeTypeSet.iterator();
- while (mimeTypes.hasNext()) {
- String mimeType = (String) mimeTypes.next();
- if (availableParserList.containsKey(mimeType)) {
- Parser theParser = null;
- try {
- // getting the parser
- theParser = (Parser) plasmaParser.theParserPool.borrowObject(((ParserInfo)availableParserList.get(mimeType)).parserClassName);
-
- // getting a list of mimeTypes that the parser supports
- Hashtable parserSupportsMimeTypes = theParser.getSupportedMimeTypes();
- if (parserSupportsMimeTypes != null) {
- Object supportedExtensions = parserSupportsMimeTypes.get(mimeType);
- if ((supportedExtensions != null) &&
- (supportedExtensions instanceof String) &&
- (((String)supportedExtensions).length() > 0)) {
- String[] extArray = ((String)supportedExtensions).split(",");
- newSupportedFileExt.addAll(Arrays.asList(extArray));
- }
- }
- newEnabledParsers.add(mimeType);
-
- } catch (Exception e) {
- serverLog.logSevere("PARSER", "error in setEnabledParserList", e);
- } finally {
- if (theParser != null)
- try { plasmaParser.theParserPool.returnObject(mimeType,theParser); } catch (Exception e) {}
- }
- }
- }
- }
-
- synchronized (enabledParserList) {
- enabledParserList.clear();
- enabledParserList.addAll(newEnabledParsers);
- }
-
-
- synchronized (supportedFileExt) {
- supportedFileExt.clear();
- supportedFileExt.addAll(newSupportedFileExt);
- }
- return (String[])newEnabledParsers.toArray(new String[newEnabledParsers.size()]);
- }
-
- public HashSet getEnabledParserList() {
- synchronized (plasmaParser.enabledParserList) {
- return (HashSet) plasmaParser.enabledParserList.clone();
- }
- }
-
public Hashtable getAvailableParserList() {
return plasmaParser.availableParserList;
}
@@ -556,8 +425,12 @@ public final class plasmaParser {
public void close() {
// clearing the parser list
- synchronized (enabledParserList) {
- enabledParserList.clear();
+ Iterator configs = parserConfigList.values().iterator();
+ while (configs.hasNext()) {
+ plasmaParserConfig currentConfig = (plasmaParserConfig) configs.next();
+ synchronized (currentConfig.enabledParserList) {
+ currentConfig.enabledParserList.clear();
+ }
}
// closing the parser object pool
@@ -659,7 +532,7 @@ public final class plasmaParser {
//e.printStackTrace();
return null;
} finally {
- if ((theParser != null) && (supportedMimeTypesContains(mimeType))) {
+ if (theParser != null) {
try { plasmaParser.theParserPool.returnObject(mimeType, theParser); } catch (Exception e) { }
}
}
@@ -693,8 +566,8 @@ public final class plasmaParser {
// determining the proper parser class name for the mimeType
String parserClassName = null;
ParserInfo parserInfo = null;
- synchronized (plasmaParser.enabledParserList) {
- if (plasmaParser.enabledParserList.contains(mimeType)) {
+ synchronized (plasmaParser.availableParserList) {
+ if (plasmaParser.availableParserList.containsKey(mimeType)) { // mime types are the keys; Hashtable.contains() would test the values
parserInfo = (ParserInfo)plasmaParser.availableParserList.get(mimeType);
parserClassName = parserInfo.parserClassName;
} else {
@@ -815,7 +688,7 @@ public final class plasmaParser {
plasmaParser.initRealtimeParsableMimeTypes("application/xhtml+xml,text/html,text/plain");
// configure all other supported mimeTypes
- plasmaParser.enableAllParsers();
+ plasmaParser.enableAllParsers(PARSER_MODE_PROXY);
// parsing the content
plasmaParserDocument document = theParser.parseSource(contentURL, contentMimeType, contentFile);
@@ -830,6 +703,144 @@ public final class plasmaParser {
}
}
+    /**
+     * Returns the configuration registered for the given parser mode, creating
+     * and registering a new one if the mode has no configuration yet.
+     *
+     * @param parserMode the parser mode; must be contained in PARSER_MODE
+     * @return the configuration stored in parserConfigList for this mode, never null
+     * @throws IllegalArgumentException if parserMode is not contained in PARSER_MODE
+     */
+    private static plasmaParserConfig getOrCreateConfig(String parserMode) {
+        if (!PARSER_MODE.contains(parserMode)) throw new IllegalArgumentException("unknown parser mode: " + parserMode);
+
+        plasmaParserConfig config = (plasmaParserConfig) parserConfigList.get(parserMode);
+        if (config == null) {
+            config = new plasmaParserConfig(parserMode);
+            parserConfigList.put(parserMode, config);
+        }
+        return config;
+    }
+
+    /**
+     * Enables all parsers for the given parser mode.
+     *
+     * The configuration is created on demand, so calling this for a mode that
+     * has not been initialized yet does not fail with a NullPointerException.
+     *
+     * @param parserMode the parser mode; must be contained in PARSER_MODE
+     * @throws IllegalArgumentException if parserMode is not contained in PARSER_MODE
+     */
+    private static void enableAllParsers(String parserMode) {
+        getOrCreateConfig(parserMode).enableAllParsers();
+    }
+
+    /**
+     * Tests whether at least one registered parser configuration, regardless of
+     * its mode, reports the given content as supported.
+     *
+     * @param url the URL of the content; must not be null
+     * @param mimeType the mime type of the content
+     * @return true if any configuration in parserConfigList supports the content
+     * @throws NullPointerException if url is null
+     */
+    public static boolean supportedContent(URL url, String mimeType) {
+        if (url == null) throw new NullPointerException("url must not be null");
+
+        Iterator configs = parserConfigList.values().iterator();
+        while (configs.hasNext()) {
+            plasmaParserConfig currentConfig = (plasmaParserConfig) configs.next();
+            synchronized (currentConfig.enabledParserList) {
+                if (currentConfig.supportedContent(url, mimeType)) return true;
+            }
+        }
+
+        return false;
+    }
+
+    /**
+     * Tests whether the configuration of the given parser mode reports the
+     * given content as supported.
+     *
+     * @param parserMode the parser mode; must be contained in PARSER_MODE
+     * @param url the URL of the content; must not be null
+     * @param mimeType the mime type of the content
+     * @return false if no configuration is registered for the mode, otherwise
+     *         the answer of that mode's configuration
+     * @throws IllegalArgumentException if parserMode is not contained in PARSER_MODE
+     * @throws NullPointerException if url is null
+     */
+    public static boolean supportedContent(String parserMode, URL url, String mimeType) {
+        if (!PARSER_MODE.contains(parserMode)) throw new IllegalArgumentException("unknown parser mode: " + parserMode);
+        if (url == null) throw new NullPointerException("url must not be null");
+
+        plasmaParserConfig config = (plasmaParserConfig) parserConfigList.get(parserMode);
+        return (config == null)?false:config.supportedContent(url, mimeType);
+    }
+
+    /**
+     * Passes the given configuration string to the configuration of the given
+     * parser mode, creating that configuration first if necessary.
+     *
+     * @param parserMode the parser mode; must be contained in PARSER_MODE
+     * @param configStr the configuration string handed to the mode's configuration
+     * @throws IllegalArgumentException if parserMode is not contained in PARSER_MODE
+     */
+    public static void initParseableMimeTypes(String parserMode, String configStr) {
+        getOrCreateConfig(parserMode).initParseableMimeTypes(configStr);
+    }
+
+    /**
+     * Hands the given mime type set to setEnabledParserList of the given parser
+     * mode's configuration, creating that configuration first if necessary.
+     *
+     * @param parserMode the parser mode; must be contained in PARSER_MODE
+     * @param mimeTypeSet the set handed to the mode's configuration
+     * @return the array returned by the mode's configuration
+     * @throws IllegalArgumentException if parserMode is not contained in PARSER_MODE
+     */
+    public static String[] setEnabledParserList(String parserMode, Set mimeTypeSet) {
+        return getOrCreateConfig(parserMode).setEnabledParserList(mimeTypeSet);
+    }
+
+    /**
+     * Tests whether at least one registered parser configuration reports the
+     * given file extension as supported.
+     *
+     * @param fileExt the file extension to look up
+     * @return true if any configuration in parserConfigList supports the extension
+     */
+    public static boolean supportedFileExtContains(String fileExt) {
+        Iterator configs = parserConfigList.values().iterator();
+        while (configs.hasNext()) {
+            plasmaParserConfig currentConfig = (plasmaParserConfig) configs.next();
+            synchronized (currentConfig.enabledParserList) {
+                if (currentConfig.supportedFileExtContains(fileExt)) return true;
+            }
+        }
+
+        return false;
+    }
+
+    /**
+     * Tests whether at least one registered parser configuration reports the
+     * given mime type as supported.
+     *
+     * @param mimeType the mime type to look up
+     * @return true if any configuration in parserConfigList supports the mime type
+     */
+    public static boolean supportedMimeTypesContains(String mimeType) {
+        Iterator configs = parserConfigList.values().iterator();
+        while (configs.hasNext()) {
+            plasmaParserConfig currentConfig = (plasmaParserConfig) configs.next();
+            synchronized (currentConfig.enabledParserList) {
+                if (currentConfig.supportedMimeTypesContains(mimeType)) return true;
+            }
+        }
+
+        return false;
+    }
+
}
final class plasmaParserFactory implements KeyedPoolableObjectFactory {
@@ -920,5 +868,3 @@ final class plasmaParserPool extends GenericKeyedObjectPool {
super.returnObject(key,borrowed);
}
}
-
-
diff --git a/source/de/anomic/plasma/plasmaParserDocument.java b/source/de/anomic/plasma/plasmaParserDocument.java
index f52a6e8bf..fad277003 100644
--- a/source/de/anomic/plasma/plasmaParserDocument.java
+++ b/source/de/anomic/plasma/plasmaParserDocument.java
@@ -196,7 +196,7 @@ public class plasmaParserDocument {
ext = url.substring(extpos).toLowerCase();
}
normal = htmlFilterContentScraper.urlNormalform(null, url);
- if (normal != null) {
+ if (normal != null) { //TODO: extension function is not correct
if (plasmaParser.mediaExtContains(ext.substring(1))) {
// this is not an normal anchor, its a media link
medialinks.put(normal, entry.getValue());
diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java
index 70786e062..1b03d6780 100644
--- a/source/de/anomic/plasma/plasmaSwitchboard.java
+++ b/source/de/anomic/plasma/plasmaSwitchboard.java
@@ -387,7 +387,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// define a realtime parsable mimetype list
log.logConfig("Parser: Initializing Mime Types");
plasmaParser.initRealtimeParsableMimeTypes(getConfig("parseableRealtimeMimeTypes","application/xhtml+xml,text/html,text/plain"));
- plasmaParser.initParseableMimeTypes(getConfig("parseableMimeTypes",null));
+ plasmaParser.initParseableMimeTypes(plasmaParser.PARSER_MODE_PROXY,getConfig("parseableMimeTypes.PROXY",null));
+ plasmaParser.initParseableMimeTypes(plasmaParser.PARSER_MODE_CRAWLER,getConfig("parseableMimeTypes.CRAWLER",null));
+ plasmaParser.initParseableMimeTypes(plasmaParser.PARSER_MODE_ICAP,getConfig("parseableMimeTypes.ICAP",null));
+ plasmaParser.initParseableMimeTypes(plasmaParser.PARSER_MODE_URLREDIRECTOR,getConfig("parseableMimeTypes.URLREDIRECTOR",null));
// start a loader
log.logConfig("Starting Crawl Loader");
@@ -721,7 +724,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
wordIndex.close(waitingBoundSeconds);
log.logConfig("SWITCHBOARD SHUTDOWN STEP 3: sending termination signal to database manager");
try {
- //sbStackCrawlThread.stopIt();
+ // closing all still running db importer jobs
+ plasmaDbImporter.close();
+
indexDistribution.close();
cacheLoader.close();
wikiDB.close();
@@ -1141,7 +1146,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
plasmaParserDocument document = null;
httpHeader entryRespHeader = entry.responseHeader();
String mimeType = (entryRespHeader == null)?null:entryRespHeader.mime();
- if (plasmaParser.supportedContent(entry.url(),mimeType)){
+ if (plasmaParser.supportedContent(
+ entry.url(),
+ mimeType)
+ ){
if ((entry.cacheFile().exists()) && (entry.cacheFile().length() > 0)) {
log.logFine("(Parser) '" + entry.normalizedURLString() + "' is not parsed yet, parsing now from File");
document = parser.parseSource(entry.url(), mimeType, entry.cacheFile());
diff --git a/source/de/anomic/urlRedirector/urlRedirectord.java b/source/de/anomic/urlRedirector/urlRedirectord.java
index 4def8aaa3..c7806f77f 100644
--- a/source/de/anomic/urlRedirector/urlRedirectord.java
+++ b/source/de/anomic/urlRedirector/urlRedirectord.java
@@ -160,7 +160,11 @@ public class urlRedirectord implements serverHandler {
// getting URL mimeType
httpHeader header = httpc.whead(reqURL, 10000, null, null, switchboard.remoteProxyConfig);
- if (plasmaParser.supportedContent(reqURL,header.mime())) {
+ if (plasmaParser.supportedContent(
+ plasmaParser.PARSER_MODE_URLREDIRECTOR,
+ reqURL,
+ header.mime())
+ ) {
// enqueuing URL for crawling
reasonString = switchboard.sbStackCrawlThread.stackCrawl(
this.nextURL,
diff --git a/source/de/anomic/yacy/yacyVersion.java b/source/de/anomic/yacy/yacyVersion.java
index edff8d08a..8e053354d 100644
--- a/source/de/anomic/yacy/yacyVersion.java
+++ b/source/de/anomic/yacy/yacyVersion.java
@@ -50,6 +50,14 @@ public final class yacyVersion {
if ((removedSettings == null)||(removedSettings.size() == 0)) return null;
HashMap migratedSettings = new HashMap();
+ if (removedSettings.containsKey("parseableMimeTypes")) {
+ String value = (String) removedSettings.get("parseableMimeTypes");
+ migratedSettings.put("parseableMimeTypes.CRAWLER", value);
+ migratedSettings.put("parseableMimeTypes.PROXY", value);
+ migratedSettings.put("parseableMimeTypes.URLREDIRECTOR", value);
+ migratedSettings.put("parseableMimeTypes.ICAP", value);
+ }
+
return migratedSettings;
}
diff --git a/yacy.init b/yacy.init
index 0a4c7c0b7..39cea8fab 100644
--- a/yacy.init
+++ b/yacy.init
@@ -98,7 +98,11 @@ proxyCacheSize = 200
# parseableRealtimeMimeTypes: specifies mime-types that can be indexed on the fly
# parseableMime: specifies mime-types that can be indexed but not on the fly
parseableRealtimeMimeTypes=application/xhtml+xml,text/html,text/plain
-parseableMimeTypes=
+parseableMimeTypes.CRAWLER=
+parseableMimeTypes.PROXY=
+parseableMimeTypes.ICAP=
+parseableMimeTypes.URLREDIRECTOR=
+
# media extension string
# a comma-separated list of extensions that denote media file formats