- some work to integrate the html parser the same way as the other parsers are integrated (not finished)

- added migration of code of settings pages (hmm.. does not work correctly yet, sorry)
- more refactoring
- removed more unused code

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6187 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 16 years ago
parent 1ee109761f
commit 8ca1f5d400

@ -249,16 +249,15 @@ minimumGlobalDelta = 500
# the following mime-types are the whitelist for indexing
#
# parseableMimeTypes.HTML: specifies mime-types that can be indexed with built-in html parser
# parseableMime: specifies mime-types that can be indexed but not on the fly
parseableMimeTypes.HTML=application/xhtml+xml,text/html,text/plain,text/sgml
# parseableMimeTypes: specifies mime-types that can be indexed with any built-in parser
parseableMimeTypes=application/atom+xml,application/bzip2,application/excel,application/gzip,application/java-archive,application/msexcel,application/mspowerpoint,application/msword,application/octet-stream,application/pdf,application/postscript,application/powerpoint,application/rdf+xml,application/rss+xml,application/rtf,application/tar,application/vcard,application/visio,application/visio.drawing,application/vnd.ms-excel,application/vnd.ms-powerpoint,application/vnd.oasis.opendocument.text,application/vnd.visio,application/vsd,application/x-7z-compressed,application/x-bz2,application/x-bzip2,application/x-compress,application/x-compressed,application/x-excel,application/x-gzip,application/x-msexcel,application/x-redhat packet manager,application/x-redhat-package-manager,application/x-rpm,application/x-shockwave-flash,application/x-shockwave-flash2-preview,application/x-tar,application/x-visio,application/x-vnd.oasis.opendocument.text,application/x-vsd,application/x-xml,application/x-zip,application/x-zip-compressed,application/xml,application/zip,image/x-vsd,text/postscript,text/rss,text/rtf,text/x-vcard,text/xml,zz-application/zz-winassoc-vsd
parseableMimeTypes.CRAWLER=application/atom+xml,application/bzip2,application/excel,application/gzip,application/java-archive,application/msexcel,application/mspowerpoint,application/msword,application/octet-stream,application/pdf,application/postscript,application/powerpoint,application/rdf+xml,application/rss+xml,application/rtf,application/tar,application/vcard,application/visio,application/visio.drawing,application/vnd.ms-excel,application/vnd.ms-powerpoint,application/vnd.oasis.opendocument.text,application/vnd.visio,application/vsd,application/x-7z-compressed,application/x-bz2,application/x-bzip2,application/x-compress,application/x-compressed,application/x-excel,application/x-gzip,application/x-msexcel,application/x-redhat packet manager,application/x-redhat-package-manager,application/x-rpm,application/x-shockwave-flash,application/x-shockwave-flash2-preview,application/x-tar,application/x-visio,application/x-vnd.oasis.opendocument.text,application/x-vsd,application/x-xml,application/x-zip,application/x-zip-compressed,application/xml,application/zip,image/x-vsd,text/postscript,text/rss,text/rtf,text/x-vcard,text/xml,zz-application/zz-winassoc-vsd
parseableMimeTypes.PROXY=application/atom+xml,application/bzip2,application/excel,application/gzip,application/java-archive,application/msexcel,application/mspowerpoint,application/msword,application/octet-stream,application/pdf,application/postscript,application/powerpoint,application/rdf+xml,application/rss+xml,application/rtf,application/tar,application/vcard,application/visio,application/visio.drawing,application/vnd.ms-excel,application/vnd.ms-powerpoint,application/vnd.oasis.opendocument.text,application/vnd.visio,application/vsd,application/x-7z-compressed,application/x-bz2,application/x-bzip2,application/x-compress,application/x-compressed,application/x-excel,application/x-gzip,application/x-msexcel,application/x-redhat packet manager,application/x-redhat-package-manager,application/x-rpm,application/x-shockwave-flash,application/x-shockwave-flash2-preview,application/x-tar,application/x-visio,application/x-vnd.oasis.opendocument.text,application/x-vsd,application/x-xml,application/x-zip,application/x-zip-compressed,application/xml,application/zip,image/x-vsd,text/postscript,text/rss,text/rtf,text/x-vcard,text/xml,zz-application/zz-winassoc-vsd
parseableMimeTypes.ICAP=application/atom+xml,application/bzip2,application/excel,application/gzip,application/java-archive,application/msexcel,application/mspowerpoint,application/msword,application/octet-stream,application/pdf,application/postscript,application/powerpoint,application/rdf+xml,application/rss+xml,application/rtf,application/tar,application/vcard,application/visio,application/visio.drawing,application/vnd.ms-excel,application/vnd.ms-powerpoint,application/vnd.oasis.opendocument.text,application/vnd.visio,application/vsd,application/x-7z-compressed,application/x-bz2,application/x-bzip2,application/x-compress,application/x-compressed,application/x-excel,application/x-gzip,application/x-msexcel,application/x-redhat packet manager,application/x-redhat-package-manager,application/x-rpm,application/x-shockwave-flash,application/x-shockwave-flash2-preview,application/x-tar,application/x-visio,application/x-vnd.oasis.opendocument.text,application/x-vsd,application/x-xml,application/x-zip,application/x-zip-compressed,application/xml,application/zip,image/x-vsd,text/postscript,text/rss,text/rtf,text/x-vcard,text/xml,zz-application/zz-winassoc-vsd
parseableMimeTypes.URLREDIRECTOR=application/atom+xml,application/bzip2,application/excel,application/gzip,application/java-archive,application/msexcel,application/mspowerpoint,application/msword,application/octet-stream,application/pdf,application/postscript,application/powerpoint,application/rdf+xml,application/rss+xml,application/rtf,application/tar,application/vcard,application/visio,application/visio.drawing,application/vnd.ms-excel,application/vnd.ms-powerpoint,application/vnd.oasis.opendocument.text,application/vnd.visio,application/vsd,application/x-7z-compressed,application/x-bz2,application/x-bzip2,application/x-compress,application/x-compressed,application/x-excel,application/x-gzip,application/x-msexcel,application/x-redhat packet manager,application/x-redhat-package-manager,application/x-rpm,application/x-shockwave-flash,application/x-shockwave-flash2-preview,application/x-tar,application/x-visio,application/x-vnd.oasis.opendocument.text,application/x-vsd,application/x-xml,application/x-zip,application/x-zip-compressed,application/xml,application/zip,image/x-vsd,text/postscript,text/rss,text/rtf,text/x-vcard,text/xml,zz-application/zz-winassoc-vsd
# parseableMimeTypes.IMAGE: specifies mime-types that refer to image type content
parseableMimeTypes.IMAGE=image/gif,image/jpeg,image/png,image/tiff,image/vnd.wap.wbmp,image/x-icon,image/bmp
# parseableMimeTypes.HTML: specifies mime-types that can be indexed with built-in html parser
parseableMimeTypes.HTML=application/xhtml+xml,text/html,text/plain,text/sgml
# media extension string
# a comma-separated list of extensions that denote media file formats
# this is important to recognize <a href> - tags as not-html reference

@ -33,7 +33,6 @@ import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
@ -458,54 +457,34 @@ public class SettingsAck_p {
*/
if (post.containsKey("parserSettings")) {
post.remove("parserSettings");
/*
final Set<String> parserModes = ParserDispatcher.getParserConfigList().keySet();
final HashMap<String, HashSet<String>> newConfigList = new HashMap<String, HashSet<String>>();
Iterator<String> parserModeIter = parserModes.iterator();
while (parserModeIter.hasNext()) {
final String currParserMode = parserModeIter.next();
newConfigList.put(currParserMode, new HashSet<String>());
}
// looping through all received settings
int pos;
final HashSet<String> newConfig = new HashSet<String>();
// loop through all received settings
final Iterator<String> keyEnum = post.keySet().iterator();
while (keyEnum.hasNext()) {
final String key = keyEnum.next();
if ((pos = key.indexOf(".")) != -1) {
final String currParserMode = key.substring(0,pos).trim().toUpperCase();
final String currMimeType = key.substring(pos+1).replaceAll("\n", "");
if (parserModes.contains(currParserMode)) {
HashSet<String> currEnabledMimeTypes;
assert (newConfigList.containsKey(currParserMode)) : "Unexpected Error";
currEnabledMimeTypes = newConfigList.get(currParserMode);
currEnabledMimeTypes.add(currMimeType);
}
}
String key = keyEnum.next();
if (key.startsWith("mimename")) newConfig.add(post.get(key));
}
int enabledMimesCount = 0;
final StringBuilder currEnabledMimesTxt = new StringBuilder();
parserModeIter = newConfigList.keySet().iterator();
while (parserModeIter.hasNext()) {
final String currParserMode = parserModeIter.next();
final String[] enabledMimes = ParserDispatcher.setEnabledParserList(newConfigList.get(currParserMode));
Arrays.sort(enabledMimes);
currEnabledMimesTxt.setLength(0);
for (int i=0; i < enabledMimes.length; i++) {
currEnabledMimesTxt.append(enabledMimes[i]).append(",");
prop.put("info_parser_" + enabledMimesCount + "_parserMode",currParserMode);
prop.put("info_parser_" + enabledMimesCount + "_enabledMime",enabledMimes[i]);
enabledMimesCount++;
}
if (currEnabledMimesTxt.length() > 0) currEnabledMimesTxt.deleteCharAt(currEnabledMimesTxt.length()-1);
env.setConfig("parseableMimeTypes." + currParserMode,currEnabledMimesTxt.toString());
final String[] enabledMimes = ParserDispatcher.setEnabledParserList(newConfig);
Arrays.sort(enabledMimes);
currEnabledMimesTxt.setLength(0);
for (int i=0; i < enabledMimes.length; i++) {
currEnabledMimesTxt.append(enabledMimes[i]).append(",");
prop.put("info_parser_" + enabledMimesCount + "_enabledMime", newConfig.toString());
enabledMimesCount++;
}
if (currEnabledMimesTxt.length() > 0) currEnabledMimesTxt.deleteCharAt(currEnabledMimesTxt.length()-1);
env.setConfig("parseableMimeTypes", currEnabledMimesTxt.toString());
prop.put("info_parser",enabledMimesCount);
prop.put("info", "18");
return prop;
*/
}
// Crawler settings

@ -6,31 +6,27 @@
<a href="http://www.iana.org/assignments/media-types/">http://www.iana.org/assignments/media-types/</a>
</p>
<table border="0" cellpadding="2" cellspacing="1">
<tr class="TableHeader" valign="bottom">#{parserMode}#
<td class="small" >#[name]#</td>#{/parserMode}#
<tr class="TableHeader" valign="bottom">
<td class="small" >enable/disable Parser</td>
<td class="small" >Mime-Type</td>
<td class="small" >Parser&nbsp;Usage</td>
</tr>#{parser}#
<tr class="TableCellDark">
<td colspan="#[colspan]#">#[name]# V#[version]#</td>
<td colspan="#[colspan]#">#[name]#</td>
<td>&nbsp;</td>
<td>#[usage]#</td>
</tr>#{mime}#
<tr class="TableCellLight">#{parserMode}#
<td class="small" align="center"><input type="checkbox" name="#[optionName]#" #(status)#::checked="checked" #(/status)#/></td>#{/parserMode}#
<tr class="TableCellLight">
<td class="small" align="center"><input type="checkbox" mimename.#[mimetype]#="#[mimetype]#" #(status)#::checked="checked" #(/status)#/></td>
<td class="small">#[mimetype]#</td>
<td class="small">&nbsp;</td>
</tr>#{/mime}#
#{/parser}#
<tr class="TableCellDark">#{parserMode}#
<tr class="TableCellDark">
<td class="small" align="center">
<input type="checkbox" name="#[name]#.allParserEnabled" onclick="javascript: ParserCheckboxes(this);" #(allParserEnabled)#::checked="checked" #(/allParserEnabled)#/>
</td>#{/parserMode}#
<td>&nbsp;</td>
</td>
<td colspan="2" class="small">Enable all parsers</td>
</tr>
<tr class="TableCellDark">
<td colspan="#[parser.colspan]#" class="small" ><input type="submit" name="parserSettings" value="Submit" /> Changes take effect immediately</td>
<td colspan="2" class="small" ><input type="submit" name="parserSettings" value="Submit" /> Changes take effect immediately</td>
</tr>
</table>
</fieldset>

@ -26,11 +26,10 @@
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import de.anomic.document.Parser;
import de.anomic.document.ParserDispatcher;
import de.anomic.document.ParserConfig;
import de.anomic.http.httpHeader;
import de.anomic.http.httpRequestHeader;
import de.anomic.plasma.plasmaSwitchboard;
@ -217,47 +216,19 @@ public final class Settings_p {
/*
* Parser Configuration
*/
/*
final HashMap<String, plasmaParserConfig> configList = ParserDispatcher.getParserConfigList();
final plasmaParserConfig[] configArray = configList.values().toArray(new plasmaParserConfig[configList.size()]);
final HashSet<ParserInfo> parserInfos = new HashSet<ParserInfo>(ParserDispatcher.getAvailableParserList().values());
// // fetching a list of all available mimetypes
// List availableParserKeys = Arrays.asList(availableParsers.entrySet().toArray(new ParserInfo[availableParsers.size()]));
//
// // sort it
// Collections.sort(availableParserKeys);
// loop through the mimeTypes and add it to the properties
final boolean[] allParsersEnabled = new boolean[configList.size()];
for (int i=0; i<configArray.length; i++)
allParsersEnabled[i] = true;
int parserIdx = 0;
final Iterator<ParserInfo> availableParserIter = parserInfos.iterator();
final Iterator<Parser> availableParserIter = ParserDispatcher.availableParserList.values().iterator();
while (availableParserIter.hasNext()) {
final ParserInfo parserInfo = availableParserIter.next();
prop.put("parser_" + parserIdx + "_name", parserInfo.parserName);
prop.putXML("parser_" + parserIdx + "_version", parserInfo.parserVersionNr);
prop.put("parser_" + parserIdx + "_usage", parserInfo.usageCount);
prop.put("parser_" + parserIdx + "_colspan", configArray.length);
final Parser parserInfo = availableParserIter.next();
prop.put("parser_" + parserIdx + "_name", parserInfo.getName());
int mimeIdx = 0;
final Enumeration<String> mimeTypeIter = parserInfo.supportedMimeTypes.keys();
final Enumeration<String> mimeTypeIter = parserInfo.getSupportedMimeTypes().keys();
while (mimeTypeIter.hasMoreElements()) {
final String mimeType = mimeTypeIter.nextElement();
prop.put("parser_" + parserIdx + "_mime_" + mimeIdx + "_mimetype", mimeType);
//prop.put("parser_" + parserIdx + "_name", parserName);
//prop.put("parser_" + parserIdx + "_shortname", parserName.substring(parserName.lastIndexOf(".")+1));
for (int i=0; i<configArray.length; i++) {
final HashSet<String> enabledParsers = configArray[i].getEnabledParserList();
prop.put("parser_" + parserIdx + "_mime_" + mimeIdx + "_parserMode_" + i + "_optionName", configArray[i].parserMode + "." + mimeType);
prop.put("parser_" + parserIdx + "_mime_" + mimeIdx + "_parserMode_" + i + "_status", enabledParsers.contains(mimeType) ? "1" : "0");
allParsersEnabled[i] &= enabledParsers.contains(mimeType);
}
prop.put("parser_" + parserIdx + "_mime_" + mimeIdx + "_parserMode", configArray.length);
prop.put("parser_" + parserIdx + "_mime_" + mimeIdx + "_status", (ParserDispatcher.supportedMimeTypesContains(mimeType)) ? 1 : 0);
mimeIdx++;
}
prop.put("parser_" + parserIdx + "_mime", mimeIdx);
@ -265,14 +236,8 @@ public final class Settings_p {
parserIdx++;
}
for (int i=0; i<configArray.length; i++) {
prop.put("parserMode_" + i + "_name",configArray[i].parserMode);
prop.put("parserMode_" + i + "_allParserEnabled",allParsersEnabled[i] ? "1" : "0");
}
prop.put("parserMode",configArray.length);
prop.put("parser", parserIdx);
prop.put("parser.colspan", configArray.length+2);
*/
// Crawler settings
prop.putHTML("crawler.clientTimeout",sb.getConfig("crawler.clientTimeout", "10000"));
prop.putHTML("crawler.http.maxFileSize",sb.getConfig("crawler.http.maxFileSize", "-1"));

@ -45,11 +45,6 @@ import de.anomic.yacy.logging.Log;
*/
public abstract class AbstractParser implements Parser {
/**
* a list of library names that are needed by this parser
*/
protected String[] libxDependencies = null;
/**
* the logger class that should be used by the parser module for logging
* purposes.
@ -70,9 +65,8 @@ public abstract class AbstractParser implements Parser {
/**
* The Constructor of this class.
*/
public AbstractParser(final String[] libxDependencies) {
public AbstractParser() {
super();
this.libxDependencies = libxDependencies;
}
/**
@ -230,14 +224,6 @@ public abstract class AbstractParser implements Parser {
*/
public abstract Document parse(yacyURL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException;
/**
* @return Returns a list of library names that are needed by this parser
* @see de.anomic.document.Parser#getLibxDependences()
*/
public String[] getLibxDependences() {
return this.libxDependencies;
}
/**
* Return the name of the parser
*/

@ -7,11 +7,7 @@ import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
@ -26,6 +22,7 @@ import java.util.Set;
import de.anomic.document.parser.bzipParser;
import de.anomic.document.parser.docParser;
import de.anomic.document.parser.gzipParser;
import de.anomic.document.parser.htmlParser;
import de.anomic.document.parser.mimeTypeParser;
import de.anomic.document.parser.odtParser;
import de.anomic.document.parser.pdfParser;
@ -41,17 +38,13 @@ import de.anomic.document.parser.vcfParser;
import de.anomic.document.parser.vsdParser;
import de.anomic.document.parser.xlsParser;
import de.anomic.document.parser.zipParser;
import de.anomic.document.parser.html.ContentScraper;
import de.anomic.document.parser.html.ImageEntry;
import de.anomic.document.parser.html.ScraperInputStream;
import de.anomic.document.parser.html.TransformerWriter;
import de.anomic.kelondro.util.FileUtils;
import de.anomic.yacy.yacyURL;
import de.anomic.yacy.logging.Log;
public final class ParserDispatcher {
private static final ParserConfig parserConfig = new ParserConfig();
public static final ParserConfig parserConfig = new ParserConfig();
/**
* A list containing all installed parsers and the mimeType that they support
@ -91,30 +84,6 @@ public final class ParserDispatcher {
private static final HashSet<String> videoExtSet = new HashSet<String>();
private static final HashSet<String> appsExtSet = new HashSet<String>();
/**
* This {@link FilenameFilter} is used to find all classes, based on their filenames,
* which seem to be additional content parsers.
* Currently the filenames of all content parser classes must end with <code>Parser.class</code>
*/
/*
private static final FilenameFilter parserFileNameFilter = new FilenameFilter() {
public boolean accept(File dir, String name) {
return name.endsWith("Parser.class");
}
};
*/
/**
* This {@link FileFilter} is used to get all subpackages
* of the parser package.
*/
/*
private static final FileFilter parserDirectoryFilter = new FileFilter() {
public boolean accept(File file) {
return file.isDirectory();
}
};
*/
/**
* Initializing the
@ -146,9 +115,6 @@ public final class ParserDispatcher {
private static final Log theLogger = new Log("PARSER");
public Log getLogger() {
return theLogger;
}
/**
* This function is used to initialize the HTMLParsableMimeTypes List.
@ -187,60 +153,43 @@ public final class ParserDispatcher {
}
}
public static void initImageExt(final List<String> imageExtList) {
private static void initImageExt(final List<String> imageExtList) {
synchronized (imageExtSet) {
imageExtSet.addAll(imageExtList);
}
}
public static void initAudioExt(final List<String> audioExtList) {
private static void initAudioExt(final List<String> audioExtList) {
synchronized (audioExtSet) {
audioExtSet.addAll(audioExtList);
}
}
public static void initVideoExt(final List<String> videoExtList) {
private static void initVideoExt(final List<String> videoExtList) {
synchronized (videoExtSet) {
videoExtSet.addAll(videoExtList);
}
}
public static void initAppsExt(final List<String> appsExtList) {
private static void initAppsExt(final List<String> appsExtList) {
synchronized (appsExtSet) {
appsExtSet.addAll(appsExtList);
}
}
public static String getMediaExtList() {
synchronized (mediaExtSet) {
return mediaExtSet.toString();
}
}
public static void initSupportedHTMLFileExt(final List<String> supportedRealtimeFileExtList) {
synchronized (supportedHTMLFileExt) {
supportedHTMLFileExt.addAll(supportedRealtimeFileExtList);
}
}
public static boolean HTMLParsableMimeTypesContains(String mimeType) {
private static boolean HTMLParsableMimeTypesContains(String mimeType) {
mimeType = normalizeMimeType(mimeType);
synchronized (supportedHTMLMimeTypes) {
return supportedHTMLMimeTypes.contains(mimeType);
}
}
public static boolean supportedHTMLContent(final yacyURL url, final String mimeType) {
return HTMLParsableMimeTypesContains(mimeType) && supportedHTMLFileExtContains(url);
}
public static boolean supportedHTMLFileExtContains(final yacyURL url) {
final String fileExt = getFileExt(url);
synchronized (supportedHTMLFileExt) {
return supportedHTMLFileExt.contains(fileExt);
}
}
public static String getFileExt(final yacyURL url) {
// getting the file path
String name = url.getPath();
@ -300,81 +249,6 @@ public final class ParserDispatcher {
}
}
/**
 * Some html authors use wrong encoding names, either because they don't know exactly what they
 * are doing or because they produce a typo. Many times, the upper/lowercase scheme of the name
 * is fuzzy. This method patches wrong encoding names. The correct names are taken from
 * http://www.iana.org/assignments/character-sets
 *
 * @param encoding the encoding name as found in the document or header, may be <code>null</code>
 * @return the patched encoding name; the name of the system default charset if
 *         <code>encoding</code> is <code>null</code> or shorter than three characters after trimming
 */
public static String patchCharsetEncoding(String encoding) {
    // nothing given: return the system default encoding
    if (encoding == null) return Charset.defaultCharset().name();

    // trim first, so that surrounding whitespace does not count towards the minimum length
    encoding = encoding.trim();
    if (encoding.length() < 3) return Charset.defaultCharset().name();

    // fix upper/lowercase; use a fixed locale because the default locale may map
    // 'i' to a dotted capital I (e.g. Turkish), which would break all prefix tests below
    encoding = encoding.toUpperCase(java.util.Locale.ROOT);
    if (encoding.startsWith("SHIFT")) return "Shift_JIS";
    if (encoding.startsWith("BIG")) return "Big5";
    // all other names but such with "windows" use uppercase
    if (encoding.startsWith("WINDOWS")) encoding = "windows" + encoding.substring(7);
    if (encoding.startsWith("MACINTOSH")) encoding = "MacRoman";

    // fix wrong fill characters
    encoding = encoding.replaceAll("_", "-");
    if (encoding.matches("GB[_-]?2312([-_]80)?")) return "GB2312";
    if (encoding.matches(".*UTF[-_]?8.*")) return "UTF-8";
    if (encoding.startsWith("US")) return "US-ASCII";
    if (encoding.startsWith("KOI")) return "KOI8-R";

    // patch missing '-'
    if (encoding.startsWith("windows") && encoding.length() > 7) {
        final char c = encoding.charAt(7);
        if ((c >= '0') && (c <= '9')) {
            encoding = "windows-" + encoding.substring(7);
        }
    }
    if (encoding.startsWith("ISO")) {
        // patch typos: missing '-' after "ISO"
        if (encoding.length() > 3) {
            final char c = encoding.charAt(3);
            if ((c >= '0') && (c <= '9')) {
                encoding = "ISO-" + encoding.substring(3);
            }
        }
        // missing '-' between the "8859" family number and the part number;
        // restricted to the 8859 family (and its popular typo 8559) so that names
        // like ISO-IR-100 or ISO-10646-UCS-2 are not torn apart
        if ((encoding.startsWith("ISO-8859") || encoding.startsWith("ISO-8559")) && encoding.length() > 8) {
            final char c = encoding.charAt(8);
            if ((c >= '0') && (c <= '9')) {
                encoding = encoding.substring(0, 8) + "-" + encoding.substring(8);
            }
        }
    }

    // patch wrong name
    if (encoding.startsWith("ISO-8559")) {
        // popular typo
        encoding = "ISO-8859" + encoding.substring(8);
    }

    // converting cp\d{4} -> windows-\d{4}
    if (encoding.matches("CP([_-])?125[0-8]")) {
        final char c = encoding.charAt(2);
        if ((c >= '0') && (c <= '9')) {
            encoding = "windows-" + encoding.substring(2);
        } else {
            // the separator is already part of the remainder
            encoding = "windows" + encoding.substring(2);
        }
    }
    return encoding;
}
public static String normalizeMimeType(String mimeType) {
//if (mimeType == null) doMimeTypeAnalysis
if (mimeType == null) mimeType = "application/octet-stream";
@ -519,7 +393,7 @@ public final class ParserDispatcher {
// getting the charset of the document
// TODO: do a charset detection here ....
final String documentCharset = patchCharsetEncoding(theDocumentCharset);
final String documentCharset = htmlParser.patchCharsetEncoding(theDocumentCharset);
// testing if parsing is supported for this resource
if (!supportedContent(location,mimeType)) {
@ -543,7 +417,7 @@ public final class ParserDispatcher {
// parse the resource
doc = theParser.parse(location, mimeType,documentCharset,sourceStream);
} else if (HTMLParsableMimeTypesContains(mimeType)) {
doc = parseHtml(location, mimeType, documentCharset, sourceStream);
doc = new htmlParser().parse(location, mimeType, documentCharset, sourceStream);
} else {
final String errorMsg = "No parser available to parse mimetype '" + mimeType + "' (2)";
theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg);
@ -558,17 +432,6 @@ public final class ParserDispatcher {
}
return doc;
} catch (final UnsupportedEncodingException e) {
final String errorMsg = "unsupported charset encoding: " + e.getMessage();
theLogger.logSevere("Unable to parse '" + location + "'. " + errorMsg, e);
throw new ParserException(errorMsg,location, errorMsg);
} catch (final IOException e) {
// IOExceptions may occur during html parsing when a server closes the connection during reading.
// This may happen here, because the html parser is a streaming parser
// that produces surrogates while the connection is active
final String errorMsg = "IOException - server may have closed the connection. " + e.getMessage();
theLogger.logWarning("Unable to parse '" + location + "'. " + errorMsg);
throw new ParserException(errorMsg, location, errorMsg);
} catch (final Exception e) {
// Interrupted- and Parser-Exceptions should pass through
if (e instanceof InterruptedException) throw (InterruptedException) e;
@ -586,71 +449,8 @@ public final class ParserDispatcher {
}
}
private static Document parseHtml(
final yacyURL location,
final String mimeType,
final String documentCharset,
final InputStream sourceStream) throws IOException, ParserException {
// make a scraper and transformer
final ScraperInputStream htmlFilter = new ScraperInputStream(sourceStream,documentCharset,location,null,false);
String charset = htmlFilter.detectCharset();
if (charset == null) {
charset = documentCharset;
} else {
charset = patchCharsetEncoding(charset);
}
if (!documentCharset.equalsIgnoreCase(charset)) {
theLogger.logInfo("Charset transformation needed from '" + documentCharset + "' to '" + charset + "' for URL = " + location.toNormalform(true, true));
}
Charset c;
try {
c = Charset.forName(charset);
} catch (IllegalCharsetNameException e) {
c = Charset.defaultCharset();
} catch (UnsupportedCharsetException e) {
c = Charset.defaultCharset();
}
// parsing the content
final ContentScraper scraper = new ContentScraper(location);
final TransformerWriter writer = new TransformerWriter(null,null,scraper,null,false);
FileUtils.copy(htmlFilter, writer, c);
writer.close();
//OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false);
//serverFileUtils.copy(sourceFile, hfos);
//hfos.close();
if (writer.binarySuspect()) {
final String errorMsg = "Binary data found in resource";
theLogger.logSevere("Unable to parse '" + location + "'. " + errorMsg);
throw new ParserException(errorMsg,location);
}
return transformScraper(location, mimeType, documentCharset, scraper);
}
public static Document transformScraper(final yacyURL location, final String mimeType, final String charSet, final ContentScraper scraper) {
final String[] sections = new String[scraper.getHeadlines(1).length + scraper.getHeadlines(2).length + scraper.getHeadlines(3).length + scraper.getHeadlines(4).length];
int p = 0;
for (int i = 1; i <= 4; i++) for (int j = 0; j < scraper.getHeadlines(i).length; j++) sections[p++] = scraper.getHeadlines(i)[j];
final Document ppd = new Document(
location,
mimeType,
charSet,
scraper.getContentLanguages(),
scraper.getKeywords(),
scraper.getTitle(),
scraper.getAuthor(),
sections,
scraper.getDescription(),
scraper.getText(),
scraper.getAnchors(),
scraper.getImages());
//scraper.close();
ppd.setFavicon(scraper.getFavicon());
return ppd;
}
/**
* This function is used to determine the parser class that should be used for a given

@ -60,14 +60,8 @@ public class bzipParser extends AbstractParser implements Parser {
SUPPORTED_MIME_TYPES.put("application/x-stuffit",fileExtensions);
}
/**
* a list of library names that are needed by this parser
* @see Parser#getLibxDependences()
*/
private static final String[] LIBX_DEPENDENCIES = new String[] {};
public bzipParser() {
super(LIBX_DEPENDENCIES);
super();
this.parserName = "Bzip 2 UNIX Compressed File Parser";
}

@ -58,16 +58,8 @@ public class docParser extends AbstractParser implements Parser {
SUPPORTED_MIME_TYPES.put("application/x-msword","doc");
}
/**
* a list of library names that are needed by this parser
* @see Parser#getLibxDependences()
*/
private static final String[] LIBX_DEPENDENCIES = new String[] {
"tm-extractors-1.0.jar"
};
public docParser() {
super(LIBX_DEPENDENCIES);
super();
this.parserName = "Word Document Parser";
}

@ -62,14 +62,8 @@ public class gzipParser extends AbstractParser implements Parser {
SUPPORTED_MIME_TYPES.put("application/x-tar",fileExtensions);
}
/**
* a list of library names that are needed by this parser
* @see Parser#getLibxDependences()
*/
private static final String[] LIBX_DEPENDENCIES = new String[] {};
public gzipParser() {
super(LIBX_DEPENDENCIES);
super();
this.parserName = "GNU Zip Compressed Archive Parser";
}

@ -45,7 +45,7 @@ import java.util.Properties;
import javax.swing.event.EventListenerList;
import de.anomic.crawler.HTTPLoader;
import de.anomic.document.ParserDispatcher;
import de.anomic.document.parser.htmlParser;
import de.anomic.http.httpClient;
import de.anomic.http.httpHeader;
import de.anomic.http.httpRequestHeader;
@ -501,7 +501,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
// scrape document to look up charset
final ScraperInputStream htmlFilter = new ScraperInputStream(new ByteArrayInputStream(page),"UTF-8",new yacyURL("http://localhost", null),null,false);
final String charset = ParserDispatcher.patchCharsetEncoding(htmlFilter.detectCharset());
final String charset = htmlParser.patchCharsetEncoding(htmlFilter.detectCharset());
// scrape content
final ContentScraper scraper = new ContentScraper(new yacyURL("http://localhost", null));

@ -0,0 +1,195 @@
package de.anomic.document.parser;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.util.Hashtable;
import de.anomic.document.AbstractParser;
import de.anomic.document.Document;
import de.anomic.document.Parser;
import de.anomic.document.ParserException;
import de.anomic.document.parser.html.ContentScraper;
import de.anomic.document.parser.html.ScraperInputStream;
import de.anomic.document.parser.html.TransformerWriter;
import de.anomic.kelondro.util.FileUtils;
import de.anomic.yacy.yacyURL;
public class htmlParser extends AbstractParser implements Parser {
/**
 * The mime types handled by this parser class, each mapped to a comma-separated
 * list of file extensions.
 * @see #getSupportedMimeTypes()
 */
public static final Hashtable<String, String> SUPPORTED_MIME_TYPES = new Hashtable<String, String>();
static {
    // one row per mime type: { mime type, comma-separated file extensions }
    final String[][] mimeTypeTable = {
        {"application/xhtml+xml", "htm,html,xhtml,php,asp"},
        {"text/html", "htm,html,xhtml,php,asp"},
        {"text/plain", "htm,html,xhtml,php,asp,txt"},
        {"text/sgml", "htm,html,xhtml,php,asp,xml"}
    };
    for (final String[] row : mimeTypeTable) {
        SUPPORTED_MIME_TYPES.put(row[0], row[1]);
    }
}
/**
 * Creates the parser and sets the inherited <code>parserName</code> field.
 */
public htmlParser() {
    // the no-argument superclass constructor is invoked implicitly
    parserName = "streaming html parser";
}
/**
 * Parses an html resource from a stream: the stream is wrapped into a
 * {@link ScraperInputStream}, copied through a {@link TransformerWriter} into a
 * {@link ContentScraper}, and the scraper result is converted into a {@link Document}.
 *
 * @param location the URL of the resource
 * @param mimeType the mime type that is stored in the resulting document
 * @param documentCharset the charset declared for the resource; may be <code>null</code>,
 *        in which case the charset found by the scraper stream or the system default is used
 * @param sourceStream the content of the resource
 * @return the parsed document; it carries <code>documentCharset</code>, not the charset
 *         that was used for decoding
 * @throws ParserException if reading the stream fails or the writer suspects binary content
 * @throws InterruptedException declared by the parser interface
 */
@Override
public Document parse(
        final yacyURL location,
        final String mimeType,
        final String documentCharset,
        final InputStream sourceStream) throws ParserException, InterruptedException {

    // make a scraper and transformer
    final ScraperInputStream htmlFilter = new ScraperInputStream(sourceStream,documentCharset,location,null,false);
    String charset = null;
    try {
        charset = htmlFilter.detectCharset();
    } catch (IOException e1) {
        throw new ParserException("Charset error:" + e1.getMessage(), location);
    }
    if (charset == null) {
        charset = documentCharset;
    } else {
        charset = patchCharsetEncoding(charset);
    }

    // neither the stream nor the caller named a charset: fall back to the system
    // default instead of failing with a NullPointerException below
    if (charset == null) {
        charset = Charset.defaultCharset().name();
    }

    // only report a transformation if there is a declared charset to transform from
    if (documentCharset != null && !documentCharset.equalsIgnoreCase(charset)) {
        theLogger.logInfo("Charset transformation needed from '" + documentCharset + "' to '" + charset + "' for URL = " + location.toNormalform(true, true));
    }

    // unknown or malformed charset names fall back to the system default
    Charset c;
    try {
        c = Charset.forName(charset);
    } catch (IllegalCharsetNameException e) {
        c = Charset.defaultCharset();
    } catch (UnsupportedCharsetException e) {
        c = Charset.defaultCharset();
    }

    // parsing the content
    final ContentScraper scraper = new ContentScraper(location);
    final TransformerWriter writer = new TransformerWriter(null,null,scraper,null,false);
    try {
        FileUtils.copy(htmlFilter, writer, c);
        writer.close();
    } catch (IOException e) {
        throw new ParserException("IO error:" + e.getMessage(), location);
    }
    //OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false);
    //serverFileUtils.copy(sourceFile, hfos);
    //hfos.close();
    if (writer.binarySuspect()) {
        final String errorMsg = "Binary data found in resource";
        theLogger.logSevere("Unable to parse '" + location + "'. " + errorMsg);
        throw new ParserException(errorMsg,location);
    }
    return transformScraper(location, mimeType, documentCharset, scraper);
}
/**
 * Builds a {@link Document} from the content that a scraper has collected.
 * The headlines of the levels 1 to 4 are concatenated, in level order, into the section list
 * of the document. The favicon of the scraper is copied to the document.
 *
 * @param location the URL of the document
 * @param mimeType the mime type of the document
 * @param charSet the charset name that is stored in the document
 * @param scraper the scraper that has already processed the document content
 * @return the new document
 */
private static Document transformScraper(final yacyURL location, final String mimeType, final String charSet, final ContentScraper scraper) {
    // fetch every headline level exactly once: asking the scraper again in the loop condition
    // and for every single element repeats the work and relies on identical results per call
    final String[][] headlines = new String[4][];
    int sectionCount = 0;
    for (int level = 1; level <= 4; level++) {
        headlines[level - 1] = scraper.getHeadlines(level);
        sectionCount += headlines[level - 1].length;
    }

    // concatenate the levels in ascending order
    final String[] sections = new String[sectionCount];
    int p = 0;
    for (int i = 0; i < headlines.length; i++) {
        System.arraycopy(headlines[i], 0, sections, p, headlines[i].length);
        p += headlines[i].length;
    }

    final Document ppd = new Document(
            location,
            mimeType,
            charSet,
            scraper.getContentLanguages(),
            scraper.getKeywords(),
            scraper.getTitle(),
            scraper.getAuthor(),
            sections,
            scraper.getDescription(),
            scraper.getText(),
            scraper.getAnchors(),
            scraper.getImages());
    ppd.setFavicon(scraper.getFavicon());
    return ppd;
}
/**
 * Some html authors use wrong encoding names, either because they don't know exactly what they
 * are doing or because they produce a typo. Many times, the upper/lowercase scheme of the name is fuzzy.
 * This method patches wrong encoding names. The correct names are taken from
 * http://www.iana.org/assignments/character-sets
 * <p>
 * The result does not depend on the default locale of the JVM.
 *
 * @param encoding the encoding name as found in the document; may be <code>null</code>
 * @return the patched encoding name; the name of the system default charset if the given name
 *         is <code>null</code> or, after trimming, shorter than three characters
 */
public static String patchCharsetEncoding(String encoding) {

    // trim first, so that whitespace-only names are handled like missing names
    if (encoding != null) encoding = encoding.trim();

    // return the system default encoding
    if ((encoding == null) || (encoding.length() < 3)) return Charset.defaultCharset().name();

    // fix upper/lowercase; a fixed locale is required because e.g. a Turkish default locale
    // maps 'i' to a dotted capital I and none of the prefix tests below would match
    encoding = encoding.toUpperCase(Locale.ENGLISH);
    if (encoding.startsWith("SHIFT")) return "Shift_JIS";
    if (encoding.startsWith("BIG")) return "Big5";

    // all other names but such with "windows" use uppercase
    if (encoding.startsWith("WINDOWS")) encoding = "windows" + encoding.substring(7);
    if (encoding.startsWith("MACINTOSH")) encoding = "MacRoman";

    // fix wrong fill characters
    encoding = encoding.replace('_', '-');

    if (encoding.matches("GB[_-]?2312([-_]80)?")) return "GB2312";
    if (encoding.matches(".*UTF[-_]?8.*")) return "UTF-8";
    if (encoding.startsWith("US")) return "US-ASCII";
    if (encoding.startsWith("KOI")) return "KOI8-R";

    // patch missing '-'
    if (encoding.startsWith("windows") && encoding.length() > 7 && isAsciiDigit(encoding.charAt(7))) {
        encoding = "windows-" + encoding.substring(7);
    }

    if (encoding.startsWith("ISO")) {
        // patch missing '-' after "ISO"
        if (encoding.length() > 3 && isAsciiDigit(encoding.charAt(3))) {
            encoding = "ISO-" + encoding.substring(3);
        }
        // patch missing '-' in front of the part number; this is only valid for the 8859 family
        // (and its popular typo 8559), other names like ISO-10646-UCS-2 or ISO-IR-100 have a
        // legitimate digit at this position and must not be touched
        if ((encoding.startsWith("ISO-8859") || encoding.startsWith("ISO-8559"))
                && encoding.length() > 8 && isAsciiDigit(encoding.charAt(8))) {
            encoding = encoding.substring(0, 8) + "-" + encoding.substring(8);
        }
    }

    // patch wrong name
    if (encoding.startsWith("ISO-8559")) {
        // popular typo
        encoding = "ISO-8859" + encoding.substring(8);
    }

    // converting cp\d{4} -> windows-\d{4}
    if (encoding.matches("CP([_-])?125[0-8]")) {
        if (isAsciiDigit(encoding.charAt(2))) {
            encoding = "windows-" + encoding.substring(2);
        } else {
            // the separator is already part of the remainder
            encoding = "windows" + encoding.substring(2);
        }
    }

    return encoding;
}

/**
 * @param c the character to test
 * @return <code>true</code> if the character is one of '0' to '9'
 */
private static boolean isAsciiDigit(final char c) {
    return (c >= '0') && (c <= '9');
}
/**
 * Returns the mime types that this parser class supports.
 * The returned table is the shared static {@link #SUPPORTED_MIME_TYPES} instance, not a copy.
 *
 * @return the table of supported mime types and the extension list stored for each of them
 */
public Hashtable<String, String> getSupportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}
}

@ -64,17 +64,6 @@ public class mimeTypeParser extends AbstractParser implements Parser {
SUPPORTED_MIME_TYPES.put("application/x-compressed","");
}
/**
* a list of library names that are needed by this parser
* @see Parser#getLibxDependences()
*/
private static final String[] LIBX_DEPENDENCIES = new String[] {
"commons-logging-1.1.1.jar",
"jmimemagic-0.1.0.jar",
"jakarta-oro-2.0.7.jar",
"log4j-1.2.9.jar"
};
/**
* Helping structure used to detect loops in the mimeType detection
* process
@ -82,7 +71,7 @@ public class mimeTypeParser extends AbstractParser implements Parser {
private static Hashtable<Thread, Integer> threadLoopDetection = new Hashtable<Thread, Integer>();
public mimeTypeParser() {
super(LIBX_DEPENDENCIES);
super();
this.parserName = "MimeType Parser";
}

@ -69,14 +69,8 @@ public class odtParser extends AbstractParser implements Parser {
SUPPORTED_MIME_TYPES.put("application/x-vnd.oasis.opendocument.text","odt");
}
/**
* a list of library names that are needed by this parser
* @see Parser#getLibxDependences()
*/
private static final String[] LIBX_DEPENDENCIES = new String[] {"odf_utils_05_11_29.jar"};
public odtParser() {
super(LIBX_DEPENDENCIES);
super();
this.parserName = "OASIS OpenDocument V2 Text Document Parser";
}

@ -66,16 +66,8 @@ public class pdfParser extends AbstractParser implements Parser {
SUPPORTED_MIME_TYPES.put("text/x-pdf","pdf");
}
/**
* a list of library names that are needed by this parser
* @see Parser#getLibxDependences()
*/
private static final String[] LIBX_DEPENDENCIES = new String[] {
"PDFBox-0.7.3.jar", "FontBox-0.1.0-dev.jar", "bcprov-jdk14-139.jar", "bcmail-jdk14-139.jar"
};
public pdfParser() {
super(LIBX_DEPENDENCIES);
super();
this.parserName = "Acrobat Portable Document Parser";
}

@ -58,17 +58,8 @@ public class pptParser extends AbstractParser implements Parser {
SUPPORTED_MIME_TYPES.put("application/x-m",fileExtensions);
}
/**
* a list of library names that are needed by this parser
* @see Parser#getLibxDependences()
*/
private static final String[] LIBX_DEPENDENCIES = new String[] {
"poi-3.2-FINAL-20081019.jar",
"poi-scratchpad-3.2-FINAL-20081019.jar"
};
public pptParser(){
super(LIBX_DEPENDENCIES);
super();
this.parserName = "Microsoft Powerpoint Parser";
}

@ -57,18 +57,12 @@ public class psParser extends AbstractParser implements Parser {
SUPPORTED_MIME_TYPES.put("application/x-postscript-not-eps","ps");
}
/**
* a list of library names that are needed by this parser
* @see Parser#getLibxDependences()
*/
private static final String[] LIBX_DEPENDENCIES = new String[] {};
private final static Object modeScan = new Object();
private static boolean modeScanDone = false;
private static String parserMode = "java";
public psParser() {
super(LIBX_DEPENDENCIES);
super();
this.parserName = "PostScript Document Parser";
if (!modeScanDone) synchronized (modeScan) {
if (testForPs2Ascii()) parserMode = "ps2ascii";

@ -64,14 +64,8 @@ public class rpmParser extends AbstractParser implements Parser {
SUPPORTED_MIME_TYPES.put("application/x-redhat-package-manager","rpm");
}
/**
* a list of library names that are needed by this parser
* @see Parser#getLibxDependences()
*/
private static final String[] LIBX_DEPENDENCIES = new String[] {"jrpm-head.jar"};
public rpmParser() {
super(LIBX_DEPENDENCIES);
super();
this.parserName = "rpm Parser";
}

@ -68,14 +68,8 @@ public class rssParser extends AbstractParser implements Parser {
SUPPORTED_MIME_TYPES.put("application/atom+xml",fileExtensions);
}
/**
* a list of library names that are needed by this parser
* @see Parser#getLibxDependences()
*/
private static final String[] LIBX_DEPENDENCIES = new String[] {};
public rssParser() {
super(LIBX_DEPENDENCIES);
super();
this.parserName = "Rich Site Summary/Atom Feed Parser";
}

@ -56,14 +56,8 @@ public class rtfParser extends AbstractParser implements Parser {
SUPPORTED_MIME_TYPES.put("application/x-soffice","rtf");
}
/**
* a list of library names that are needed by this parser
* @see Parser#getLibxDependences()
*/
private static final String[] LIBX_DEPENDENCIES = new String[] {};
public rtfParser() {
super(LIBX_DEPENDENCIES);
super();
this.parserName = "Rich Text Format Parser";
}

@ -61,14 +61,8 @@ public class sevenzipParser extends AbstractParser implements Parser {
SUPPORTED_MIME_TYPES.put("application/x-7z-compressed", "7z");
}
/**
* a list of library names that are needed by this parser
* @see Parser#getLibxDependences()
*/
private static final String[] LIBX_DEPENDENCIES = new String[] { "J7Zip-modified.jar" };
public sevenzipParser() {
super(LIBX_DEPENDENCIES);
super();
super.parserName = "7zip Archive Parser";
}

@ -52,14 +52,8 @@ public class swfParser extends AbstractParser implements Parser {
SUPPORTED_MIME_TYPES.put("image/vnd.rn-realflash","swf");
}
/**
* a list of library names that are needed by this parser
* @see Parser#getLibxDependences()
*/
private static final String[] LIBX_DEPENDENCIES = new String[] {"webcat-0.1-swf.jar"};
public swfParser() {
super(LIBX_DEPENDENCIES);
super();
this.parserName = "Adobe Flash Parser";
}

@ -69,16 +69,8 @@ public class tarParser extends AbstractParser implements Parser {
SUPPORTED_MIME_TYPES.put("application/x-compressed","tar");
}
/**
* a list of library names that are needed by this parser
* @see Parser#getLibxDependences()
*/
private static final String[] LIBX_DEPENDENCIES = new String[] {
// "tar.jar"
};
public tarParser() {
super(LIBX_DEPENDENCIES);
super();
this.parserName = "Tape Archive File Parser";
}

@ -72,14 +72,8 @@ public class vcfParser extends AbstractParser implements Parser {
SUPPORTED_MIME_TYPES.put("text/x-vcalendar","vcf");
}
/**
* a list of library names that are needed by this parser
* @see Parser#getLibxDependences()
*/
private static final String[] LIBX_DEPENDENCIES = new String[] {};
public vcfParser() {
super(LIBX_DEPENDENCIES);
super();
this.parserName = "vCard Parser";
}

@ -56,17 +56,8 @@ public class vsdParser extends AbstractParser implements Parser {
SUPPORTED_MIME_TYPES.put("zz-application/zz-winassoc-vsd","vsd");
}
/**
* a list of library names that are needed by this parser
* @see Parser#getLibxDependences()
*/
private static final String[] LIBX_DEPENDENCIES = new String[] {
"poi-3.2-FINAL-20081019.jar",
"poi-scratchpad-3.2-FINAL-20081019.jar",
};
public vsdParser() {
super(LIBX_DEPENDENCIES);
super();
this.parserName = "Microsoft Visio Parser";
}

@ -69,17 +69,8 @@ public class xlsParser extends AbstractParser implements Parser, HSSFListener {
SUPPORTED_MIME_TYPES.put("application/xls","xls");
}
/**
* a list of library names that are needed by this parser
* @see Parser#getLibxDependences()
*/
private static final String[] LIBX_DEPENDENCIES = new String[] {
"poi-3.2-FINAL-20081019.jar",
"poi-scratchpad-3.2-FINAL-20081019.jar"
};
public xlsParser(){
super(LIBX_DEPENDENCIES);
super();
this.parserName = "Microsoft Excel Parser";
}

@ -69,14 +69,8 @@ public class zipParser extends AbstractParser implements Parser {
SUPPORTED_MIME_TYPES.put("application/java-archive","jar");
}
/**
* a list of library names that are needed by this parser
* @see Parser#getLibxDependences()
*/
private static final String[] LIBX_DEPENDENCIES = new String[] {};
public zipParser() {
super(LIBX_DEPENDENCIES);
super();
this.parserName = "Compressed Archive File Parser";
}

@ -81,6 +81,7 @@ import java.util.concurrent.ConcurrentHashMap;
import java.util.zip.GZIPOutputStream;
import de.anomic.document.ParserDispatcher;
import de.anomic.document.parser.htmlParser;
import de.anomic.document.parser.html.ContentScraper;
import de.anomic.document.parser.html.ScraperInputStream;
import de.anomic.kelondro.util.ByteBuffer;
@ -860,7 +861,7 @@ public final class httpdFileHandler {
fis.mark(1000);
// scrape document to look up charset
final ScraperInputStream htmlFilter = new ScraperInputStream(fis,"UTF-8",new yacyURL("http://localhost", null),null,false);
final String charset = ParserDispatcher.patchCharsetEncoding(htmlFilter.detectCharset());
final String charset = htmlParser.patchCharsetEncoding(htmlFilter.detectCharset());
if(charset != null)
mimeType = mimeType + "; charset="+charset;
// reset position

@ -523,7 +523,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
// define a realtime parsable mimetype list
log.logConfig("Parser: Initializing Mime Types");
ParserDispatcher.initHTMLParsableMimeTypes(getConfig(plasmaSwitchboardConstants.PARSER_MIMETYPES_HTML, "application/xhtml+xml,text/html,text/plain"));
ParserDispatcher.addParseableMimeTypes(getConfig(plasmaSwitchboardConstants.PARSER_MIMETYPES_CRAWLER, null));
ParserDispatcher.addParseableMimeTypes(getConfig(plasmaSwitchboardConstants.PARSER_MIMETYPES, null));
// start a loader
log.logConfig("Starting Crawl Loader");

@ -244,11 +244,8 @@ public final class plasmaSwitchboardConstants {
public static final String RANKING_DIST_1_METHOD = "CRDist1Method";
public static final String RANKING_DIST_1_PERCENT = "CRDist1Percent";
public static final String RANKING_DIST_1_TARGET = "CRDist1Target";
public static final String PARSER_MIMETYPES = "parseableMimeTypes";
public static final String PARSER_MIMETYPES_HTML = "parseableMimeTypes.HTML";
public static final String PARSER_MIMETYPES_PROXY = "parseableMimeTypes.PROXY";
public static final String PARSER_MIMETYPES_CRAWLER = "parseableMimeTypes.CRAWLER";
public static final String PARSER_MIMETYPES_ICAP = "parseableMimeTypes.ICAP";
public static final String PARSER_MIMETYPES_URLREDIRECTOR = "parseableMimeTypes.URLREDIRECTOR";
public static final String PARSER_MIMETYPES_IMAGE = "parseableMimeTypes.IMAGE";
public static final String PARSER_MEDIA_EXT = "mediaExt";
public static final String PARSER_MEDIA_EXT_PARSEABLE = "parseableExt";

@ -56,8 +56,6 @@ import de.anomic.kelondro.util.SortStack;
import de.anomic.kelondro.util.FileUtils;
import de.anomic.plasma.plasmaProfiling;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaProfiling.searchEvent;
import de.anomic.search.QueryEvent.ResultEntry;
import de.anomic.server.serverProfiling;
import de.anomic.yacy.yacyURL;

@ -233,12 +233,6 @@ public class migration {
// migration for additional parser settings
String value = "";
if (((value = sb.getConfig("parseableMimeTypes","")).length() > 0) && (sb.getConfig("parseableMimeTypes.CRAWLER", "").length() == 0)) {
sb.setConfig("parseableMimeTypes.CRAWLER", value);
sb.setConfig("parseableMimeTypes.PROXY", value);
sb.setConfig("parseableMimeTypes.URLREDIRECTOR", value);
sb.setConfig("parseableMimeTypes.ICAP", value);
}
//Locales in DATA, because DATA must be writable, htroot not.
if(sb.getConfig("locale.translated_html", "DATA/LOCALE/htroot").equals("htroot/locale")){
sb.setConfig("locale.translated_html", "DATA/LOCALE/htroot");

Loading…
Cancel
Save