diff --git a/defaults/yacy.init b/defaults/yacy.init
index 6624a162e..791381c24 100644
--- a/defaults/yacy.init
+++ b/defaults/yacy.init
@@ -247,23 +247,9 @@ releases = DATA/RELEASE
minimumLocalDelta = 0
minimumGlobalDelta = 500
-# the following mime-types are the whitelist for indexing
-#
-# parseableMime: specifies mime-types that can be indexed with any built-in parser
-parseableMimeTypes=application/atom+xml,application/bzip2,application/excel,application/gzip,application/java-archive,application/msexcel,application/mspowerpoint,application/msword,application/octet-stream,application/pdf,application/postscript,application/powerpoint,application/rdf+xml,application/rss+xml,application/rtf,application/tar,application/vcard,application/visio,application/visio.drawing,application/vnd.ms-excel,application/vnd.ms-powerpoint,application/vnd.oasis.opendocument.text,application/vnd.visio,application/vsd,application/x-7z-compressed,application/x-bz2,application/x-bzip2,application/x-compress,application/x-compressed,application/x-excel,application/x-gzip,application/x-msexcel,application/x-redhat packet manager,application/x-redhat-package-manager,application/x-rpm,application/x-shockwave-flash,application/x-shockwave-flash2-preview,application/x-tar,application/x-visio,application/x-vnd.oasis.opendocument.text,application/x-vsd,application/x-xml,application/x-zip,application/x-zip-compressed,application/xml,application/zip,image/x-vsd,text/postscript,text/rss,text/rtf,text/x-vcard,text/xml,zz-application/zz-winassoc-vsd
-
-# parseableMimeTypes.IMAGE: specifies mime-types that refer to image type content
-parseableMimeTypes.IMAGE=image/gif,image/jpeg,image/png,image/tiff,image/vnd.wap.wbmp,image/x-icon,image/bmp
-
-# parseableMimeTypes.HTML: specifies mime-types that can be indexed with built-in html parser
-parseableMimeTypes.HTML=application/xhtml+xml,text/html,text/plain,text/sgml
-
-# media extension string
-# a comma-separated list of extensions that denote media file formats
-# this is important to recognize - tags as not-html reference
-# These files will be excluded from indexing _(Please keep extensions in alphabetical order)_
-mediaExt=7z,ace,aif,aiff,arj,asf,asx,avi,bin,bmp,bz2,css,db,dcm,deb,doc,dll,dmg,exe,gif,gz,hqx,ico,img,iso,jar,jpe,jpg,jpeg,lx,lxl,m4v,mpeg,mov,mp3,mpg,ogg,png,pdf,ppt,ps,ram,rar,rm,rpm,scr,sit,so,swf,sxc,sxd,sxi,sxw,tar,tbz,tgz,torrent,vsd,war,wav,wmv,xcf,xls,zip
-parseableExt=html,htm,txt,php,shtml,asp,aspx,jsp
+# the following mime-types are a blacklist for indexing:
+# parser.mime.deny: specifies mime-types that shall not be indexed
+parser.mime.deny=
# Promotion Strings
# These strings appear in the Web Mask of the YACY search client
diff --git a/htroot/SettingsAck_p.java b/htroot/SettingsAck_p.java
index 027f5a644..909ad984f 100644
--- a/htroot/SettingsAck_p.java
+++ b/htroot/SettingsAck_p.java
@@ -29,15 +29,13 @@
import java.net.InetSocketAddress;
import java.net.SocketException;
-import java.util.Arrays;
import java.util.HashMap;
-import java.util.HashSet;
import java.util.Iterator;
import java.util.StringTokenizer;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
-import de.anomic.document.Classification;
+import de.anomic.document.Parser;
import de.anomic.http.httpRequestHeader;
import de.anomic.http.httpRemoteProxyConfig;
import de.anomic.http.httpd;
@@ -46,6 +44,7 @@ import de.anomic.kelondro.order.Base64Order;
import de.anomic.kelondro.order.Digest;
import de.anomic.kelondro.util.DateFormatter;
import de.anomic.plasma.plasmaSwitchboard;
+import de.anomic.plasma.plasmaSwitchboardConstants;
import de.anomic.server.serverCore;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
@@ -458,30 +457,16 @@ public class SettingsAck_p {
if (post.containsKey("parserSettings")) {
post.remove("parserSettings");
- final HashSet newConfig = new HashSet();
-
// loop through all received settings
final Iterator keyEnum = post.keySet().iterator();
while (keyEnum.hasNext()) {
String key = keyEnum.next();
- if (key.startsWith("mimename")) newConfig.add(post.get(key));
+ if (key.startsWith("mimename")) Parser.grantMime(key.substring(9), post.get(key).equals("on"));
}
- int enabledMimesCount = 0;
- final StringBuilder currEnabledMimesTxt = new StringBuilder();
- final String[] enabledMimes = Classification.setEnabledParserList(newConfig);
- Arrays.sort(enabledMimes);
-
- currEnabledMimesTxt.setLength(0);
- for (int i=0; i < enabledMimes.length; i++) {
- currEnabledMimesTxt.append(enabledMimes[i]).append(",");
- prop.put("info_parser_" + enabledMimesCount + "_enabledMime", newConfig.toString());
- enabledMimesCount++;
- }
- if (currEnabledMimesTxt.length() > 0) currEnabledMimesTxt.deleteCharAt(currEnabledMimesTxt.length()-1);
- env.setConfig("parseableMimeTypes", currEnabledMimesTxt.toString());
+ env.setConfig(plasmaSwitchboardConstants.PARSER_MIME_DENY, Parser.getDenyMime());
- prop.put("info_parser",enabledMimesCount);
+ prop.put("info_parser", 0);
prop.put("info", "18");
return prop;
diff --git a/htroot/Settings_p.java b/htroot/Settings_p.java
index 40857bf38..ccfb51925 100644
--- a/htroot/Settings_p.java
+++ b/htroot/Settings_p.java
@@ -24,11 +24,9 @@
// javac -classpath .:../Classes Settings_p.java
// if the shell's current path is HTROOT
-import java.util.Enumeration;
import java.util.HashMap;
import java.util.Iterator;
-import de.anomic.document.Classification;
import de.anomic.document.Idiom;
import de.anomic.document.Parser;
import de.anomic.http.httpHeader;
@@ -219,17 +217,15 @@ public final class Settings_p {
*/
int parserIdx = 0;
- final Iterator availableParserIter = Parser.availableParserList.values().iterator();
+ final Iterator availableParserIter = Parser.idioms().iterator();
while (availableParserIter.hasNext()) {
final Idiom parserInfo = availableParserIter.next();
prop.put("parser_" + parserIdx + "_name", parserInfo.getName());
int mimeIdx = 0;
- final Enumeration mimeTypeIter = parserInfo.getSupportedMimeTypes().keys();
- while (mimeTypeIter.hasMoreElements()) {
- final String mimeType = mimeTypeIter.nextElement();
+ for (String mimeType: parserInfo.getSupportedMimeTypes().keySet()) {
prop.put("parser_" + parserIdx + "_mime_" + mimeIdx + "_mimetype", mimeType);
- prop.put("parser_" + parserIdx + "_mime_" + mimeIdx + "_status", (Classification.supportedMimeTypesContains(mimeType)) ? 1 : 0);
+ prop.put("parser_" + parserIdx + "_mime_" + mimeIdx + "_status", (Parser.supportsMime(mimeType)) ? 1 : 0);
mimeIdx++;
}
prop.put("parser_" + parserIdx + "_mime", mimeIdx);
diff --git a/source/de/anomic/crawler/FTPLoader.java b/source/de/anomic/crawler/FTPLoader.java
index d32d4d663..f501c247b 100644
--- a/source/de/anomic/crawler/FTPLoader.java
+++ b/source/de/anomic/crawler/FTPLoader.java
@@ -32,7 +32,7 @@ import java.io.IOException;
import java.io.PrintStream;
import java.util.Date;
-import de.anomic.document.Classification;
+import de.anomic.document.Parser;
import de.anomic.http.httpHeader;
import de.anomic.http.httpRequestHeader;
import de.anomic.http.httpResponseHeader;
@@ -218,15 +218,24 @@ public class FTPLoader {
private httpDocument getFile(final ftpc ftpClient, final CrawlEntry entry) throws Exception {
// determine the mimetype of the resource
final yacyURL entryUrl = entry.url();
- final String extension = Classification.getFileExt(entryUrl);
- final String mimeType = Classification.getMimeTypeByFileExt(extension);
+ final String mimeType = Parser.mimeOf(entryUrl);
final String path = getPath(entryUrl);
// if the mimetype and file extension is supported we start to download
// the file
httpDocument htCache = null;
- if (Classification.supportedContent(entryUrl, mimeType)) {
- // aborting download if content is too long
+ if (!Parser.supportsExtension(entryUrl)) {
+ // reject the file: Parser.supportsExtension does not accept the extension of this URL
+ log.logInfo("REJECTED WRONG EXTENSION TYPE " + entryUrl.getFileExtension() + " for URL " + entry.url().toString());
+ sb.crawlQueues.errorURL.newEntry(entry, this.sb.peers.mySeed().hash, new Date(), 1, "wrong extension");
+ throw new Exception("response has not the right extension type -> rejected");
+ } else if (!Parser.supportsMime(mimeType)) {
+ // reject the file: Parser.supportsMime does not accept the mime type derived from the URL
+ log.logInfo("REJECTED WRONG MIME TYPE " + mimeType + " for URL " + entry.url().toString());
+ sb.crawlQueues.errorURL.newEntry(entry, this.sb.peers.mySeed().hash, new Date(), 1, "wrong mime type");
+ throw new Exception("response has not the right mime type -> rejected");
+ } else {
+ // abort the download if content is too long
final int size = ftpClient.fileSize(path);
if (size <= maxFileSize || maxFileSize == -1) {
// timeout for download
@@ -246,11 +255,6 @@ public class FTPLoader {
sb.crawlQueues.errorURL.newEntry(entry, this.sb.peers.mySeed().hash, new Date(), 1, "file size limit exceeded");
throw new Exception("file size exceeds limit");
}
- } else {
- // if the response has not the right file type then reject file
- log.logInfo("REJECTED WRONG MIME/EXT TYPE " + mimeType + " for URL " + entry.url().toString());
- sb.crawlQueues.errorURL.newEntry(entry, this.sb.peers.mySeed().hash, new Date(), 1, "wrong mime type or wrong extension");
- throw new Exception("response has not the right file type -> rejected");
}
return htCache;
}
diff --git a/source/de/anomic/crawler/HTTPLoader.java b/source/de/anomic/crawler/HTTPLoader.java
index 55dbd60d4..04dbe3431 100644
--- a/source/de/anomic/crawler/HTTPLoader.java
+++ b/source/de/anomic/crawler/HTTPLoader.java
@@ -29,7 +29,7 @@ import java.io.IOException;
import java.util.Date;
import de.anomic.data.Blacklist;
-import de.anomic.document.Classification;
+import de.anomic.document.Parser;
import de.anomic.http.httpClient;
import de.anomic.http.httpHeader;
import de.anomic.http.httpResponse;
@@ -156,8 +156,15 @@ public final class HTTPLoader {
// request has been placed and result has been returned. work off response
//try {
- if (Classification.supportedContent(entry.url(), res.getResponseHeader().mime())) {
-
+ if (!Parser.supportsExtension(entry.url())) {
+ // reject the response if the file extension of the URL is not supported
+ sb.crawlQueues.errorURL.newEntry(entry, sb.peers.mySeed().hash, new Date(), 1, "wrong extension");
+ throw new IOException("REJECTED WRONG EXTENSION TYPE " + entry.url().getFileExtension()+ " for URL " + entry.url().toString());
+ } else if (!Parser.supportsMime(res.getResponseHeader().mime())) {
+ // reject the response if the mime type from the response header is not supported
+ sb.crawlQueues.errorURL.newEntry(entry, sb.peers.mySeed().hash, new Date(), 1, "wrong mime type");
+ throw new IOException("REJECTED WRONG MIME TYPE " + res.getResponseHeader().mime() + " for URL " + entry.url().toString());
+ } else {
// get the content length and check if the length is allowed
long contentLength = res.getResponseHeader().getContentLength();
if (maxFileSize >= 0 && contentLength > maxFileSize) {
@@ -177,10 +184,6 @@ public final class HTTPLoader {
}
htCache.setCacheArray(responseBody);
- } else {
- // if the response has not the right file type then reject file
- sb.crawlQueues.errorURL.newEntry(entry, sb.peers.mySeed().hash, new Date(), 1, "wrong mime type or wrong extension");
- throw new IOException("REJECTED WRONG MIME/EXT TYPE " + res.getResponseHeader().mime() + " for URL " + entry.url().toString());
}
return htCache;
/*
diff --git a/source/de/anomic/document/AbstractParser.java b/source/de/anomic/document/AbstractParser.java
index cf79945e9..4276b6ec0 100644
--- a/source/de/anomic/document/AbstractParser.java
+++ b/source/de/anomic/document/AbstractParser.java
@@ -54,7 +54,7 @@ public abstract class AbstractParser implements Idiom {
/**
* Parser name
*/
- protected String parserName = this.getClass().getName();
+ private String parserName;
/**
* The source file file size in bytes if the source document was passed
@@ -65,7 +65,9 @@ public abstract class AbstractParser implements Idiom {
/**
* The Constructor of this class.
*/
- public AbstractParser() {
+ public AbstractParser(String name) {
super();
+ // store the name passed in by the subclass; the parserName field no longer has an initializer
+ this.parserName = name;
}
@@ -125,10 +125,7 @@ public abstract class AbstractParser implements Idiom {
// XXX: workaround for relative paths within document
+ file.getPath().substring(file.getPath().indexOf(File.separatorChar) + 1)
+ "/" + file.getName());
- final Document subdoc = Parser.parseSource(
- url,
- Classification.getMimeTypeByFileExt(files[i].substring(files[i].indexOf('.') + 1)),
- null, file);
+ final Document subdoc = Parser.parseSource(url, Parser.mimeOf(url), null, file);
// TODO: change anchors back to use '#' after archive name
doc.addSubDocument(subdoc);
subdoc.close();
diff --git a/source/de/anomic/document/Classification.java b/source/de/anomic/document/Classification.java
index 59c662490..7ea9aeb51 100644
--- a/source/de/anomic/document/Classification.java
+++ b/source/de/anomic/document/Classification.java
@@ -2,8 +2,6 @@
// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 09.07.2009 on http://yacy.net
//
-// This is a part of YaCy, a peer-to-peer based web search engine
-//
// $LastChangedDate: 2009-03-20 16:44:59 +0100 (Fr, 20 Mrz 2009) $
// $LastChangedRevision: 5736 $
// $LastChangedBy: borg-0300 $
@@ -30,38 +28,25 @@ import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
-import java.util.Arrays;
import java.util.HashSet;
-import java.util.Hashtable;
-import java.util.Iterator;
-import java.util.LinkedList;
-import java.util.List;
import java.util.Properties;
import java.util.Set;
-import de.anomic.yacy.yacyURL;
-import de.anomic.yacy.logging.Log;
-
public class Classification {
- public static final HashSet supportedHTMLFileExt = new HashSet();
- public static final HashSet supportedHTMLMimeTypes = new HashSet();
-
private static final HashSet mediaExtSet = new HashSet();
private static final HashSet imageExtSet = new HashSet();
private static final HashSet audioExtSet = new HashSet();
private static final HashSet videoExtSet = new HashSet();
private static final HashSet appsExtSet = new HashSet();
- private static final Properties mimeTypeLookupByFileExt = new Properties();
- public final static HashSet enabledParserList = new HashSet();
- private final static HashSet supportedFileExt = new HashSet();
+ private static final Properties ext2mime = new Properties();
static {
// load a list of extensions from file
BufferedInputStream bufferedIn = null;
try {
- mimeTypeLookupByFileExt.load(bufferedIn = new BufferedInputStream(new FileInputStream(new File("httpd.mime"))));
+ ext2mime.load(bufferedIn = new BufferedInputStream(new FileInputStream(new File("httpd.mime"))));
} catch (final IOException e) {
System.err.println("ERROR: httpd.mime not found in settings path");
} finally {
@@ -70,219 +55,46 @@ public class Classification {
} catch (final Exception e) {}
}
- final String apps = "sit,hqx,img,dmg,exe,com,bat,sh,vbs,zip,jar";
- final String audio = "mp2,mp3,ogg,aac,aif,aiff,wav";
- final String video = "swf,avi,wmv,rm,mov,mpg,mpeg,ram,m4v";
- final String image = "jpg,jpeg,jpe,gif,png,ico,bmp";
+ final String apps = "7z,ace,arc,arj,asf,asx,bat,bin,bkf,bz2,cab,com,css,dcm,deb,dll,dmg,exe,gho,ghs,gz,hqx,img,iso,jar,lha,rar,sh,sit,sitx,tar,tbz,tgz,tib,torrent,vbs,war,zip";
+ final String audio = "aac,aif,aiff,flac,m4a,m4p,mid,mp2,mp3,oga,ogg,ram,wav,wma";
+ final String video = "3g2,3gp,3gp2,3gpp,3gpp2,3ivx,asf,asx,avi,div,divx,dv,dvx,env,f4v,flv,hdmov,m1v,m4v,m-jpeg,moov,mov,movie,mp2v,mp4,mpe,mpeg,mpg,mpg4,mv4,ogm,ogv,qt,rm,rv,vid,swf,wmv";
+ final String image = "ai,bmp,cdr,cmx,emf,eps,gif,img,jpeg,jpg,mng,pct,pdd,pdn,pict,png,psb,psd,psp,tif,tiff,wmf";
- imageExtSet.addAll(extString2extList(image)); // image formats
- audioExtSet.addAll(extString2extList(audio)); // audio formats
- videoExtSet.addAll(extString2extList(video)); // video formats
- appsExtSet.addAll(extString2extList(apps)); // application formats
-
- initMediaExt(extString2extList(apps + "," + // application container
- "tar,gz,bz2,arj,zip,rar," + // archive formats
- "ps,xls,ppt,asf," + // text formats without support
- audio + "," + // audio formats
- video + "," + // video formats
- image // image formats
- ));
+ addSet(imageExtSet, image); // image formats
+ addSet(audioExtSet, audio); // audio formats
+ addSet(videoExtSet, video); // video formats
+ addSet(appsExtSet, apps); // application formats
+ addSet(mediaExtSet, apps + "," + audio + "," + video + "," + image); // all media formats
}
- public static List extString2extList(final String extString) {
- final LinkedList extensions = new LinkedList();
- if ((extString == null) || (extString.length() == 0)) {
- return extensions;
- }
- final String[] xs = extString.split(",");
- for (int i = 0; i < xs.length; i++)
- extensions.add(xs[i].toLowerCase().trim());
- return extensions;
+ private static void addSet(Set set, final String extString) {
+ if ((extString == null) || (extString.length() == 0)) return;
+ for (String s: extString.split(",")) set.add(s.toLowerCase().trim());
}
- public static void initMediaExt(final List mediaExtList) {
- mediaExtSet.addAll(mediaExtList);
- }
-
- public static boolean mediaExtContains(String mediaExt) {
+ public static boolean isMediaExtension(String mediaExt) {
if (mediaExt == null) return false;
- mediaExt = mediaExt.trim().toLowerCase();
-
- if (supportedHTMLFileExt.contains(mediaExt)) return false;
-
- if (supportedFileExtContains(mediaExt)) return false;
-
- return mediaExtSet.contains(mediaExt);
+ return mediaExtSet.contains(mediaExt.trim().toLowerCase());
}
- public static boolean imageExtContains(final String imageExt) {
+ public static boolean isImageExtension(final String imageExt) {
if (imageExt == null) return false;
return imageExtSet.contains(imageExt.trim().toLowerCase());
}
- public static boolean audioExtContains(final String audioExt) {
+ public static boolean isAudioExtension(final String audioExt) {
if (audioExt == null) return false;
return audioExtSet.contains(audioExt.trim().toLowerCase());
}
- public static boolean videoExtContains(final String videoExt) {
+ public static boolean isVideoExtension(final String videoExt) {
if (videoExt == null) return false;
return videoExtSet.contains(videoExt.trim().toLowerCase());
}
- public static boolean appsExtContains(final String appsExt) {
+ public static boolean isApplicationExtension(final String appsExt) {
if (appsExt == null) return false;
return appsExtSet.contains(appsExt.trim().toLowerCase());
}
- public static void initHTMLParsableMimeTypes(
- final String htmlParsableMimeTypes) {
- final LinkedList mimeTypes = new LinkedList();
- if ((htmlParsableMimeTypes == null) || (htmlParsableMimeTypes.length() == 0)) {
- return;
- }
- final String[] realtimeParsableMimeTypeList = htmlParsableMimeTypes
- .split(",");
- for (int i = 0; i < realtimeParsableMimeTypeList.length; i++) {
- mimeTypes.add(realtimeParsableMimeTypeList[i].toLowerCase().trim());
- }
- supportedHTMLMimeTypes.addAll(mimeTypes);
- }
-
- public static String normalizeMimeType(String mimeType) {
- // if (mimeType == null) doMimeTypeAnalysis
- if (mimeType == null) mimeType = "application/octet-stream";
- mimeType = mimeType.trim().toLowerCase();
-
- final int pos = mimeType.indexOf(';');
- return ((pos < 0) ? mimeType : mimeType.substring(0, pos));
- }
-
- public static String getMimeTypeByFileExt(final String fileExt) {
- return mimeTypeLookupByFileExt.getProperty(fileExt, "application/octet-stream");
- }
-
- public static void initSupportedHTMLFileExt(final List supportedRealtimeFileExtList) {
- supportedHTMLFileExt.addAll(supportedRealtimeFileExtList);
- }
-
- static boolean HTMLParsableMimeTypesContains(String mimeType) {
- mimeType = normalizeMimeType(mimeType);
- return supportedHTMLMimeTypes.contains(mimeType);
- }
-
- public static boolean supportedContent(final yacyURL url, String mimeType) {
- mimeType = Classification.normalizeMimeType(mimeType);
- if (
- mimeType.equals("text/html") ||
- mimeType.equals("application/xhtml+xml") ||
- mimeType.equals("text/plain")
- ) {
- return supportedMimeTypesContains(mimeType);
- }
- return supportedMimeTypesContains(mimeType) && supportedFileExt(url);
- }
-
- public static boolean supportedMimeTypesContains(String mimeType) {
- mimeType = Classification.normalizeMimeType(mimeType);
-
- if (Classification.supportedHTMLMimeTypes.contains(mimeType)) return true;
- return enabledParserList.contains(mimeType);
- }
-
- private static boolean supportedFileExt(final yacyURL url) {
- if (url == null) throw new NullPointerException();
-
- // getting the file path
- final String name = getFileExt(url);
- return supportedFileExtContains(name);
- }
-
- public static boolean supportedFileExtContains(String fileExt) {
- if (fileExt == null) return false;
- fileExt = fileExt.trim().toLowerCase();
- if (Classification.supportedHTMLFileExt.contains(fileExt)) return true;
-
- return supportedFileExt.contains(fileExt);
- }
-
- public static void addParseableMimeTypes(final String enabledMimeTypes) {
- HashSet mimeTypes = null;
- if ((enabledMimeTypes == null) || (enabledMimeTypes.length() == 0)) {
- mimeTypes = new HashSet();
- } else {
- final String[] enabledMimeTypeList = enabledMimeTypes.split(",");
- mimeTypes = new HashSet(enabledMimeTypeList.length);
- for (int i = 0; i < enabledMimeTypeList.length; i++) mimeTypes.add(enabledMimeTypeList[i].toLowerCase().trim());
- }
- setEnabledParserList(mimeTypes);
- }
-
- public static void enableAllParsers() {
- final Set availableMimeTypes = Parser.availableParserList.keySet();
- setEnabledParserList(availableMimeTypes);
- }
-
- public static String[] setEnabledParserList(final Set mimeTypeSet) {
-
- final HashSet newEnabledParsers = new HashSet();
- final HashSet newSupportedFileExt = new HashSet();
-
- if (mimeTypeSet != null) {
- final Iterator mimeTypes = mimeTypeSet.iterator();
- while (mimeTypes.hasNext()) {
- final String mimeType = mimeTypes.next();
- Idiom theParser = Parser.availableParserList.get(mimeType);
- if (theParser != null) {
- try {
- // getting a list of mimeTypes that the parser supports
- final Hashtable parserSupportsMimeTypes = theParser.getSupportedMimeTypes();
- if (parserSupportsMimeTypes != null) {
- final Object supportedExtensions = parserSupportsMimeTypes.get(mimeType);
- if ((supportedExtensions != null) &&
- (supportedExtensions instanceof String) &&
- (((String)supportedExtensions).length() > 0)) {
- final String[] extArray = ((String)supportedExtensions).split(",");
- newSupportedFileExt.addAll(Arrays.asList(extArray));
- }
- }
- newEnabledParsers.add(mimeType);
-
- } catch (final Exception e) {
- Log.logSevere("PARSER", "error in setEnabledParserList", e);
- } finally {
- if (theParser != null)
- theParser = null; // destroy object
- }
- }
- }
- }
-
- enabledParserList.addAll(newEnabledParsers);
- supportedFileExt.addAll(newSupportedFileExt);
-
- return newEnabledParsers.toArray(new String[newEnabledParsers.size()]);
- }
-
- @SuppressWarnings("unchecked")
- public static HashSet getEnabledParserList() {
- return (HashSet) enabledParserList.clone();
- }
-
- public static String getFileExt(final yacyURL url) {
- // getting the file path
- String name = url.getPath();
-
- // tetermining last position of / in the file path
- int p = name.lastIndexOf('/');
- if (p != -1) {
- name = name.substring(p);
- }
-
- // termining last position of . in file path
- p = name.lastIndexOf('.');
- if (p < 0)
- return "";
- return name.substring(p + 1);
- }
}
diff --git a/source/de/anomic/document/Document.java b/source/de/anomic/document/Document.java
index f991befaf..d6a3d144c 100644
--- a/source/de/anomic/document/Document.java
+++ b/source/de/anomic/document/Document.java
@@ -374,14 +374,14 @@ dc_rights
} else {
ext = u.substring(extpos + 1).toLowerCase();
}
- if (Classification.mediaExtContains(ext)) {
+ if (Classification.isMediaExtension(ext)) {
// this is not a normal anchor, its a media link
- if (Classification.imageExtContains(ext)) {
+ if (Classification.isImageExtension(ext)) {
ContentScraper.addImage(collectedImages, new ImageEntry(url, entry.getValue(), -1, -1));
}
- else if (Classification.audioExtContains(ext)) audiolinks.put(url, entry.getValue());
- else if (Classification.videoExtContains(ext)) videolinks.put(url, entry.getValue());
- else if (Classification.appsExtContains(ext)) applinks.put(url, entry.getValue());
+ else if (Classification.isAudioExtension(ext)) audiolinks.put(url, entry.getValue());
+ else if (Classification.isVideoExtension(ext)) videolinks.put(url, entry.getValue());
+ else if (Classification.isApplicationExtension(ext)) applinks.put(url, entry.getValue());
} else {
hyperlinks.put(url, entry.getValue());
}
diff --git a/source/de/anomic/document/Idiom.java b/source/de/anomic/document/Idiom.java
index 64cd92617..5ab8405ee 100644
--- a/source/de/anomic/document/Idiom.java
+++ b/source/de/anomic/document/Idiom.java
@@ -27,6 +27,7 @@ package de.anomic.document;
import java.io.File;
import java.io.InputStream;
+import java.util.HashMap;
import java.util.Hashtable;
import de.anomic.yacy.yacyURL;
@@ -85,11 +86,12 @@ public interface Idiom {
throws ParserException, InterruptedException;
/**
- * Can be used to determine the MimeType(s) that are supported by the parser
- * @return a {@link Hashtable} containing a list of MimeTypes that are supported by
- * the parser
+ * Get the MimeType(s) that are supported by the parser
+ * @return a {@link HashMap} containing a mapping from a mime type string
+ * to a comma-separated String of file extensions
+ * that are supported by the idiom parser
*/
- public Hashtable getSupportedMimeTypes();
+ public HashMap getSupportedMimeTypes();
/**
* This function should be called before reusing the parser object.
diff --git a/source/de/anomic/document/Parser.java b/source/de/anomic/document/Parser.java
index 579a05a9e..5baf9eecf 100644
--- a/source/de/anomic/document/Parser.java
+++ b/source/de/anomic/document/Parser.java
@@ -31,9 +31,13 @@ import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
-import java.util.HashMap;
-import java.util.Hashtable;
-import java.util.Iterator;
+import java.text.Collator;
+import java.util.HashSet;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Set;
+import java.util.TreeMap;
+import java.util.TreeSet;
import de.anomic.document.parser.bzipParser;
import de.anomic.document.parser.docParser;
@@ -59,13 +63,24 @@ import de.anomic.yacy.logging.Log;
public final class Parser {
- private static final Log theLogger = new Log("PARSER");
- public static final HashMap availableParserList = new HashMap();
+ private static final Log log = new Log("PARSER");
+
+ // use a collator that does not distinguish between lowercase and uppercase letters
+ private static final Collator insensitiveCollator = Collator.getInstance(Locale.US);
+ static {
+ insensitiveCollator.setStrength(Collator.SECONDARY);
+ insensitiveCollator.setDecomposition(Collator.NO_DECOMPOSITION);
+ }
+
+ private static final Map mime2parser = new TreeMap(insensitiveCollator);
+ private static final Map> ext2mime = new TreeMap>(insensitiveCollator);
+ private static final Set denyMime = new TreeSet(insensitiveCollator);
static {
initParser(new bzipParser());
initParser(new docParser());
initParser(new gzipParser());
+ initParser(new htmlParser());
initParser(new mimeTypeParser());
initParser(new odtParser());
initParser(new pdfParser());
@@ -82,14 +97,30 @@ public final class Parser {
initParser(new xlsParser());
initParser(new zipParser());
}
+
+ public static Set idioms() {
+ Set c = new HashSet();
+ c.addAll(mime2parser.values());
+ return c;
+ }
+
+ private static void initParser(Idiom parser) {
+ for (Map.Entry e: parser.getSupportedMimeTypes().entrySet()) {
+ // process the mime types
+ final String mimeType = e.getKey();
+ Idiom p0 = mime2parser.get(mimeType);
+ if (p0 != null) log.logSevere("parser for mime '" + mimeType + "' was set to '" + p0.getName() + "', overwriting with new parser.");
+ mime2parser.put(mimeType, parser);
+ Log.logInfo("PARSER", "Parser for mime type '" + mimeType + "': " + parser.getName());
- private static void initParser(Idiom theParser) {
- final Hashtable supportedMimeTypes = theParser.getSupportedMimeTypes();
- final Iterator mimeTypeIterator = supportedMimeTypes.keySet().iterator();
- while (mimeTypeIterator.hasNext()) {
- final String mimeType = mimeTypeIterator.next();
- availableParserList.put(mimeType, theParser);
- Log.logInfo("PARSER", "Found parser for mimeType '" + mimeType + "': " + theParser.getName());
+ // process the extensions
+ String[] exts = e.getValue().split(",");
+ for (String ext: exts) {
+ Set s = ext2mime.get(ext);
+ if (s == null) s = new HashSet();
+ s.add(mimeType);
+ ext2mime.put(ext, s);
+ }
}
}
@@ -99,10 +130,10 @@ public final class Parser {
ParserException {
ByteArrayInputStream byteIn = null;
try {
- if (theLogger.isFine()) theLogger.logFine("Parsing '" + location + "' from byte-array");
+ if (log.isFine()) log.logFine("Parsing '" + location + "' from byte-array");
if (sourceArray == null || sourceArray.length == 0) {
final String errorMsg = "No resource content available (1) " + (((sourceArray == null) ? "source == null" : "source.length() == 0") + ", url = " + location.toNormalform(true, false));
- theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg);
+ log.logInfo("Unable to parse '" + location + "'. " + errorMsg);
throw new ParserException(errorMsg, location, errorMsg);
}
byteIn = new ByteArrayInputStream(sourceArray);
@@ -110,7 +141,7 @@ public final class Parser {
} catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof ParserException) throw (ParserException) e;
- theLogger.logSevere("Unexpected exception in parseSource from byte-array: " + e.getMessage(), e);
+ log.logSevere("Unexpected exception in parseSource from byte-array: " + e.getMessage(), e);
throw new ParserException("Unexpected exception while parsing " + location, location, e);
} finally {
if (byteIn != null) try {
@@ -125,10 +156,10 @@ public final class Parser {
BufferedInputStream sourceStream = null;
try {
- if (theLogger.isFine()) theLogger.logFine("Parsing '" + location + "' from file");
+ if (log.isFine()) log.logFine("Parsing '" + location + "' from file");
if (!(sourceFile.exists() && sourceFile.canRead() && sourceFile.length() > 0)) {
final String errorMsg = sourceFile.exists() ? "Empty resource file." : "No resource content available (2).";
- theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg);
+ log.logInfo("Unable to parse '" + location + "'. " + errorMsg);
throw new ParserException(errorMsg, location, "document has no content");
}
sourceStream = new BufferedInputStream(new FileInputStream(sourceFile));
@@ -136,7 +167,7 @@ public final class Parser {
} catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof ParserException) throw (ParserException) e;
- theLogger.logSevere("Unexpected exception in parseSource from File: " + e.getMessage(), e);
+ log.logSevere("Unexpected exception in parseSource from File: " + e.getMessage(), e);
throw new ParserException("Unexpected exception while parsing " + location, location, e);
} finally {
if (sourceStream != null)try {
@@ -150,31 +181,34 @@ public final class Parser {
final long contentLength, final InputStream sourceStream)
throws InterruptedException, ParserException {
try {
- if (theLogger.isFine()) theLogger.logFine("Parsing '" + location + "' from stream");
- mimeType = Classification.normalizeMimeType(mimeType);
- final String fileExt = Classification.getFileExt(location);
+ if (log.isFine()) log.logFine("Parsing '" + location + "' from stream");
+ mimeType = normalizeMimeType(mimeType);
+ final String fileExt = location.getFileExtension();
final String documentCharset = htmlParser.patchCharsetEncoding(charset);
- if (!Classification.supportedContent(location, mimeType)) {
- final String errorMsg = "No parser available to parse mimetype '" + mimeType + "' (1)";
- theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg);
- throw new ParserException(errorMsg, location, "wrong mime type or wrong extension");
+ if (!supportsMime(mimeType)) {
+ final String errorMsg = "No parser available to parse mimetype '" + mimeType + "'";
+ log.logInfo("Unable to parse '" + location + "'. " + errorMsg);
+ throw new ParserException(errorMsg, location, "wrong mime type");
+ }
+ if (!supportsExtension(location)) {
+ final String errorMsg = "No parser available to parse extension of url path";
+ log.logInfo("Unable to parse '" + location + "'. " + errorMsg);
+ throw new ParserException(errorMsg, location, "wrong extension");
}
- if (theLogger.isFine()) theLogger.logInfo("Parsing " + location + " with mimeType '" + mimeType + "' and file extension '" + fileExt + "'.");
- Idiom parser = availableParserList.get(Classification.normalizeMimeType(mimeType));
+ if (log.isFine()) log.logInfo("Parsing " + location + " with mimeType '" + mimeType + "' and file extension '" + fileExt + "'.");
+ Idiom parser = mime2parser.get(normalizeMimeType(mimeType));
Document doc = null;
if (parser != null) {
parser.setContentLength(contentLength);
doc = parser.parse(location, mimeType, documentCharset, sourceStream);
- } else if (Classification.HTMLParsableMimeTypesContains(mimeType)) {
- doc = new htmlParser().parse(location, mimeType, documentCharset, sourceStream);
} else {
final String errorMsg = "No parser available to parse mimetype '" + mimeType + "' (2)";
- theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg);
+ log.logInfo("Unable to parse '" + location + "'. " + errorMsg);
throw new ParserException(errorMsg, location, "wrong mime type or wrong extension");
}
if (doc == null) {
final String errorMsg = "Unexpected error. Parser returned null.";
- theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg);
+ log.logInfo("Unable to parse '" + location + "'. " + errorMsg);
throw new ParserException(errorMsg, location);
}
return doc;
@@ -182,9 +216,50 @@ public final class Parser {
if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof ParserException) throw (ParserException) e;
final String errorMsg = "Unexpected exception. " + e.getMessage();
- theLogger.logSevere("Unable to parse '" + location + "'. " + errorMsg, e);
+ log.logSevere("Unable to parse '" + location + "'. " + errorMsg, e);
throw new ParserException(errorMsg, location, e);
}
}
+ public static boolean supportsMime(String mimeType) { // true if a parser is registered for the mime type and the type is not on the deny list
+ return !denyMime.contains(normalizeMimeType(mimeType)) && mime2parser.containsKey(normalizeMimeType(mimeType)); // normalize for BOTH lookups so "type; charset=..." or null cannot slip past the deny list
+ }
+
+ public static boolean supportsExtension(final yacyURL url) { // true if the url's file extension is registered in ext2mime, or if the url has no extension at all
+ String ext = url.getFileExtension();
+ if (ext.length() == 0) return true; // may be anything; that's ok if the mime type is ok
+ return ext2mime.containsKey(ext);
+ }
+
+ public static String mimeOf(yacyURL url) { // convenience overload: looks up the mime type for the url's file extension
+ return mimeOf(url.getFileExtension());
+ }
+
+ public static String mimeOf(String ext) { // returns one mime type registered for ext in ext2mime, or null if ext is unknown
+ Set mimes = ext2mime.get(ext);
+ if (mimes == null) return null;
+ return mimes.iterator().next(); // takes whichever element the set's iterator yields first; NOTE(review): assumes registered sets are never empty - confirm
+ }
+
+ private static String normalizeMimeType(String mimeType) { // strips parameters (everything from the first ';') and surrounding whitespace from a mime type
+ if (mimeType == null) return "application/octet-stream"; // a missing mime type is mapped to this fixed default
+ final int pos = mimeType.indexOf(';');
+ return ((pos < 0) ? mimeType.trim() : mimeType.substring(0, pos).trim());
+ }
+
+ public static void setDenyMime(String denyList) { // replaces the whole deny list with the comma-separated entries of denyList
+ denyMime.clear();
+ if (denyList != null) for (String s: denyList.split(",")) if (s.trim().length() > 0) denyMime.add(s.trim()); // null-safe; entries are trimmed and blank ones skipped so "a, b" and "" behave sanely
+ }
+
+ public static String getDenyMime() { // returns the deny list as a comma-separated string, "" when the list is empty
+ final StringBuilder s = new StringBuilder();
+ for (String d: denyMime) s.append(d).append(',');
+ if (s.length() > 0) s.setLength(s.length() - 1); // strip the trailing comma; the guard avoids the StringIndexOutOfBoundsException the unguarded substring(0, -1) threw on an empty list
+ return s.toString();
+ }
+
+ public static void grantMime(String mime, boolean grant) { // grant == true removes mime from the deny list, grant == false adds it
+ if (grant) denyMime.remove(mime); else denyMime.add(mime);
+ }
}
diff --git a/source/de/anomic/document/parser/bzipParser.java b/source/de/anomic/document/parser/bzipParser.java
index 8173e80be..2b5321cc0 100644
--- a/source/de/anomic/document/parser/bzipParser.java
+++ b/source/de/anomic/document/parser/bzipParser.java
@@ -30,8 +30,7 @@ package de.anomic.document.parser;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
-import java.util.Hashtable;
-
+import java.util.HashMap;
import org.apache.tools.bzip2.CBZip2InputStream;
import de.anomic.document.AbstractParser;
@@ -48,7 +47,7 @@ public class bzipParser extends AbstractParser implements Idiom {
* a list of mime types that are supported by this parser class
* @see #getSupportedMimeTypes()
*/
- public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable();
+ public static final HashMap SUPPORTED_MIME_TYPES = new HashMap();
static final String fileExtensions = "bz2,tbz,tbz2";
static {
SUPPORTED_MIME_TYPES.put("application/x-bzip2",fileExtensions);
@@ -61,11 +60,10 @@ public class bzipParser extends AbstractParser implements Idiom {
}
public bzipParser() {
- super();
- this.parserName = "Bzip 2 UNIX Compressed File Parser";
+ super("Bzip 2 UNIX Compressed File Parser");
}
- public Hashtable getSupportedMimeTypes() {
+ public HashMap getSupportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}
diff --git a/source/de/anomic/document/parser/docParser.java b/source/de/anomic/document/parser/docParser.java
index ce4db99af..41b47d136 100644
--- a/source/de/anomic/document/parser/docParser.java
+++ b/source/de/anomic/document/parser/docParser.java
@@ -28,8 +28,7 @@
package de.anomic.document.parser;
import java.io.InputStream;
-import java.util.Hashtable;
-
+import java.util.HashMap;
import org.textmining.extraction.TextExtractor;
import org.textmining.extraction.word.WordTextExtractorFactory;
@@ -45,22 +44,22 @@ public class docParser extends AbstractParser implements Idiom {
* a list of mime types that are supported by this parser class
* @see #getSupportedMimeTypes()
*/
- public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable();
+ public static final HashMap SUPPORTED_MIME_TYPES = new HashMap();
static {
- SUPPORTED_MIME_TYPES.put("application/msword","doc");
- SUPPORTED_MIME_TYPES.put("application/doc","doc");
- SUPPORTED_MIME_TYPES.put("appl/text","doc");
- SUPPORTED_MIME_TYPES.put("application/vnd.msword","doc");
- SUPPORTED_MIME_TYPES.put("application/vnd.ms-word","doc");
- SUPPORTED_MIME_TYPES.put("application/winword","doc");
- SUPPORTED_MIME_TYPES.put("application/word","doc");
- SUPPORTED_MIME_TYPES.put("application/x-msw6","doc");
- SUPPORTED_MIME_TYPES.put("application/x-msword","doc");
+ String ext = "doc,docx";
+ SUPPORTED_MIME_TYPES.put("application/msword",ext);
+ SUPPORTED_MIME_TYPES.put("application/doc",ext);
+ SUPPORTED_MIME_TYPES.put("appl/text",ext);
+ SUPPORTED_MIME_TYPES.put("application/vnd.msword",ext);
+ SUPPORTED_MIME_TYPES.put("application/vnd.ms-word",ext);
+ SUPPORTED_MIME_TYPES.put("application/winword",ext);
+ SUPPORTED_MIME_TYPES.put("application/word",ext);
+ SUPPORTED_MIME_TYPES.put("application/x-msw6",ext);
+ SUPPORTED_MIME_TYPES.put("application/x-msword",ext);
}
public docParser() {
- super();
- this.parserName = "Word Document Parser";
+ super("Word Document Parser");
}
public Document parse(final yacyURL location, final String mimeType, final String charset,
@@ -103,7 +102,7 @@ public class docParser extends AbstractParser implements Idiom {
}
}
- public java.util.Hashtable getSupportedMimeTypes() {
+ public HashMap getSupportedMimeTypes() {
return docParser.SUPPORTED_MIME_TYPES;
}
diff --git a/source/de/anomic/document/parser/gzipParser.java b/source/de/anomic/document/parser/gzipParser.java
index 408bbff98..78aa2f491 100644
--- a/source/de/anomic/document/parser/gzipParser.java
+++ b/source/de/anomic/document/parser/gzipParser.java
@@ -30,7 +30,7 @@ package de.anomic.document.parser;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
-import java.util.Hashtable;
+import java.util.HashMap;
import java.util.zip.GZIPInputStream;
import de.anomic.document.AbstractParser;
@@ -47,27 +47,26 @@ public class gzipParser extends AbstractParser implements Idiom {
* a list of mime types that are supported by this parser class
* @see #getSupportedMimeTypes()
*/
- public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable();
- static final String fileExtensions = "gz,tgz";
+ public static final HashMap SUPPORTED_MIME_TYPES = new HashMap();
+ static final String ext = "gz,tgz";
static {
- SUPPORTED_MIME_TYPES.put("application/x-gzip",fileExtensions);
- SUPPORTED_MIME_TYPES.put("application/gzip",fileExtensions);
- SUPPORTED_MIME_TYPES.put("application/x-gunzip",fileExtensions);
- SUPPORTED_MIME_TYPES.put("application/gzipped",fileExtensions);
- SUPPORTED_MIME_TYPES.put("application/gzip-compressed",fileExtensions);
- SUPPORTED_MIME_TYPES.put("application/x-compressed",fileExtensions);
- SUPPORTED_MIME_TYPES.put("application/x-compress",fileExtensions);
- SUPPORTED_MIME_TYPES.put("gzip/document",fileExtensions);
- SUPPORTED_MIME_TYPES.put("application/octet-stream",fileExtensions);
- SUPPORTED_MIME_TYPES.put("application/x-tar",fileExtensions);
+ SUPPORTED_MIME_TYPES.put("application/x-gzip",ext);
+ SUPPORTED_MIME_TYPES.put("application/gzip",ext);
+ SUPPORTED_MIME_TYPES.put("application/x-gunzip",ext);
+ SUPPORTED_MIME_TYPES.put("application/gzipped",ext);
+ SUPPORTED_MIME_TYPES.put("application/gzip-compressed",ext);
+ SUPPORTED_MIME_TYPES.put("application/x-compressed",ext);
+ SUPPORTED_MIME_TYPES.put("application/x-compress",ext);
+ SUPPORTED_MIME_TYPES.put("gzip/document",ext);
+ SUPPORTED_MIME_TYPES.put("application/octet-stream",ext);
+ SUPPORTED_MIME_TYPES.put("application/x-tar",ext);
}
public gzipParser() {
- super();
- this.parserName = "GNU Zip Compressed Archive Parser";
+ super("GNU Zip Compressed Archive Parser");
}
- public Hashtable getSupportedMimeTypes() {
+ public HashMap getSupportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}
diff --git a/source/de/anomic/document/parser/htmlParser.java b/source/de/anomic/document/parser/htmlParser.java
index 2d83d09b6..743226dda 100644
--- a/source/de/anomic/document/parser/htmlParser.java
+++ b/source/de/anomic/document/parser/htmlParser.java
@@ -31,8 +31,7 @@ import java.io.InputStream;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
-import java.util.Hashtable;
-
+import java.util.HashMap;
import de.anomic.document.AbstractParser;
import de.anomic.document.Document;
import de.anomic.document.Idiom;
@@ -49,17 +48,17 @@ public class htmlParser extends AbstractParser implements Idiom {
* a list of mime types that are supported by this parser class
* @see #getSupportedMimeTypes()
*/
- public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable();
- static {
- SUPPORTED_MIME_TYPES.put("application/xhtml+xml","htm,html,xhtml,php,asp");
- SUPPORTED_MIME_TYPES.put("text/html","htm,html,xhtml,php,asp");
- SUPPORTED_MIME_TYPES.put("text/plain","htm,html,xhtml,php,asp,txt");
- SUPPORTED_MIME_TYPES.put("text/sgml","htm,html,xhtml,php,asp,xml");
+ public static final HashMap SUPPORTED_MIME_TYPES = new HashMap();
+ static {
+ String ext = "htm,html,shtml,xhtml,php,asp,aspx,txt,jsp";
+ SUPPORTED_MIME_TYPES.put("application/xhtml+xml", ext);
+ SUPPORTED_MIME_TYPES.put("text/html", ext);
+ SUPPORTED_MIME_TYPES.put("text/plain", ext);
+ SUPPORTED_MIME_TYPES.put("text/sgml",ext);
}
public htmlParser() {
- super();
- this.parserName = "streaming html parser";
+ super("streaming html parser");
}
@Override
@@ -215,7 +214,7 @@ public class htmlParser extends AbstractParser implements Idiom {
}
- public Hashtable getSupportedMimeTypes() {
+ public HashMap getSupportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}
diff --git a/source/de/anomic/document/parser/mimeTypeParser.java b/source/de/anomic/document/parser/mimeTypeParser.java
index d36d72825..c4c568e17 100644
--- a/source/de/anomic/document/parser/mimeTypeParser.java
+++ b/source/de/anomic/document/parser/mimeTypeParser.java
@@ -31,6 +31,7 @@ import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.Collection;
+import java.util.HashMap;
import java.util.Hashtable;
import net.sf.jmimemagic.Magic;
@@ -54,14 +55,14 @@ public class mimeTypeParser extends AbstractParser implements Idiom {
* a list of mime types that are supported by this parser class
* @see #getSupportedMimeTypes()
*/
- public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable();
+ public static final HashMap SUPPORTED_MIME_TYPES = new HashMap();
static {
SUPPORTED_MIME_TYPES.put("text/xml","xml");
SUPPORTED_MIME_TYPES.put("application/xml","xml");
- SUPPORTED_MIME_TYPES.put("application/x-xml","xml");
- SUPPORTED_MIME_TYPES.put("application/octet-stream","");
- SUPPORTED_MIME_TYPES.put("application/x-compress","");
- SUPPORTED_MIME_TYPES.put("application/x-compressed","");
+ SUPPORTED_MIME_TYPES.put("application/x-xml","xml");
+ SUPPORTED_MIME_TYPES.put("application/octet-stream","xml");
+ SUPPORTED_MIME_TYPES.put("application/x-compress","xml");
+ SUPPORTED_MIME_TYPES.put("application/x-compressed","xml");
}
/**
@@ -71,8 +72,7 @@ public class mimeTypeParser extends AbstractParser implements Idiom {
private static Hashtable threadLoopDetection = new Hashtable();
public mimeTypeParser() {
- super();
- this.parserName = "MimeType Parser";
+ super("MimeType Parser");
}
@SuppressWarnings("unchecked")
@@ -174,7 +174,7 @@ public class mimeTypeParser extends AbstractParser implements Idiom {
}
- public java.util.Hashtable getSupportedMimeTypes() {
+ public HashMap getSupportedMimeTypes() {
return mimeTypeParser.SUPPORTED_MIME_TYPES;
}
diff --git a/source/de/anomic/document/parser/odtParser.java b/source/de/anomic/document/parser/odtParser.java
index 0f43be1a9..c7119289c 100644
--- a/source/de/anomic/document/parser/odtParser.java
+++ b/source/de/anomic/document/parser/odtParser.java
@@ -35,8 +35,8 @@ import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.Charset;
import java.util.Enumeration;
+import java.util.HashMap;
import java.util.HashSet;
-import java.util.Hashtable;
import java.util.Set;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
@@ -63,18 +63,17 @@ public class odtParser extends AbstractParser implements Idiom {
* a list of mime types that are supported by this parser class
* @see #getSupportedMimeTypes()
*/
- public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable();
+ public static final HashMap SUPPORTED_MIME_TYPES = new HashMap();
static {
SUPPORTED_MIME_TYPES.put("application/vnd.oasis.opendocument.text","odt");
SUPPORTED_MIME_TYPES.put("application/x-vnd.oasis.opendocument.text","odt");
}
public odtParser() {
- super();
- this.parserName = "OASIS OpenDocument V2 Text Document Parser";
+ super("OASIS OpenDocument V2 Text Document Parser");
}
- public Hashtable getSupportedMimeTypes() {
+ public HashMap getSupportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}
diff --git a/source/de/anomic/document/parser/pdfParser.java b/source/de/anomic/document/parser/pdfParser.java
index af76bd98a..78a5a589f 100644
--- a/source/de/anomic/document/parser/pdfParser.java
+++ b/source/de/anomic/document/parser/pdfParser.java
@@ -33,8 +33,7 @@ import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
-import java.util.Hashtable;
-
+import java.util.HashMap;
import org.pdfbox.pdfparser.PDFParser;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.pdmodel.PDDocumentInformation;
@@ -56,7 +55,7 @@ public class pdfParser extends AbstractParser implements Idiom {
* a list of mime types that are supported by this parser class
* @see #getSupportedMimeTypes()
*/
- public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable();
+ public static final HashMap SUPPORTED_MIME_TYPES = new HashMap();
static {
SUPPORTED_MIME_TYPES.put("application/pdf","pdf");
SUPPORTED_MIME_TYPES.put("application/x-pdf","pdf");
@@ -67,11 +66,10 @@ public class pdfParser extends AbstractParser implements Idiom {
}
public pdfParser() {
- super();
- this.parserName = "Acrobat Portable Document Parser";
+ super("Acrobat Portable Document Parser");
}
- public Hashtable getSupportedMimeTypes() {
+ public HashMap getSupportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}
diff --git a/source/de/anomic/document/parser/pptParser.java b/source/de/anomic/document/parser/pptParser.java
index 16676329f..3729182ed 100644
--- a/source/de/anomic/document/parser/pptParser.java
+++ b/source/de/anomic/document/parser/pptParser.java
@@ -29,8 +29,7 @@ package de.anomic.document.parser;
import java.io.BufferedInputStream;
import java.io.InputStream;
-import java.util.Hashtable;
-
+import java.util.HashMap;
import org.apache.poi.hslf.extractor.PowerPointExtractor;
import de.anomic.document.AbstractParser;
@@ -45,22 +44,21 @@ public class pptParser extends AbstractParser implements Idiom {
* a list of mime types that are supported by this parser class
* @see #getSupportedMimeTypes()
*/
- public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable();
- static final String fileExtensions = "ppt,pps";
+ public static final HashMap SUPPORTED_MIME_TYPES = new HashMap();
+ static final String ext = "ppt,pps";
static {
- SUPPORTED_MIME_TYPES.put("application/mspowerpoint",fileExtensions);
- SUPPORTED_MIME_TYPES.put("application/powerpoint",fileExtensions);
- SUPPORTED_MIME_TYPES.put("application/vnd.ms-powerpoint",fileExtensions);
- SUPPORTED_MIME_TYPES.put("application/ms-powerpoint",fileExtensions);
- SUPPORTED_MIME_TYPES.put("application/mspowerpnt",fileExtensions);
- SUPPORTED_MIME_TYPES.put("application/vnd-mspowerpoint",fileExtensions);
- SUPPORTED_MIME_TYPES.put("application/x-powerpoint",fileExtensions);
- SUPPORTED_MIME_TYPES.put("application/x-m",fileExtensions);
+ SUPPORTED_MIME_TYPES.put("application/mspowerpoint",ext);
+ SUPPORTED_MIME_TYPES.put("application/powerpoint",ext);
+ SUPPORTED_MIME_TYPES.put("application/vnd.ms-powerpoint",ext);
+ SUPPORTED_MIME_TYPES.put("application/ms-powerpoint",ext);
+ SUPPORTED_MIME_TYPES.put("application/mspowerpnt",ext);
+ SUPPORTED_MIME_TYPES.put("application/vnd-mspowerpoint",ext);
+ SUPPORTED_MIME_TYPES.put("application/x-powerpoint",ext);
+ SUPPORTED_MIME_TYPES.put("application/x-m",ext);
}
public pptParser(){
- super();
- this.parserName = "Microsoft Powerpoint Parser";
+ super("Microsoft Powerpoint Parser");
}
/*
@@ -116,7 +114,7 @@ public class pptParser extends AbstractParser implements Idiom {
}
}
- public Hashtable getSupportedMimeTypes() {
+ public HashMap getSupportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}
diff --git a/source/de/anomic/document/parser/psParser.java b/source/de/anomic/document/parser/psParser.java
index b7a60a405..be6674ce9 100644
--- a/source/de/anomic/document/parser/psParser.java
+++ b/source/de/anomic/document/parser/psParser.java
@@ -34,8 +34,7 @@ import java.io.FileReader;
import java.io.FileWriter;
import java.io.InputStream;
import java.io.InputStreamReader;
-import java.util.Hashtable;
-
+import java.util.HashMap;
import de.anomic.document.AbstractParser;
import de.anomic.document.Idiom;
import de.anomic.document.ParserException;
@@ -49,7 +48,7 @@ public class psParser extends AbstractParser implements Idiom {
* a list of mime types that are supported by this parser class
* @see #getSupportedMimeTypes()
*/
- public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable();
+ public static final HashMap SUPPORTED_MIME_TYPES = new HashMap();
static {
SUPPORTED_MIME_TYPES.put("application/ps","ps");
SUPPORTED_MIME_TYPES.put("application/x-postscript","ps");
@@ -62,8 +61,7 @@ public class psParser extends AbstractParser implements Idiom {
private static String parserMode = "java";
public psParser() {
- super();
- this.parserName = "PostScript Document Parser";
+ super("PostScript Document Parser");
if (!modeScanDone) synchronized (modeScan) {
if (testForPs2Ascii()) parserMode = "ps2ascii";
else parserMode = "java";
@@ -71,7 +69,7 @@ public class psParser extends AbstractParser implements Idiom {
}
}
- public Hashtable getSupportedMimeTypes() {
+ public HashMap getSupportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}
diff --git a/source/de/anomic/document/parser/rpmParser.java b/source/de/anomic/document/parser/rpmParser.java
index 79dba7936..452bc1572 100644
--- a/source/de/anomic/document/parser/rpmParser.java
+++ b/source/de/anomic/document/parser/rpmParser.java
@@ -31,8 +31,6 @@ import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.InputStream;
import java.util.HashMap;
-import java.util.Hashtable;
-
import com.jguild.jrpm.io.RPMFile;
import com.jguild.jrpm.io.datatype.DataTypeIf;
@@ -57,7 +55,7 @@ public class rpmParser extends AbstractParser implements Idiom {
* a list of mime types that are supported by this parser class
* @see #getSupportedMimeTypes()
*/
- public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable();
+ public static final HashMap SUPPORTED_MIME_TYPES = new HashMap();
static {
SUPPORTED_MIME_TYPES.put("application/x-rpm","rpm");
SUPPORTED_MIME_TYPES.put("application/x-redhat packet manager","rpm");
@@ -65,11 +63,10 @@ public class rpmParser extends AbstractParser implements Idiom {
}
public rpmParser() {
- super();
- this.parserName = "rpm Parser";
+ super("rpm Parser");
}
- public Hashtable getSupportedMimeTypes() {
+ public HashMap getSupportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}
diff --git a/source/de/anomic/document/parser/rssParser.java b/source/de/anomic/document/parser/rssParser.java
index aadf35034..d893b6ca8 100644
--- a/source/de/anomic/document/parser/rssParser.java
+++ b/source/de/anomic/document/parser/rssParser.java
@@ -33,7 +33,6 @@ import java.io.InputStream;
import java.io.Writer;
import java.nio.charset.Charset;
import java.util.HashMap;
-import java.util.Hashtable;
import java.util.LinkedList;
import java.util.Map;
@@ -59,7 +58,7 @@ public class rssParser extends AbstractParser implements Idiom {
* a list of mime types that are supported by this parser class
* @see #getSupportedMimeTypes()
*/
- public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable();
+ public static final HashMap SUPPORTED_MIME_TYPES = new HashMap();
static final String fileExtensions = "xml,rss,rdf";
static {
SUPPORTED_MIME_TYPES.put("text/rss",fileExtensions);
@@ -69,8 +68,7 @@ public class rssParser extends AbstractParser implements Idiom {
}
public rssParser() {
- super();
- this.parserName = "Rich Site Summary/Atom Feed Parser";
+ super("Rich Site Summary/Atom Feed Parser");
}
public Document parse(final yacyURL location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
@@ -176,7 +174,7 @@ public class rssParser extends AbstractParser implements Idiom {
}
}
- public Hashtable getSupportedMimeTypes() {
+ public HashMap getSupportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}
diff --git a/source/de/anomic/document/parser/rtfParser.java b/source/de/anomic/document/parser/rtfParser.java
index 3a48ca4c3..30e70894c 100644
--- a/source/de/anomic/document/parser/rtfParser.java
+++ b/source/de/anomic/document/parser/rtfParser.java
@@ -28,8 +28,7 @@
package de.anomic.document.parser;
import java.io.InputStream;
-import java.util.Hashtable;
-
+import java.util.HashMap;
import javax.swing.text.DefaultStyledDocument;
import javax.swing.text.rtf.RTFEditorKit;
@@ -45,7 +44,7 @@ public class rtfParser extends AbstractParser implements Idiom {
* a list of mime types that are supported by this parser class
* @see #getSupportedMimeTypes()
*/
- public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable();
+ public static final HashMap SUPPORTED_MIME_TYPES = new HashMap();
static {
SUPPORTED_MIME_TYPES.put("application/rtf","rtf");
SUPPORTED_MIME_TYPES.put("text/rtf","rtf");
@@ -57,8 +56,7 @@ public class rtfParser extends AbstractParser implements Idiom {
}
public rtfParser() {
- super();
- this.parserName = "Rich Text Format Parser";
+ super("Rich Text Format Parser");
}
public Document parse(final yacyURL location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
@@ -100,7 +98,7 @@ public class rtfParser extends AbstractParser implements Idiom {
}
}
- public Hashtable getSupportedMimeTypes() {
+ public HashMap getSupportedMimeTypes() {
return rtfParser.SUPPORTED_MIME_TYPES;
}
diff --git a/source/de/anomic/document/parser/sevenzipParser.java b/source/de/anomic/document/parser/sevenzipParser.java
index db2159ece..2d3fa7af4 100644
--- a/source/de/anomic/document/parser/sevenzipParser.java
+++ b/source/de/anomic/document/parser/sevenzipParser.java
@@ -32,8 +32,7 @@ import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
-import java.util.Hashtable;
-
+import java.util.HashMap;
import SevenZip.ArchiveExtractCallback;
import SevenZip.IInStream;
import SevenZip.MyRandomAccessFile;
@@ -41,7 +40,6 @@ import SevenZip.Archive.IInArchive;
import SevenZip.Archive.SevenZipEntry;
import SevenZip.Archive.SevenZip.Handler;
import de.anomic.document.AbstractParser;
-import de.anomic.document.Classification;
import de.anomic.document.Idiom;
import de.anomic.document.Parser;
import de.anomic.document.ParserException;
@@ -57,14 +55,13 @@ public class sevenzipParser extends AbstractParser implements Idiom {
* a list of mime types that are supported by this parser class
* @see #getSupportedMimeTypes()
*/
- public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable();
+ public static final HashMap SUPPORTED_MIME_TYPES = new HashMap();
static {
SUPPORTED_MIME_TYPES.put("application/x-7z-compressed", "7z");
}
public sevenzipParser() {
- super();
- super.parserName = "7zip Archive Parser";
+ super("7zip Archive Parser");
}
public Document parse(final yacyURL location, final String mimeType, final String charset,
@@ -127,7 +124,7 @@ public class sevenzipParser extends AbstractParser implements Idiom {
}
}
- public Hashtable getSupportedMimeTypes() {
+ public HashMap getSupportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}
@@ -190,7 +187,7 @@ public class sevenzipParser extends AbstractParser implements Idiom {
// workaround for relative links in file, normally '#' shall be used behind the location, see
// below for reversion of the effects
final yacyURL url = yacyURL.newURL(doc.dc_source(), this.prefix + "/" + super.filePath);
- final String mime = Classification.getMimeTypeByFileExt(super.filePath.substring(super.filePath.lastIndexOf('.') + 1));
+ final String mime = Parser.mimeOf(super.filePath.substring(super.filePath.lastIndexOf('.') + 1));
if (this.cfos.isFallback()) {
theDoc = Parser.parseSource(url, mime, null, this.cfos.getContentFile());
} else {
diff --git a/source/de/anomic/document/parser/swfParser.java b/source/de/anomic/document/parser/swfParser.java
index d80bbdd0a..35caacab7 100644
--- a/source/de/anomic/document/parser/swfParser.java
+++ b/source/de/anomic/document/parser/swfParser.java
@@ -29,8 +29,6 @@ package de.anomic.document.parser;
import java.io.InputStream;
import java.util.HashMap;
-import java.util.Hashtable;
-
import pt.tumba.parser.swf.SWF2HTML;
import de.anomic.document.AbstractParser;
import de.anomic.document.Idiom;
@@ -44,7 +42,7 @@ public class swfParser extends AbstractParser implements Idiom {
* a list of mime types that are supported by this parser class
* @see #getSupportedMimeTypes()
*/
- public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable();
+ public static final HashMap SUPPORTED_MIME_TYPES = new HashMap();
static {
SUPPORTED_MIME_TYPES.put("application/x-shockwave-flash","swf");
SUPPORTED_MIME_TYPES.put("application/x-shockwave-flash2-preview","swf");
@@ -53,14 +51,13 @@ public class swfParser extends AbstractParser implements Idiom {
}
public swfParser() {
- super();
- this.parserName = "Adobe Flash Parser";
+ super("Adobe Flash Parser");
}
/**
* returns a hashtable containing the mimetypes that are supported by this class
*/
- public Hashtable getSupportedMimeTypes() {
+ public HashMap getSupportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}
diff --git a/source/de/anomic/document/parser/tarParser.java b/source/de/anomic/document/parser/tarParser.java
index 739d9e662..64113b8b7 100644
--- a/source/de/anomic/document/parser/tarParser.java
+++ b/source/de/anomic/document/parser/tarParser.java
@@ -34,7 +34,6 @@ import java.io.InputStream;
import java.io.OutputStream;
import java.util.Arrays;
import java.util.HashMap;
-import java.util.Hashtable;
import java.util.LinkedList;
import java.util.Map;
import java.util.zip.GZIPInputStream;
@@ -43,7 +42,6 @@ import com.ice.tar.TarEntry;
import com.ice.tar.TarInputStream;
import de.anomic.document.AbstractParser;
-import de.anomic.document.Classification;
import de.anomic.document.Idiom;
import de.anomic.document.Parser;
import de.anomic.document.ParserException;
@@ -60,7 +58,7 @@ public class tarParser extends AbstractParser implements Idiom {
* a list of mime types that are supported by this parser class
* @see #getSupportedMimeTypes()
*/
- public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable();
+ public static final HashMap SUPPORTED_MIME_TYPES = new HashMap();
static {
SUPPORTED_MIME_TYPES.put("application/x-tar","tar");
SUPPORTED_MIME_TYPES.put("application/tar","tar");
@@ -71,11 +69,10 @@ public class tarParser extends AbstractParser implements Idiom {
}
public tarParser() {
- super();
- this.parserName = "Tape Archive File Parser";
+ super("Tape Archive File Parser");
}
- public Hashtable getSupportedMimeTypes() {
+ public HashMap getSupportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}
@@ -97,7 +94,7 @@ public class tarParser extends AbstractParser implements Idiom {
* If the mimeType was not reported correcly by the webserve we
* have to decompress it first
*/
- final String ext = Classification.getFileExt(location).toLowerCase();
+ final String ext = location.getFileExtension().toLowerCase();
if (ext.equals("gz") || ext.equals("tgz")) {
source = new GZIPInputStream(source);
}
@@ -130,7 +127,7 @@ public class tarParser extends AbstractParser implements Idiom {
final String entryExt = (idx > -1) ? entryName.substring(idx+1) : "";
// trying to determine the mimeType per file extension
- final String entryMime = Classification.getMimeTypeByFileExt(entryExt);
+ final String entryMime = Parser.mimeOf(entryExt);
// getting the entry content
File subDocTempFile = null;
diff --git a/source/de/anomic/document/parser/vcfParser.java b/source/de/anomic/document/parser/vcfParser.java
index b172ad523..f2ad16267 100644
--- a/source/de/anomic/document/parser/vcfParser.java
+++ b/source/de/anomic/document/parser/vcfParser.java
@@ -33,7 +33,6 @@ import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.util.HashMap;
-import java.util.Hashtable;
import java.util.Iterator;
import java.util.LinkedList;
@@ -61,7 +60,7 @@ public class vcfParser extends AbstractParser implements Idiom {
*
* TODO: support of x-mozilla-cpt and x-mozilla-html tags
*/
- public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable();
+ public static final HashMap SUPPORTED_MIME_TYPES = new HashMap();
static {
SUPPORTED_MIME_TYPES.put("text/x-vcard","vcf");
SUPPORTED_MIME_TYPES.put("application/vcard","vcf");
@@ -73,11 +72,10 @@ public class vcfParser extends AbstractParser implements Idiom {
}
public vcfParser() {
- super();
- this.parserName = "vCard Parser";
+ super("vCard Parser");
}
- public Hashtable getSupportedMimeTypes() {
+ public HashMap getSupportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}
diff --git a/source/de/anomic/document/parser/vsdParser.java b/source/de/anomic/document/parser/vsdParser.java
index a3eb94fd1..0bea160cc 100644
--- a/source/de/anomic/document/parser/vsdParser.java
+++ b/source/de/anomic/document/parser/vsdParser.java
@@ -28,8 +28,7 @@
package de.anomic.document.parser;
import java.io.InputStream;
-import java.util.Hashtable;
-
+import java.util.HashMap;
import de.anomic.document.AbstractParser;
import de.anomic.document.Idiom;
import de.anomic.document.ParserException;
@@ -44,7 +43,7 @@ public class vsdParser extends AbstractParser implements Idiom {
* a list of mime types that are supported by this parser class
* @see #getSupportedMimeTypes()
*/
- public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable();
+ public static final HashMap SUPPORTED_MIME_TYPES = new HashMap();
static {
SUPPORTED_MIME_TYPES.put("application/visio","vsd");
SUPPORTED_MIME_TYPES.put("application/x-visio","vsd");
@@ -57,14 +56,13 @@ public class vsdParser extends AbstractParser implements Idiom {
}
public vsdParser() {
- super();
- this.parserName = "Microsoft Visio Parser";
+ super("Microsoft Visio Parser");
}
/**
* returns a hashtable containing the mimetypes that are supported by this class
*/
- public Hashtable getSupportedMimeTypes() {
+ public HashMap getSupportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}
diff --git a/source/de/anomic/document/parser/xlsParser.java b/source/de/anomic/document/parser/xlsParser.java
index 97634b0f0..0330677e8 100644
--- a/source/de/anomic/document/parser/xlsParser.java
+++ b/source/de/anomic/document/parser/xlsParser.java
@@ -28,8 +28,7 @@
package de.anomic.document.parser;
import java.io.InputStream;
-import java.util.Hashtable;
-
+import java.util.HashMap;
import org.apache.poi.hssf.eventusermodel.HSSFEventFactory;
import org.apache.poi.hssf.eventusermodel.HSSFListener;
import org.apache.poi.hssf.eventusermodel.HSSFRequest;
@@ -57,21 +56,21 @@ public class xlsParser extends AbstractParser implements Idiom, HSSFListener {
* a list of mime types that are supported by this parser class
* @see #getSupportedMimeTypes()
*/
- public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable();
- static {
- SUPPORTED_MIME_TYPES.put("application/msexcel","xls");
- SUPPORTED_MIME_TYPES.put("application/excel","xls");
- SUPPORTED_MIME_TYPES.put("application/vnd.ms-excel","xls");
- SUPPORTED_MIME_TYPES.put("application/x-excel","xls");
- SUPPORTED_MIME_TYPES.put("application/x-msexcel","xls");
- SUPPORTED_MIME_TYPES.put("application/x-ms-excel","xls");
- SUPPORTED_MIME_TYPES.put("application/x-dos_ms_excel","xls");
- SUPPORTED_MIME_TYPES.put("application/xls","xls");
+ public static final HashMap SUPPORTED_MIME_TYPES = new HashMap();
+ static {
+ String ext = "xls,xlsx";
+ SUPPORTED_MIME_TYPES.put("application/msexcel",ext);
+ SUPPORTED_MIME_TYPES.put("application/excel",ext);
+ SUPPORTED_MIME_TYPES.put("application/vnd.ms-excel",ext);
+ SUPPORTED_MIME_TYPES.put("application/x-excel",ext);
+ SUPPORTED_MIME_TYPES.put("application/x-msexcel",ext);
+ SUPPORTED_MIME_TYPES.put("application/x-ms-excel",ext);
+ SUPPORTED_MIME_TYPES.put("application/x-dos_ms_excel",ext);
+ SUPPORTED_MIME_TYPES.put("application/xls",ext);
}
public xlsParser(){
- super();
- this.parserName = "Microsoft Excel Parser";
+ super("Microsoft Excel Parser");
}
/*
@@ -135,7 +134,7 @@ public class xlsParser extends AbstractParser implements Idiom, HSSFListener {
}
}
- public Hashtable getSupportedMimeTypes() {
+ public HashMap getSupportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}
diff --git a/source/de/anomic/document/parser/zipParser.java b/source/de/anomic/document/parser/zipParser.java
index 7b08f58ec..29a2ac431 100644
--- a/source/de/anomic/document/parser/zipParser.java
+++ b/source/de/anomic/document/parser/zipParser.java
@@ -34,14 +34,12 @@ import java.io.InputStream;
import java.io.OutputStream;
import java.util.Arrays;
import java.util.HashMap;
-import java.util.Hashtable;
import java.util.LinkedList;
import java.util.Map;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import de.anomic.document.AbstractParser;
-import de.anomic.document.Classification;
import de.anomic.document.Idiom;
import de.anomic.document.Parser;
import de.anomic.document.ParserException;
@@ -58,7 +56,7 @@ public class zipParser extends AbstractParser implements Idiom {
* a list of mime types that are supported by this parser class
* @see #getSupportedMimeTypes()
*/
- public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable();
+ public static final HashMap SUPPORTED_MIME_TYPES = new HashMap();
static {
SUPPORTED_MIME_TYPES.put("application/zip","zip");
SUPPORTED_MIME_TYPES.put("application/x-zip","zip");
@@ -71,11 +69,10 @@ public class zipParser extends AbstractParser implements Idiom {
}
public zipParser() {
- super();
- this.parserName = "Compressed Archive File Parser";
+ super("Compressed Archive File Parser");
}
- public Hashtable getSupportedMimeTypes() {
+ public HashMap getSupportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}
@@ -118,7 +115,7 @@ public class zipParser extends AbstractParser implements Idiom {
final String entryExt = (idx > -1) ? entryName.substring(idx+1) : "";
// trying to determine the mimeType per file extension
- final String entryMime = Classification.getMimeTypeByFileExt(entryExt);
+ final String entryMime = Parser.mimeOf(entryExt);
// parsing the content
File subDocTempFile = null;
diff --git a/source/de/anomic/http/httpdFileHandler.java b/source/de/anomic/http/httpdFileHandler.java
index ad5df2dc4..a48a8007a 100644
--- a/source/de/anomic/http/httpdFileHandler.java
+++ b/source/de/anomic/http/httpdFileHandler.java
@@ -231,7 +231,7 @@ public final class httpdFileHandler {
}
headers.put(httpHeader.SERVER, "AnomicHTTPD (www.anomic.de)");
headers.put(httpHeader.DATE, DateFormatter.formatRFC1123(new Date()));
- if(!(Classification.mediaExtContains(ext))){
+ if(!(Classification.isMediaExtension(ext))){
headers.put(httpHeader.PRAGMA, "no-cache");
}
return headers;
diff --git a/source/de/anomic/http/httpdProxyHandler.java b/source/de/anomic/http/httpdProxyHandler.java
index cb5e791c2..f365ae71b 100644
--- a/source/de/anomic/http/httpdProxyHandler.java
+++ b/source/de/anomic/http/httpdProxyHandler.java
@@ -73,7 +73,7 @@ import java.util.zip.GZIPOutputStream;
import de.anomic.crawler.HTTPLoader;
import de.anomic.data.Blacklist;
-import de.anomic.document.Classification;
+import de.anomic.document.Parser;
import de.anomic.document.parser.html.ContentTransformer;
import de.anomic.document.parser.html.Transformer;
import de.anomic.kelondro.util.DateFormatter;
@@ -522,13 +522,13 @@ public final class httpdProxyHandler {
res.getStatusLine().substring(4), // status text
responseHeader);
- if(hasBody(res.getStatusCode())) {
+ if (hasBody(res.getStatusCode())) {
final OutputStream outStream = (gzippedOut != null) ? gzippedOut : ((chunkedOut != null)? chunkedOut : respond);
final String storeError = cacheEntry.shallStoreCacheForProxy();
final boolean storeHTCache = cacheEntry.profile().storeHTCache();
- final boolean isSupportedContent = Classification.supportedContent(cacheEntry.url(), cacheEntry.getMimeType());
+ final boolean isSupportedContent = Parser.supportsExtension(cacheEntry.url()) && Parser.supportsMime(cacheEntry.getMimeType());
if (
/*
* Now we store the response into the htcache directory if
diff --git a/source/de/anomic/plasma/plasmaHTCache.java b/source/de/anomic/plasma/plasmaHTCache.java
index 2dc04f298..66c8bfd6f 100644
--- a/source/de/anomic/plasma/plasmaHTCache.java
+++ b/source/de/anomic/plasma/plasmaHTCache.java
@@ -42,6 +42,7 @@ import java.util.HashMap;
import java.util.Map;
import de.anomic.document.Classification;
+import de.anomic.document.Parser;
import de.anomic.http.httpResponseHeader;
import de.anomic.http.httpDocument;
import de.anomic.kelondro.blob.ArrayStack;
@@ -181,7 +182,7 @@ public final class plasmaHTCache {
}
public static boolean isText(final String mimeType) {
- return Classification.supportedMimeTypesContains(mimeType);
+ return Parser.supportsMime(mimeType);
}
public static boolean noIndexingURL(final yacyURL url) {
@@ -200,7 +201,7 @@ public final class plasmaHTCache {
//php
- return Classification.mediaExtContains(urlString);
+ return Classification.isMediaExtension(urlString);
}
diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java
index 5473fbb74..2613f4c31 100644
--- a/source/de/anomic/plasma/plasmaSwitchboard.java
+++ b/source/de/anomic/plasma/plasmaSwitchboard.java
@@ -144,7 +144,6 @@ import de.anomic.data.wiki.wikiBoard;
import de.anomic.data.wiki.wikiCode;
import de.anomic.data.wiki.wikiParser;
import de.anomic.document.Condenser;
-import de.anomic.document.Classification;
import de.anomic.document.Parser;
import de.anomic.document.ParserException;
import de.anomic.document.Word;
@@ -513,18 +512,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitchpublic static final String PROXY_ONLINE_CAUTION_DELAY = "onlineCautionDelay"
* Name of the setting how long indexing should pause after the last time the proxy was used in milliseconds
diff --git a/source/de/anomic/search/SnippetCache.java b/source/de/anomic/search/SnippetCache.java
index fff783ec2..489ca74fd 100644
--- a/source/de/anomic/search/SnippetCache.java
+++ b/source/de/anomic/search/SnippetCache.java
@@ -39,7 +39,6 @@ import java.util.regex.Matcher;
import java.util.regex.Pattern;
import de.anomic.document.Condenser;
-import de.anomic.document.Classification;
import de.anomic.document.Parser;
import de.anomic.document.ParserException;
import de.anomic.document.Word;
@@ -867,29 +866,13 @@ public class SnippetCache {
// STEP 3: if the metadata is still null try to guess the mimeType of the resource
if (responseHeader == null) {
- final String filename = url.getFileName();
- final int p = filename.lastIndexOf('.');
- if ( // if no extension is available
- (p < 0) ||
- // or the extension is supported by one of the parsers
- ((p >= 0) && (Classification.supportedFileExtContains(filename.substring(p + 1))))
- ) {
- String supposedMime = "text/html";
-
- // if the mimeType Parser is installed we can set the mimeType to null to force
- // a mimetype detection
- if (Classification.supportedMimeTypesContains("application/octet-stream")) {
- supposedMime = null;
- } else if (p != -1){
- // otherwise we try to determine the mimeType per file Extension
- supposedMime = Classification.getMimeTypeByFileExt(filename.substring(p + 1));
- }
-
+ if (Parser.supportsExtension(url)) {
+ String supposedMime = Parser.mimeOf(url);
return Parser.parseSource(url, supposedMime, null, contentLength, resourceStream);
}
return null;
}
- if (Classification.supportedMimeTypesContains(responseHeader.mime())) {
+ if (Parser.supportsMime(responseHeader.mime())) {
return Parser.parseSource(url, responseHeader.mime(), responseHeader.getCharacterEncoding(), contentLength, resourceStream);
}
return null;
diff --git a/source/de/anomic/tools/mediawikiIndex.java b/source/de/anomic/tools/mediawikiIndex.java
index 8bad45c78..b248facf8 100644
--- a/source/de/anomic/tools/mediawikiIndex.java
+++ b/source/de/anomic/tools/mediawikiIndex.java
@@ -58,7 +58,6 @@ import java.util.concurrent.TimeoutException;
import de.anomic.data.wiki.wikiCode;
import de.anomic.data.wiki.wikiParser;
-import de.anomic.document.Classification;
import de.anomic.document.Parser;
import de.anomic.document.ParserException;
import de.anomic.document.Document;
@@ -102,9 +101,6 @@ public class mediawikiIndex extends Thread {
this.wparser = new wikiCode(new URL(baseURL).getHost());
this.count = 0;
this.start = 0;
- // must be called before usage:
- Classification.initHTMLParsableMimeTypes("text/html");
- Classification.addParseableMimeTypes("text/html");
}
/**
@@ -146,8 +142,6 @@ public class mediawikiIndex extends Thread {
StringBuilder sb = new StringBuilder();
boolean page = false, text = false;
String title = null;
- Classification.initHTMLParsableMimeTypes("text/html");
- Classification.addParseableMimeTypes("text/html");
wikiparserrecord poison = newRecord();
int threads = Math.max(2, Runtime.getRuntime().availableProcessors() - 1);
BlockingQueue in = new ArrayBlockingQueue(threads * 10);
diff --git a/source/de/anomic/yacy/yacyURL.java b/source/de/anomic/yacy/yacyURL.java
index 188582ea8..623343a6a 100644
--- a/source/de/anomic/yacy/yacyURL.java
+++ b/source/de/anomic/yacy/yacyURL.java
@@ -528,6 +528,13 @@ public class yacyURL implements Serializable {
return path.substring(p + 1); // the 'real' file name
}
+ public String getFileExtension() {
+ String name = getFileName();
+ int p = name.lastIndexOf('.');
+ if (p < 0) return "";
+ return name.substring(p + 1);
+ }
+
public String getPath() {
return path;
}