enhanced document type recognition

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6209 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 16 years ago
parent aa38eb5a20
commit b2263bc720

@ -250,6 +250,7 @@ minimumGlobalDelta = 500
# the following mime-types are a blacklist for indexing: # the following mime-types are a blacklist for indexing:
# parser.mime.deny: specifies mime-types that shall not be indexed # parser.mime.deny: specifies mime-types that shall not be indexed
parser.mime.deny= parser.mime.deny=
parser.extensions.deny=
# Promotion Strings # Promotion Strings
# These strings appear in the Web Mask of the YACY search client # These strings appear in the Web Mask of the YACY search client

@ -49,7 +49,7 @@ public class ConfigParser {
post.remove("parserSettings"); post.remove("parserSettings");
for (Idiom parser: Parser.idioms()) { for (Idiom parser: Parser.idioms()) {
for (String mimeType: parser.getSupportedMimeTypes().keySet()) { for (String mimeType: parser.supportedMimeTypes()) {
Parser.grantMime(mimeType, post.get("mimename_" + mimeType, "").equals("on")); Parser.grantMime(mimeType, post.get("mimename_" + mimeType, "").equals("on"));
} }
} }
@ -62,9 +62,9 @@ public class ConfigParser {
prop.put("parser_" + i + "_name", parser.getName()); prop.put("parser_" + i + "_name", parser.getName());
int mimeIdx = 0; int mimeIdx = 0;
for (String mimeType: parser.getSupportedMimeTypes().keySet()) { for (String mimeType: parser.supportedMimeTypes()) {
prop.put("parser_" + i + "_mime_" + mimeIdx + "_mimetype", mimeType); prop.put("parser_" + i + "_mime_" + mimeIdx + "_mimetype", mimeType);
prop.put("parser_" + i + "_mime_" + mimeIdx + "_status", (Parser.supportsMime(mimeType)) ? 1 : 0); prop.put("parser_" + i + "_mime_" + mimeIdx + "_status", (Parser.supportsMime(mimeType) == null) ? 1 : 0);
mimeIdx++; mimeIdx++;
} }
prop.put("parser_" + i + "_mime", mimeIdx); prop.put("parser_" + i + "_mime", mimeIdx);

@ -224,16 +224,12 @@ public class FTPLoader {
// if the mimetype and file extension is supported we start to download // if the mimetype and file extension is supported we start to download
// the file // the file
httpDocument htCache = null; httpDocument htCache = null;
if (!Parser.supportsExtension(entryUrl)) { String supportError = Parser.supports(entryUrl, mimeType);
// if the response has not the right file type then reject file if (supportError != null) {
log.logInfo("REJECTED WRONG EXTENSION TYPE " + mimeType + " for URL " + entry.url().toString()); // reject file
sb.crawlQueues.errorURL.newEntry(entry, this.sb.peers.mySeed().hash, new Date(), 1, "wrong extension"); log.logInfo("PARSER REJECTED URL " + entry.url().toString() + ": " + supportError);
throw new Exception("response has not the right extension type -> rejected"); sb.crawlQueues.errorURL.newEntry(entry, this.sb.peers.mySeed().hash, new Date(), 1, supportError);
} else if (!Parser.supportsMime(mimeType)) { throw new Exception(supportError);
// if the response has not the right file type then reject file
log.logInfo("REJECTED WRONG MIME TYPE " + mimeType + " for URL " + entry.url().toString());
sb.crawlQueues.errorURL.newEntry(entry, this.sb.peers.mySeed().hash, new Date(), 1, "wrong mime type");
throw new Exception("response has not the right mime type -> rejected");
} else { } else {
// abort the download if content is too long // abort the download if content is too long
final int size = ftpClient.fileSize(path); final int size = ftpClient.fileSize(path);

@ -120,8 +120,9 @@ public final class HTTPLoader {
if (port < 0) port = (ssl) ? 443 : 80; if (port < 0) port = (ssl) ? 443 : 80;
// if not the right file type then reject file // if not the right file type then reject file
if (!Parser.supportsExtension(entry.url())) { String supportError = Parser.supportsExtension(entry.url());
sb.crawlQueues.errorURL.newEntry(entry, sb.peers.mySeed().hash, new Date(), 1, "wrong extension"); if (supportError != null) {
sb.crawlQueues.errorURL.newEntry(entry, sb.peers.mySeed().hash, new Date(), 1, supportError);
throw new IOException("REJECTED WRONG EXTENSION TYPE " + entry.url().getFileExtension()+ " for URL " + entry.url().toString()); throw new IOException("REJECTED WRONG EXTENSION TYPE " + entry.url().getFileExtension()+ " for URL " + entry.url().toString());
} }
@ -166,8 +167,9 @@ public final class HTTPLoader {
//try { //try {
// if the response has not the right file type then reject file // if the response has not the right file type then reject file
if (!Parser.supportsMime(res.getResponseHeader().mime())) { supportError = Parser.supports(entry.url(), res.getResponseHeader().mime());
sb.crawlQueues.errorURL.newEntry(entry, sb.peers.mySeed().hash, new Date(), 1, "wrong mime type"); if (supportError != null) {
sb.crawlQueues.errorURL.newEntry(entry, sb.peers.mySeed().hash, new Date(), 1, supportError);
throw new IOException("REJECTED WRONG MIME TYPE " + res.getResponseHeader().mime() + " for URL " + entry.url().toString()); throw new IOException("REJECTED WRONG MIME TYPE " + res.getResponseHeader().mime() + " for URL " + entry.url().toString());
} }

@ -34,6 +34,7 @@ import java.util.Date;
import java.util.Iterator; import java.util.Iterator;
import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentHashMap;
import de.anomic.document.Parser;
import de.anomic.http.httpHeader; import de.anomic.http.httpHeader;
import de.anomic.http.httpResponseHeader; import de.anomic.http.httpResponseHeader;
import de.anomic.kelondro.index.Row; import de.anomic.kelondro.index.Row;
@ -480,8 +481,9 @@ public class IndexingStack {
if (plasmaHTCache.isPicture(mimeType)) { if (plasmaHTCache.isPicture(mimeType)) {
return "Media_Content_(Picture)"; return "Media_Content_(Picture)";
} }
if (!plasmaHTCache.isText(mimeType)) { String parserError = Parser.supportsMime(mimeType);
return "Media_Content_(not_text)"; if (parserError != null) {
return "Media_Content, no parser: " + parserError;
} }
// -if-modified-since in request // -if-modified-since in request
@ -598,7 +600,8 @@ public class IndexingStack {
if (responseHeader != null) { if (responseHeader != null) {
final String mimeType = responseHeader.mime(); final String mimeType = responseHeader.mime();
if (plasmaHTCache.isPicture(mimeType)) { return "Media_Content_(Picture)"; } if (plasmaHTCache.isPicture(mimeType)) { return "Media_Content_(Picture)"; }
if (!plasmaHTCache.isText(mimeType)) { return "Media_Content_(not_text)"; } String parserError = Parser.supportsMime(mimeType);
if (parserError != null) { return "Media_Content, parser error: " + parserError; }
} }
if (plasmaHTCache.noIndexingURL(url())) { return "Media_Content_(forbidden)"; } if (plasmaHTCache.noIndexingURL(url())) { return "Media_Content_(forbidden)"; }

@ -27,8 +27,7 @@ package de.anomic.document;
import java.io.File; import java.io.File;
import java.io.InputStream; import java.io.InputStream;
import java.util.HashMap; import java.util.Set;
import java.util.Hashtable;
import de.anomic.yacy.yacyURL; import de.anomic.yacy.yacyURL;
@ -87,11 +86,15 @@ public interface Idiom {
/** /**
* Get the MimeType(s) that are supported by the parser * Get the MimeType(s) that are supported by the parser
* @return a {@link Hashtable} containing a mapping from a mime type string * @return a set of strings denoting the supported mime types
* to a comma-separated String of file extensions
* that are supported by the idiom parser
*/ */
public HashMap<String, String> getSupportedMimeTypes(); public Set<String> supportedMimeTypes();
/**
* Get the file extension(s) that are supported by the parser.
* The implementations visible in this change register extensions without a
* leading dot (for example "pdf" or "bz2").
* @return a set of strings denoting the supported file extensions
*/
public Set<String> supportedExtensions();
/** /**
* This function should be called before reusing the parser object. * This function should be called before reusing the parser object.

@ -72,8 +72,9 @@ public final class Parser {
} }
private static final Map<String, Idiom> mime2parser = new TreeMap<String, Idiom>(insensitiveCollator); private static final Map<String, Idiom> mime2parser = new TreeMap<String, Idiom>(insensitiveCollator);
private static final Map<String, Set<String>> ext2mime = new TreeMap<String, Set<String>>(insensitiveCollator); private static final Map<String, String> ext2mime = new TreeMap<String, String>(insensitiveCollator);
private static final Set<String> denyMime = new TreeSet<String>(insensitiveCollator); private static final Set<String> denyMime = new TreeSet<String>(insensitiveCollator);
private static final Set<String> denyExtension = new TreeSet<String>(insensitiveCollator);
static { static {
initParser(new bzipParser()); initParser(new bzipParser());
@ -103,21 +104,20 @@ public final class Parser {
} }
private static void initParser(Idiom parser) { private static void initParser(Idiom parser) {
for (Map.Entry<String, String> e: parser.getSupportedMimeTypes().entrySet()) { String prototypeMime = null;
for (String mime: parser.supportedMimeTypes()) {
// process the mime types // process the mime types
final String mimeType = normalizeMimeType(e.getKey()); final String mimeType = normalizeMimeType(mime);
if (prototypeMime == null) prototypeMime = mimeType;
Idiom p0 = mime2parser.get(mimeType); Idiom p0 = mime2parser.get(mimeType);
if (p0 != null) log.logSevere("parser for mime '" + mimeType + "' was set to '" + p0.getName() + "', overwriting with new parser '" + parser.getName() + "'."); if (p0 != null) log.logSevere("parser for mime '" + mimeType + "' was set to '" + p0.getName() + "', overwriting with new parser '" + parser.getName() + "'.");
mime2parser.put(mimeType, parser); mime2parser.put(mimeType, parser);
Log.logInfo("PARSER", "Parser for mime type '" + mimeType + "': " + parser.getName()); Log.logInfo("PARSER", "Parser for mime type '" + mimeType + "': " + parser.getName());
// process the extensions if (prototypeMime != null) for (String ext: parser.supportedExtensions()) {
String[] exts = e.getValue().split(","); String s = ext2mime.get(ext);
for (String ext: exts) { if (s != null) log.logSevere("parser for extension '" + ext + "' was set to mime '" + s + "', overwriting with new mime '" + prototypeMime + "'.");
Set<String> s = ext2mime.get(ext); ext2mime.put(ext, prototypeMime);
if (s == null) s = new HashSet<String>();
s.add(mimeType);
ext2mime.put(ext, s);
} }
} }
} }
@ -148,9 +148,12 @@ public final class Parser {
} }
} }
public static Document parseSource(final yacyURL location, public static Document parseSource(
final String mimeType, final String charset, final yacyURL location,
final File sourceFile) throws InterruptedException, ParserException { final String mimeType,
final String charset,
final File sourceFile
) throws InterruptedException, ParserException {
BufferedInputStream sourceStream = null; BufferedInputStream sourceStream = null;
try { try {
@ -174,39 +177,33 @@ public final class Parser {
} }
} }
public static Document parseSource(final yacyURL location, public static Document parseSource(
String mimeType, final String charset, final yacyURL location,
final long contentLength, final InputStream sourceStream) String mimeType,
throws InterruptedException, ParserException { final String charset,
final long contentLength,
final InputStream sourceStream
) throws InterruptedException, ParserException {
try { try {
if (log.isFine()) log.logFine("Parsing '" + location + "' from stream"); if (log.isFine()) log.logFine("Parsing '" + location + "' from stream");
mimeType = normalizeMimeType(mimeType); mimeType = normalizeMimeType(mimeType);
final String fileExt = location.getFileExtension(); final String fileExt = location.getFileExtension();
final String documentCharset = htmlParser.patchCharsetEncoding(charset); final String documentCharset = htmlParser.patchCharsetEncoding(charset);
if (!supportsMime(mimeType)) { Idiom parser = idiomParser(location, mimeType);
final String errorMsg = "No parser available to parse mimetype '" + mimeType + "'";
log.logInfo("Unable to parse '" + location + "'. " + errorMsg); if (parser == null) {
throw new ParserException(errorMsg, location); final String errorMsg = "No parser available to parse extension '" + location.getFileExtension() + "' or mimetype '" + mimeType + "'";
}
if (!supportsExtension(location)) {
final String errorMsg = "No parser available to parse extension of url path";
log.logInfo("Unable to parse '" + location + "'. " + errorMsg); log.logInfo("Unable to parse '" + location + "'. " + errorMsg);
throw new ParserException(errorMsg, location); throw new ParserException(errorMsg, location);
} }
if (log.isFine()) log.logInfo("Parsing " + location + " with mimeType '" + mimeType + "' and file extension '" + fileExt + "'."); if (log.isFine()) log.logInfo("Parsing " + location + " with mimeType '" + mimeType + "' and file extension '" + fileExt + "'.");
Idiom parser = mime2parser.get(normalizeMimeType(mimeType)); parser.setContentLength(contentLength);
Document doc = null; Document doc = parser.parse(location, mimeType, documentCharset, sourceStream);
if (parser != null) {
parser.setContentLength(contentLength);
doc = parser.parse(location, mimeType, documentCharset, sourceStream);
} else {
final String errorMsg = "No parser available to parse mimetype '" + mimeType + "' (2)";
log.logInfo("Unable to parse '" + location + "'. " + errorMsg);
throw new ParserException(errorMsg, location);
}
if (doc == null) { if (doc == null) {
final String errorMsg = "Unexpected error. Parser returned null."; final String errorMsg = "Parsing content with file extension '" + location.getFileExtension() + "' and mimetype '" + mimeType + "' failed: document == null";
log.logInfo("Unable to parse '" + location + "'. " + errorMsg); log.logWarning("Unable to parse '" + location + "'. " + errorMsg);
throw new ParserException(errorMsg, location); throw new ParserException(errorMsg, location);
} }
return doc; return doc;
@ -218,16 +215,66 @@ public final class Parser {
throw new ParserException(errorMsg, location); throw new ParserException(errorMsg, location);
} }
} }
public static boolean supportsMime(String mimeType) { /**
* check if the parser supports the given content.
* @param url
* @param mimeType
* @return null if the content is supported; if the content is not supported, an error string describing the problem.
*/
public static String supports(final yacyURL url, String mimeType) {
    // null means "supported"; any other value is the reason for the rejection
    String rejectionReason = null;
    try {
        // only the lookup outcome matters here, the parser instance itself is discarded
        idiomParser(url, mimeType);
    } catch (ParserException e) {
        // no parser could be resolved: hand the explanation back to the caller
        rejectionReason = e.getMessage();
    }
    return rejectionReason;
}
/**
 * Resolve the parser responsible for the given content.
 * The lookup first uses the normalized mime type; if no parser is registered
 * for it, the file extension of the url is mapped to a mime type and the
 * lookup is repeated with that.
 * @param url the location of the content; supplies the file extension
 * @param mimeType the declared mime type, may be null
 * @return the parser for the content, never null
 * @throws ParserException if the mime type or the extension is denied, the url
 *         has no file extension, or no parser is registered for the content
 */
private static Idiom idiomParser(final yacyURL url, String mimeType) throws ParserException {
    // first attempt: look the parser up by mime type
    final boolean mimeGiven = mimeType != null;
    mimeType = normalizeMimeType(mimeType);
    // the deny list is only consulted when the caller actually supplied a mime type
    if (mimeGiven && denyMime.contains(mimeType)) {
        throw new ParserException("mime type '" + mimeType + "' is denied", url);
    }
    final Idiom byMime = mime2parser.get(mimeType);
    if (byMime != null) {
        return byMime;
    }

    // second attempt: derive a mime type from the file extension
    final String extension = url.getFileExtension();
    if (extension == null || extension.length() == 0) {
        throw new ParserException("no file extension", url);
    }
    if (denyExtension.contains(extension)) {
        throw new ParserException("file extension '" + extension + "' is denied", url);
    }
    final String derivedMime = ext2mime.get(extension);
    if (derivedMime == null) {
        throw new ParserException("no parser available", url);
    }

    final Idiom byExtension = mime2parser.get(derivedMime);
    // every mime type stored in ext2mime is expected to have a parser in mime2parser
    assert byExtension != null;
    if (byExtension == null) {
        throw new ParserException("no parser available (internal error!)", url);
    }
    return byExtension;
}
public static String supportsMime(String mimeType) {
if (mimeType == null) return null;
mimeType = normalizeMimeType(mimeType); mimeType = normalizeMimeType(mimeType);
return !denyMime.contains(mimeType) && mime2parser.containsKey(normalizeMimeType(mimeType)); if (denyMime.contains(mimeType)) return "mime type '" + mimeType + "' is denied";
if (mime2parser.get(mimeType) == null) return "no parser for mime '" + mimeType + "' available";
return null;
} }
public static boolean supportsExtension(final yacyURL url) { public static String supportsExtension(final yacyURL url) {
String ext = url.getFileExtension(); String ext = url.getFileExtension();
if (ext.length() == 0) return true; // may be anything; thats ok if the mime type is ok if (ext == null || ext.length() == 0) return null;
return ext2mime.containsKey(ext); if (denyExtension.contains(ext)) return "file extension '" + ext + "' is denied";
String mimeType = ext2mime.get(ext);
if (mimeType == null) return "no parser available";
Idiom idiom = mime2parser.get(mimeType);
assert idiom != null;
if (idiom == null) return "no parser available (internal error!)";
return null;
} }
public static String mimeOf(yacyURL url) { public static String mimeOf(yacyURL url) {
@ -235,9 +282,7 @@ public final class Parser {
} }
public static String mimeOf(String ext) { public static String mimeOf(String ext) {
Set<String> mimes = ext2mime.get(ext); return ext2mime.get(ext);
if (mimes == null) return null;
return mimes.iterator().next();
} }
private static String normalizeMimeType(String mimeType) { private static String normalizeMimeType(String mimeType) {
@ -261,4 +306,20 @@ public final class Parser {
public static void grantMime(String mime, boolean grant) { public static void grantMime(String mime, boolean grant) {
if (grant) denyMime.remove(normalizeMimeType(mime)); else denyMime.add(normalizeMimeType(mime)); if (grant) denyMime.remove(normalizeMimeType(mime)); else denyMime.add(normalizeMimeType(mime));
} }
/**
 * Replace the set of denied file extensions.
 * Entries are trimmed, and empty entries (as produced by an empty list, a
 * trailing comma or ",,") are ignored, so an empty or null list simply
 * clears the deny set.
 * @param denyList comma-separated list of file extensions, may be null or empty
 */
public static void setDenyExtension(String denyList) {
    denyExtension.clear();
    if (denyList == null) {
        return;
    }
    for (String entry: denyList.split(",")) {
        final String ext = entry.trim();
        // "".split(",") yields one empty string; never store that as an extension
        if (ext.length() > 0) {
            denyExtension.add(ext);
        }
    }
}

/**
 * Get the denied file extensions as a single string.
 * @return the denied extensions joined with ',' in the iteration order of the
 *         deny set; the empty string if no extension is denied
 */
public static String getDenyExtension() {
    final StringBuilder joined = new StringBuilder();
    boolean first = true;
    for (String ext: denyExtension) {
        // separator goes before every element but the first, so an empty set
        // needs no trailing-comma removal (which used to fail with substring(0, -1))
        if (!first) {
            joined.append(',');
        }
        joined.append(ext);
        first = false;
    }
    return joined.toString();
}
/**
 * Allow or deny a single file extension.
 * @param ext the file extension to change
 * @param grant true removes the extension from the deny set, false adds it
 */
public static void grantExtension(String ext, boolean grant) {
    if (!grant) {
        denyExtension.add(ext);
        return;
    }
    denyExtension.remove(ext);
}
} }

@ -30,7 +30,9 @@ package de.anomic.document.parser;
import java.io.File; import java.io.File;
import java.io.FileOutputStream; import java.io.FileOutputStream;
import java.io.InputStream; import java.io.InputStream;
import java.util.HashMap; import java.util.HashSet;
import java.util.Set;
import org.apache.tools.bzip2.CBZip2InputStream; import org.apache.tools.bzip2.CBZip2InputStream;
import de.anomic.document.AbstractParser; import de.anomic.document.AbstractParser;
@ -47,24 +49,31 @@ public class bzipParser extends AbstractParser implements Idiom {
* a list of mime types that are supported by this parser class * a list of mime types that are supported by this parser class
* @see #getSupportedMimeTypes() * @see #getSupportedMimeTypes()
*/ */
public static final HashMap<String, String> SUPPORTED_MIME_TYPES = new HashMap<String, String>(); public static final Set<String> SUPPORTED_MIME_TYPES = new HashSet<String>();
static final String fileExtensions = "bz2,tbz,tbz2"; public static final Set<String> SUPPORTED_EXTENSIONS = new HashSet<String>();
static { static {
SUPPORTED_MIME_TYPES.put("application/x-bzip2",fileExtensions); SUPPORTED_EXTENSIONS.add("bz2");
SUPPORTED_MIME_TYPES.put("application/bzip2", fileExtensions); SUPPORTED_EXTENSIONS.add("tbz");
SUPPORTED_MIME_TYPES.put("application/x-bz2", fileExtensions); SUPPORTED_EXTENSIONS.add("tbz2");
SUPPORTED_MIME_TYPES.put("application/x-bzip",fileExtensions); SUPPORTED_MIME_TYPES.add("application/x-bzip2");
SUPPORTED_MIME_TYPES.put("application/x-stuffit",fileExtensions); SUPPORTED_MIME_TYPES.add("application/bzip2");
SUPPORTED_MIME_TYPES.add("application/x-bz2");
SUPPORTED_MIME_TYPES.add("application/x-bzip");
SUPPORTED_MIME_TYPES.add("application/x-stuffit");
} }
public bzipParser() { public bzipParser() {
super("Bzip 2 UNIX Compressed File Parser"); super("Bzip 2 UNIX Compressed File Parser");
} }
public HashMap<String, String> getSupportedMimeTypes() { public Set<String> supportedMimeTypes() {
return SUPPORTED_MIME_TYPES; return SUPPORTED_MIME_TYPES;
} }
/**
 * Get the file extensions handled by the bzip parser.
 * @return the shared static SUPPORTED_EXTENSIONS set itself, not a copy
 */
public Set<String> supportedExtensions() {
return SUPPORTED_EXTENSIONS;
}
public Document parse(final yacyURL location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException { public Document parse(final yacyURL location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
File tempFile = null; File tempFile = null;

@ -30,7 +30,8 @@ package de.anomic.document.parser;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.io.UnsupportedEncodingException; import java.io.UnsupportedEncodingException;
import java.util.HashMap; import java.util.HashSet;
import java.util.Set;
import de.anomic.document.AbstractParser; import de.anomic.document.AbstractParser;
import de.anomic.document.Idiom; import de.anomic.document.Idiom;
@ -45,18 +46,20 @@ public class docParser extends AbstractParser implements Idiom {
* a list of mime types that are supported by this parser class * a list of mime types that are supported by this parser class
* @see #getSupportedMimeTypes() * @see #getSupportedMimeTypes()
*/ */
public static final HashMap<String, String> SUPPORTED_MIME_TYPES = new HashMap<String, String>(); public static final Set<String> SUPPORTED_MIME_TYPES = new HashSet<String>();
public static final Set<String> SUPPORTED_EXTENSIONS = new HashSet<String>();
static { static {
String ext = "doc,docx"; SUPPORTED_EXTENSIONS.add("doc");
SUPPORTED_MIME_TYPES.put("application/msword",ext); SUPPORTED_EXTENSIONS.add("docx");
SUPPORTED_MIME_TYPES.put("application/doc",ext); SUPPORTED_MIME_TYPES.add("application/msword");
SUPPORTED_MIME_TYPES.put("appl/text",ext); SUPPORTED_MIME_TYPES.add("application/doc");
SUPPORTED_MIME_TYPES.put("application/vnd.msword",ext); SUPPORTED_MIME_TYPES.add("appl/text");
SUPPORTED_MIME_TYPES.put("application/vnd.ms-word",ext); SUPPORTED_MIME_TYPES.add("application/vnd.msword");
SUPPORTED_MIME_TYPES.put("application/winword",ext); SUPPORTED_MIME_TYPES.add("application/vnd.ms-word");
SUPPORTED_MIME_TYPES.put("application/word",ext); SUPPORTED_MIME_TYPES.add("application/winword");
SUPPORTED_MIME_TYPES.put("application/x-msw6",ext); SUPPORTED_MIME_TYPES.add("application/word");
SUPPORTED_MIME_TYPES.put("application/x-msword",ext); SUPPORTED_MIME_TYPES.add("application/x-msw6");
SUPPORTED_MIME_TYPES.add("application/x-msword");
} }
public docParser() { public docParser() {
@ -115,9 +118,13 @@ public class docParser extends AbstractParser implements Idiom {
return theDoc; return theDoc;
} }
public HashMap<String, String> getSupportedMimeTypes() { public Set<String> supportedMimeTypes() {
return docParser.SUPPORTED_MIME_TYPES; return SUPPORTED_MIME_TYPES;
} }
/**
 * Get the file extensions handled by the doc parser.
 * @return the shared static SUPPORTED_EXTENSIONS set itself, not a copy
 */
public Set<String> supportedExtensions() {
return SUPPORTED_EXTENSIONS;
}
@Override @Override
public void reset() { public void reset() {

@ -30,7 +30,8 @@ package de.anomic.document.parser;
import java.io.File; import java.io.File;
import java.io.FileOutputStream; import java.io.FileOutputStream;
import java.io.InputStream; import java.io.InputStream;
import java.util.HashMap; import java.util.HashSet;
import java.util.Set;
import java.util.zip.GZIPInputStream; import java.util.zip.GZIPInputStream;
import de.anomic.document.AbstractParser; import de.anomic.document.AbstractParser;
@ -47,28 +48,31 @@ public class gzipParser extends AbstractParser implements Idiom {
* a list of mime types that are supported by this parser class * a list of mime types that are supported by this parser class
* @see #getSupportedMimeTypes() * @see #getSupportedMimeTypes()
*/ */
public static final HashMap<String, String> SUPPORTED_MIME_TYPES = new HashMap<String, String>(); public static final Set<String> SUPPORTED_MIME_TYPES = new HashSet<String>();
static final String ext = "gz,tgz"; public static final Set<String> SUPPORTED_EXTENSIONS = new HashSet<String>();
static { static {
SUPPORTED_MIME_TYPES.put("application/x-gzip",ext); SUPPORTED_EXTENSIONS.add("gz");
SUPPORTED_MIME_TYPES.put("application/gzip",ext); SUPPORTED_EXTENSIONS.add("tgz");
SUPPORTED_MIME_TYPES.put("application/x-gunzip",ext); SUPPORTED_MIME_TYPES.add("application/x-gzip");
SUPPORTED_MIME_TYPES.put("application/gzipped",ext); SUPPORTED_MIME_TYPES.add("application/gzip");
SUPPORTED_MIME_TYPES.put("application/gzip-compressed",ext); SUPPORTED_MIME_TYPES.add("application/x-gunzip");
SUPPORTED_MIME_TYPES.put("application/x-compressed",ext); SUPPORTED_MIME_TYPES.add("application/gzipped");
SUPPORTED_MIME_TYPES.put("application/x-compress",ext); SUPPORTED_MIME_TYPES.add("application/gzip-compressed");
SUPPORTED_MIME_TYPES.put("gzip/document",ext); SUPPORTED_MIME_TYPES.add("gzip/document");
SUPPORTED_MIME_TYPES.put("application/octet-stream",ext);
} }
public gzipParser() { public gzipParser() {
super("GNU Zip Compressed Archive Parser"); super("GNU Zip Compressed Archive Parser");
} }
public HashMap<String, String> getSupportedMimeTypes() { public Set<String> supportedMimeTypes() {
return SUPPORTED_MIME_TYPES; return SUPPORTED_MIME_TYPES;
} }
/**
 * Get the file extensions handled by the gzip parser.
 * @return the shared static SUPPORTED_EXTENSIONS set itself, not a copy
 */
public Set<String> supportedExtensions() {
return SUPPORTED_EXTENSIONS;
}
public Document parse(final yacyURL location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException { public Document parse(final yacyURL location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
File tempFile = null; File tempFile = null;

@ -31,7 +31,9 @@ import java.io.InputStream;
import java.nio.charset.Charset; import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException; import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException; import java.nio.charset.UnsupportedCharsetException;
import java.util.HashMap; import java.util.HashSet;
import java.util.Set;
import de.anomic.document.AbstractParser; import de.anomic.document.AbstractParser;
import de.anomic.document.Document; import de.anomic.document.Document;
import de.anomic.document.Idiom; import de.anomic.document.Idiom;
@ -48,17 +50,29 @@ public class htmlParser extends AbstractParser implements Idiom {
* a list of mime types that are supported by this parser class * a list of mime types that are supported by this parser class
* @see #getSupportedMimeTypes() * @see #getSupportedMimeTypes()
*/ */
public static final HashMap<String, String> SUPPORTED_MIME_TYPES = new HashMap<String, String>(); public static final Set<String> SUPPORTED_MIME_TYPES = new HashSet<String>();
public static final Set<String> SUPPORTED_EXTENSIONS = new HashSet<String>();
static { static {
String ext = "htm,html,shtml,xhtml,php,asp,aspx,txt,jsp,csv,pl,py"; SUPPORTED_EXTENSIONS.add("htm");
SUPPORTED_MIME_TYPES.put("application/xhtml+xml", ext); SUPPORTED_EXTENSIONS.add("html");
SUPPORTED_MIME_TYPES.put("text/html", ext); SUPPORTED_EXTENSIONS.add("shtml");
SUPPORTED_MIME_TYPES.put("text/plain", ext); SUPPORTED_EXTENSIONS.add("xhtml");
SUPPORTED_MIME_TYPES.put("text/sgml",ext); SUPPORTED_EXTENSIONS.add("php");
SUPPORTED_EXTENSIONS.add("asp");
SUPPORTED_EXTENSIONS.add("aspx");
SUPPORTED_EXTENSIONS.add("txt");
SUPPORTED_EXTENSIONS.add("jsp");
SUPPORTED_EXTENSIONS.add("csv");
SUPPORTED_EXTENSIONS.add("pl");
SUPPORTED_EXTENSIONS.add("py");
SUPPORTED_MIME_TYPES.add("application/xhtml+xml");
SUPPORTED_MIME_TYPES.add("text/html");
SUPPORTED_MIME_TYPES.add("text/plain");
SUPPORTED_MIME_TYPES.add("text/sgml");
} }
public htmlParser() { public htmlParser() {
super("streaming html parser"); super("HTML Parser");
} }
@Override @Override
@ -213,9 +227,12 @@ public class htmlParser extends AbstractParser implements Idiom {
return encoding; return encoding;
} }
public Set<String> supportedMimeTypes() {
public HashMap<String, String> getSupportedMimeTypes() {
return SUPPORTED_MIME_TYPES; return SUPPORTED_MIME_TYPES;
} }
/**
 * Get the file extensions handled by the html parser.
 * @return the shared static SUPPORTED_EXTENSIONS set itself, not a copy
 */
public Set<String> supportedExtensions() {
return SUPPORTED_EXTENSIONS;
}
} }

@ -35,7 +35,6 @@ import java.io.OutputStreamWriter;
import java.io.Writer; import java.io.Writer;
import java.nio.charset.Charset; import java.nio.charset.Charset;
import java.util.Enumeration; import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet; import java.util.HashSet;
import java.util.Set; import java.util.Set;
import java.util.zip.ZipEntry; import java.util.zip.ZipEntry;
@ -63,20 +62,30 @@ public class odtParser extends AbstractParser implements Idiom {
* a list of mime types that are supported by this parser class * a list of mime types that are supported by this parser class
* @see #getSupportedMimeTypes() * @see #getSupportedMimeTypes()
*/ */
public static final HashMap<String, String> SUPPORTED_MIME_TYPES = new HashMap<String, String>(); public static final Set<String> SUPPORTED_MIME_TYPES = new HashSet<String>();
static { public static final Set<String> SUPPORTED_EXTENSIONS = new HashSet<String>();
SUPPORTED_MIME_TYPES.put("application/vnd.oasis.opendocument.text","odt,ods,odp"); static {
SUPPORTED_MIME_TYPES.put("application/x-vnd.oasis.opendocument.text","odt,ods,odp"); SUPPORTED_EXTENSIONS.add("odt");
} SUPPORTED_EXTENSIONS.add("ods");
SUPPORTED_EXTENSIONS.add("odp");
SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.text");
SUPPORTED_MIME_TYPES.add("application/x-vnd.oasis.opendocument.text");
SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.presentation");
SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.spreadsheet");
}
public odtParser() { public odtParser() {
super("OASIS OpenDocument V2 Text Document Parser"); super("OASIS OpenDocument V2 Text Document Parser");
} }
public HashMap<String, String> getSupportedMimeTypes() { public Set<String> supportedMimeTypes() {
return SUPPORTED_MIME_TYPES; return SUPPORTED_MIME_TYPES;
} }
/**
 * Get the file extensions handled by the odt parser.
 * @return the shared static SUPPORTED_EXTENSIONS set itself, not a copy
 */
public Set<String> supportedExtensions() {
return SUPPORTED_EXTENSIONS;
}
@Override @Override
public Document parse(final yacyURL location, final String mimeType, final String charset, final File dest) throws ParserException, InterruptedException { public Document parse(final yacyURL location, final String mimeType, final String charset, final File dest) throws ParserException, InterruptedException {

@ -33,7 +33,9 @@ import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.io.OutputStreamWriter; import java.io.OutputStreamWriter;
import java.io.Writer; import java.io.Writer;
import java.util.HashMap; import java.util.HashSet;
import java.util.Set;
import org.pdfbox.pdfparser.PDFParser; import org.pdfbox.pdfparser.PDFParser;
import org.pdfbox.pdmodel.PDDocument; import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.pdmodel.PDDocumentInformation; import org.pdfbox.pdmodel.PDDocumentInformation;
@ -55,24 +57,30 @@ public class pdfParser extends AbstractParser implements Idiom {
* a list of mime types that are supported by this parser class * a list of mime types that are supported by this parser class
* @see #getSupportedMimeTypes() * @see #getSupportedMimeTypes()
*/ */
public static final HashMap<String, String> SUPPORTED_MIME_TYPES = new HashMap<String, String>(); public static final Set<String> SUPPORTED_MIME_TYPES = new HashSet<String>();
public static final Set<String> SUPPORTED_EXTENSIONS = new HashSet<String>();
static { static {
SUPPORTED_MIME_TYPES.put("application/pdf","pdf"); SUPPORTED_EXTENSIONS.add("pdf");
SUPPORTED_MIME_TYPES.put("application/x-pdf","pdf"); SUPPORTED_MIME_TYPES.add("application/pdf");
SUPPORTED_MIME_TYPES.put("application/acrobat","pdf"); SUPPORTED_MIME_TYPES.add("application/x-pdf");
SUPPORTED_MIME_TYPES.put("applications/vnd.pdf","pdf"); SUPPORTED_MIME_TYPES.add("application/acrobat");
SUPPORTED_MIME_TYPES.put("text/pdf","pdf"); SUPPORTED_MIME_TYPES.add("applications/vnd.pdf");
SUPPORTED_MIME_TYPES.put("text/x-pdf","pdf"); SUPPORTED_MIME_TYPES.add("text/pdf");
SUPPORTED_MIME_TYPES.add("text/x-pdf");
} }
public pdfParser() { public pdfParser() {
super("Acrobat Portable Document Parser"); super("Acrobat Portable Document Parser");
} }
public HashMap<String, String> getSupportedMimeTypes() { public Set<String> supportedMimeTypes() {
return SUPPORTED_MIME_TYPES; return SUPPORTED_MIME_TYPES;
} }
public Set<String> supportedExtensions() {
return SUPPORTED_EXTENSIONS;
}
public Document parse(final yacyURL location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException { public Document parse(final yacyURL location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
PDDocument theDocument = null; PDDocument theDocument = null;

@ -29,7 +29,9 @@ package de.anomic.document.parser;
import java.io.BufferedInputStream; import java.io.BufferedInputStream;
import java.io.InputStream; import java.io.InputStream;
import java.util.HashMap; import java.util.HashSet;
import java.util.Set;
import org.apache.poi.hslf.extractor.PowerPointExtractor; import org.apache.poi.hslf.extractor.PowerPointExtractor;
import de.anomic.document.AbstractParser; import de.anomic.document.AbstractParser;
@ -44,17 +46,20 @@ public class pptParser extends AbstractParser implements Idiom {
* a list of mime types that are supported by this parser class * a list of mime types that are supported by this parser class
* @see #getSupportedMimeTypes() * @see #getSupportedMimeTypes()
*/ */
public static final HashMap<String, String> SUPPORTED_MIME_TYPES = new HashMap<String, String>(); public static final Set<String> SUPPORTED_MIME_TYPES = new HashSet<String>();
static final String ext = "ppt,pptx,pps"; public static final Set<String> SUPPORTED_EXTENSIONS = new HashSet<String>();
static { static {
SUPPORTED_MIME_TYPES.put("application/mspowerpoint",ext); SUPPORTED_EXTENSIONS.add("ppt");
SUPPORTED_MIME_TYPES.put("application/powerpoint",ext); SUPPORTED_EXTENSIONS.add("pptx");
SUPPORTED_MIME_TYPES.put("application/vnd.ms-powerpoint",ext); SUPPORTED_EXTENSIONS.add("pps");
SUPPORTED_MIME_TYPES.put("application/ms-powerpoint",ext); SUPPORTED_MIME_TYPES.add("application/mspowerpoint");
SUPPORTED_MIME_TYPES.put("application/mspowerpnt",ext); SUPPORTED_MIME_TYPES.add("application/powerpoint");
SUPPORTED_MIME_TYPES.put("application/vnd-mspowerpoint",ext); SUPPORTED_MIME_TYPES.add("application/vnd.ms-powerpoint");
SUPPORTED_MIME_TYPES.put("application/x-powerpoint",ext); SUPPORTED_MIME_TYPES.add("application/ms-powerpoint");
SUPPORTED_MIME_TYPES.put("application/x-m",ext); SUPPORTED_MIME_TYPES.add("application/mspowerpnt");
SUPPORTED_MIME_TYPES.add("application/vnd-mspowerpoint");
SUPPORTED_MIME_TYPES.add("application/x-powerpoint");
SUPPORTED_MIME_TYPES.add("application/x-m");
} }
public pptParser(){ public pptParser(){
@ -114,9 +119,13 @@ public class pptParser extends AbstractParser implements Idiom {
} }
} }
public HashMap<String, String> getSupportedMimeTypes() { public Set<String> supportedMimeTypes() {
return SUPPORTED_MIME_TYPES; return SUPPORTED_MIME_TYPES;
} }
public Set<String> supportedExtensions() {
return SUPPORTED_EXTENSIONS;
}
@Override @Override
public void reset(){ public void reset(){

@ -34,7 +34,9 @@ import java.io.FileReader;
import java.io.FileWriter; import java.io.FileWriter;
import java.io.InputStream; import java.io.InputStream;
import java.io.InputStreamReader; import java.io.InputStreamReader;
import java.util.HashMap; import java.util.HashSet;
import java.util.Set;
import de.anomic.document.AbstractParser; import de.anomic.document.AbstractParser;
import de.anomic.document.Idiom; import de.anomic.document.Idiom;
import de.anomic.document.ParserException; import de.anomic.document.ParserException;
@ -48,12 +50,14 @@ public class psParser extends AbstractParser implements Idiom {
* a list of mime types that are supported by this parser class * a list of mime types that are supported by this parser class
* @see #getSupportedMimeTypes() * @see #getSupportedMimeTypes()
*/ */
public static final HashMap<String, String> SUPPORTED_MIME_TYPES = new HashMap<String, String>(); public static final Set<String> SUPPORTED_MIME_TYPES = new HashSet<String>();
static { public static final Set<String> SUPPORTED_EXTENSIONS = new HashSet<String>();
SUPPORTED_MIME_TYPES.put("application/ps","ps"); static {
SUPPORTED_MIME_TYPES.put("application/x-postscript","ps"); SUPPORTED_EXTENSIONS.add("ps");
SUPPORTED_MIME_TYPES.put("application/x-ps","ps"); SUPPORTED_MIME_TYPES.add("application/ps");
SUPPORTED_MIME_TYPES.put("application/x-postscript-not-eps","ps"); SUPPORTED_MIME_TYPES.add("application/x-postscript");
SUPPORTED_MIME_TYPES.add("application/x-ps");
SUPPORTED_MIME_TYPES.add("application/x-postscript-not-eps");
} }
private final static Object modeScan = new Object(); private final static Object modeScan = new Object();
@ -69,10 +73,14 @@ public class psParser extends AbstractParser implements Idiom {
} }
} }
public HashMap<String, String> getSupportedMimeTypes() { public Set<String> supportedMimeTypes() {
return SUPPORTED_MIME_TYPES; return SUPPORTED_MIME_TYPES;
} }
public Set<String> supportedExtensions() {
return SUPPORTED_EXTENSIONS;
}
public boolean testForPs2Ascii() { public boolean testForPs2Ascii() {
try { try {
String procOutputLine = null; String procOutputLine = null;

@ -31,6 +31,9 @@ import java.io.ByteArrayInputStream;
import java.io.File; import java.io.File;
import java.io.InputStream; import java.io.InputStream;
import java.util.HashMap; import java.util.HashMap;
import java.util.HashSet;
import java.util.Set;
import com.jguild.jrpm.io.RPMFile; import com.jguild.jrpm.io.RPMFile;
import com.jguild.jrpm.io.datatype.DataTypeIf; import com.jguild.jrpm.io.datatype.DataTypeIf;
@ -55,21 +58,27 @@ public class rpmParser extends AbstractParser implements Idiom {
* a list of mime types that are supported by this parser class * a list of mime types that are supported by this parser class
* @see #getSupportedMimeTypes() * @see #getSupportedMimeTypes()
*/ */
public static final HashMap<String, String> SUPPORTED_MIME_TYPES = new HashMap<String, String>(); public static final Set<String> SUPPORTED_MIME_TYPES = new HashSet<String>();
static { public static final Set<String> SUPPORTED_EXTENSIONS = new HashSet<String>();
SUPPORTED_MIME_TYPES.put("application/x-rpm","rpm"); static {
SUPPORTED_MIME_TYPES.put("application/x-redhat packet manager","rpm"); SUPPORTED_EXTENSIONS.add("rpm");
SUPPORTED_MIME_TYPES.put("application/x-redhat-package-manager","rpm"); SUPPORTED_MIME_TYPES.add("application/x-rpm");
SUPPORTED_MIME_TYPES.add("application/x-redhat packet manager");
SUPPORTED_MIME_TYPES.add("application/x-redhat-package-manager");
} }
public rpmParser() { public rpmParser() {
super("rpm Parser"); super("rpm Parser");
} }
public HashMap<String, String> getSupportedMimeTypes() { public Set<String> supportedMimeTypes() {
return SUPPORTED_MIME_TYPES; return SUPPORTED_MIME_TYPES;
} }
public Set<String> supportedExtensions() {
return SUPPORTED_EXTENSIONS;
}
public Document parse(final yacyURL location, final String mimeType, final String charset, public Document parse(final yacyURL location, final String mimeType, final String charset,
final InputStream source) throws ParserException { final InputStream source) throws ParserException {
File dstFile = null; File dstFile = null;

@ -33,8 +33,10 @@ import java.io.InputStream;
import java.io.Writer; import java.io.Writer;
import java.nio.charset.Charset; import java.nio.charset.Charset;
import java.util.HashMap; import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList; import java.util.LinkedList;
import java.util.Map; import java.util.Map;
import java.util.Set;
import de.anomic.content.RSSMessage; import de.anomic.content.RSSMessage;
import de.anomic.document.AbstractParser; import de.anomic.document.AbstractParser;
@ -58,13 +60,16 @@ public class rssParser extends AbstractParser implements Idiom {
* a list of mime types that are supported by this parser class * a list of mime types that are supported by this parser class
* @see #getSupportedMimeTypes() * @see #getSupportedMimeTypes()
*/ */
public static final HashMap<String, String> SUPPORTED_MIME_TYPES = new HashMap<String, String>(); public static final Set<String> SUPPORTED_MIME_TYPES = new HashSet<String>();
static final String fileExtensions = "xml,rss,rdf"; public static final Set<String> SUPPORTED_EXTENSIONS = new HashSet<String>();
static { static {
SUPPORTED_MIME_TYPES.put("text/rss",fileExtensions); SUPPORTED_EXTENSIONS.add("xml");
SUPPORTED_MIME_TYPES.put("application/rdf+xml",fileExtensions); SUPPORTED_EXTENSIONS.add("rss");
SUPPORTED_MIME_TYPES.put("application/rss+xml",fileExtensions); SUPPORTED_EXTENSIONS.add("rdf");
SUPPORTED_MIME_TYPES.put("application/atom+xml",fileExtensions); SUPPORTED_MIME_TYPES.add("text/rss");
SUPPORTED_MIME_TYPES.add("application/rdf+xml");
SUPPORTED_MIME_TYPES.add("application/rss+xml");
SUPPORTED_MIME_TYPES.add("application/atom+xml");
} }
public rssParser() { public rssParser() {
@ -174,9 +179,13 @@ public class rssParser extends AbstractParser implements Idiom {
} }
} }
public HashMap<String, String> getSupportedMimeTypes() { public Set<String> supportedMimeTypes() {
return SUPPORTED_MIME_TYPES; return SUPPORTED_MIME_TYPES;
} }
public Set<String> supportedExtensions() {
return SUPPORTED_EXTENSIONS;
}
@Override @Override
public void reset() { public void reset() {

@ -28,7 +28,9 @@
package de.anomic.document.parser; package de.anomic.document.parser;
import java.io.InputStream; import java.io.InputStream;
import java.util.HashMap; import java.util.HashSet;
import java.util.Set;
import javax.swing.text.DefaultStyledDocument; import javax.swing.text.DefaultStyledDocument;
import javax.swing.text.rtf.RTFEditorKit; import javax.swing.text.rtf.RTFEditorKit;
@ -44,13 +46,15 @@ public class rtfParser extends AbstractParser implements Idiom {
* a list of mime types that are supported by this parser class * a list of mime types that are supported by this parser class
* @see #getSupportedMimeTypes() * @see #getSupportedMimeTypes()
*/ */
public static final HashMap<String, String> SUPPORTED_MIME_TYPES = new HashMap<String, String>(); public static final Set<String> SUPPORTED_MIME_TYPES = new HashSet<String>();
static { public static final Set<String> SUPPORTED_EXTENSIONS = new HashSet<String>();
SUPPORTED_MIME_TYPES.put("application/rtf","rtf"); static {
SUPPORTED_MIME_TYPES.put("text/rtf","rtf"); SUPPORTED_EXTENSIONS.add("rtf");
SUPPORTED_MIME_TYPES.put("application/x-rtf","rtf"); SUPPORTED_MIME_TYPES.add("application/rtf");
SUPPORTED_MIME_TYPES.put("text/richtext","rtf"); SUPPORTED_MIME_TYPES.add("text/rtf");
SUPPORTED_MIME_TYPES.put("application/x-soffice","rtf"); SUPPORTED_MIME_TYPES.add("application/x-rtf");
SUPPORTED_MIME_TYPES.add("text/richtext");
SUPPORTED_MIME_TYPES.add("application/x-soffice");
} }
public rtfParser() { public rtfParser() {
@ -96,9 +100,13 @@ public class rtfParser extends AbstractParser implements Idiom {
} }
} }
public HashMap<String, String> getSupportedMimeTypes() { public Set<String> supportedMimeTypes() {
return rtfParser.SUPPORTED_MIME_TYPES; return SUPPORTED_MIME_TYPES;
} }
public Set<String> supportedExtensions() {
return SUPPORTED_EXTENSIONS;
}
public void reset() { public void reset() {
// Nothing todo here at the moment // Nothing todo here at the moment

@ -32,7 +32,9 @@ import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.io.OutputStream; import java.io.OutputStream;
import java.util.HashMap; import java.util.HashSet;
import java.util.Set;
import SevenZip.ArchiveExtractCallback; import SevenZip.ArchiveExtractCallback;
import SevenZip.IInStream; import SevenZip.IInStream;
import SevenZip.MyRandomAccessFile; import SevenZip.MyRandomAccessFile;
@ -55,9 +57,11 @@ public class sevenzipParser extends AbstractParser implements Idiom {
* a list of mime types that are supported by this parser class * a list of mime types that are supported by this parser class
* @see #getSupportedMimeTypes() * @see #getSupportedMimeTypes()
*/ */
public static final HashMap<String, String> SUPPORTED_MIME_TYPES = new HashMap<String, String>(); public static final Set<String> SUPPORTED_MIME_TYPES = new HashSet<String>();
static { public static final Set<String> SUPPORTED_EXTENSIONS = new HashSet<String>();
SUPPORTED_MIME_TYPES.put("application/x-7z-compressed", "7z"); static {
SUPPORTED_EXTENSIONS.add("7z");
SUPPORTED_MIME_TYPES.add("application/x-7z-compressed");
} }
public sevenzipParser() { public sevenzipParser() {
@ -124,10 +128,14 @@ public class sevenzipParser extends AbstractParser implements Idiom {
} }
} }
public HashMap<String, String> getSupportedMimeTypes() { public Set<String> supportedMimeTypes() {
return SUPPORTED_MIME_TYPES; return SUPPORTED_MIME_TYPES;
} }
public Set<String> supportedExtensions() {
return SUPPORTED_EXTENSIONS;
}
// wrapper class to redirect output of standard ArchiveExtractCallback to serverLog // wrapper class to redirect output of standard ArchiveExtractCallback to serverLog
// and parse the extracted content // and parse the extracted content

@ -29,6 +29,9 @@ package de.anomic.document.parser;
import java.io.InputStream; import java.io.InputStream;
import java.util.HashMap; import java.util.HashMap;
import java.util.HashSet;
import java.util.Set;
import pt.tumba.parser.swf.SWF2HTML; import pt.tumba.parser.swf.SWF2HTML;
import de.anomic.document.AbstractParser; import de.anomic.document.AbstractParser;
import de.anomic.document.Idiom; import de.anomic.document.Idiom;
@ -42,24 +45,27 @@ public class swfParser extends AbstractParser implements Idiom {
* a list of mime types that are supported by this parser class * a list of mime types that are supported by this parser class
* @see #getSupportedMimeTypes() * @see #getSupportedMimeTypes()
*/ */
public static final HashMap<String, String> SUPPORTED_MIME_TYPES = new HashMap<String, String>(); public static final Set<String> SUPPORTED_MIME_TYPES = new HashSet<String>();
public static final Set<String> SUPPORTED_EXTENSIONS = new HashSet<String>();
static { static {
SUPPORTED_MIME_TYPES.put("application/x-shockwave-flash","swf"); SUPPORTED_EXTENSIONS.add("swf");
SUPPORTED_MIME_TYPES.put("application/x-shockwave-flash2-preview","swf"); SUPPORTED_MIME_TYPES.add("application/x-shockwave-flash");
SUPPORTED_MIME_TYPES.put("application/futuresplash","swf"); SUPPORTED_MIME_TYPES.add("application/x-shockwave-flash2-preview");
SUPPORTED_MIME_TYPES.put("image/vnd.rn-realflash","swf"); SUPPORTED_MIME_TYPES.add("application/futuresplash");
SUPPORTED_MIME_TYPES.add("image/vnd.rn-realflash");
} }
public swfParser() { public swfParser() {
super("Adobe Flash Parser"); super("Adobe Flash Parser");
} }
/** public Set<String> supportedMimeTypes() {
* returns a hashtable containing the mimetypes that are supported by this class
*/
public HashMap<String, String> getSupportedMimeTypes() {
return SUPPORTED_MIME_TYPES; return SUPPORTED_MIME_TYPES;
} }
public Set<String> supportedExtensions() {
return SUPPORTED_EXTENSIONS;
}
/* /*
* parses the source documents and returns a plasmaParserDocument containing * parses the source documents and returns a plasmaParserDocument containing

@ -34,8 +34,10 @@ import java.io.InputStream;
import java.io.OutputStream; import java.io.OutputStream;
import java.util.Arrays; import java.util.Arrays;
import java.util.HashMap; import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList; import java.util.LinkedList;
import java.util.Map; import java.util.Map;
import java.util.Set;
import java.util.zip.GZIPInputStream; import java.util.zip.GZIPInputStream;
import com.ice.tar.TarEntry; import com.ice.tar.TarEntry;
@ -58,22 +60,28 @@ public class tarParser extends AbstractParser implements Idiom {
* a list of mime types that are supported by this parser class * a list of mime types that are supported by this parser class
* @see #getSupportedMimeTypes() * @see #getSupportedMimeTypes()
*/ */
public static final HashMap<String, String> SUPPORTED_MIME_TYPES = new HashMap<String, String>(); public static final Set<String> SUPPORTED_MIME_TYPES = new HashSet<String>();
static { public static final Set<String> SUPPORTED_EXTENSIONS = new HashSet<String>();
SUPPORTED_MIME_TYPES.put("application/x-tar","tar"); static {
SUPPORTED_MIME_TYPES.put("application/tar","tar"); SUPPORTED_EXTENSIONS.add("tar");
SUPPORTED_MIME_TYPES.put("applicaton/x-gtar","tar"); SUPPORTED_MIME_TYPES.add("application/x-tar");
SUPPORTED_MIME_TYPES.put("multipart/x-tar","tar"); SUPPORTED_MIME_TYPES.add("application/tar");
SUPPORTED_MIME_TYPES.add("applicaton/x-gtar");
SUPPORTED_MIME_TYPES.add("multipart/x-tar");
} }
public tarParser() { public tarParser() {
super("Tape Archive File Parser"); super("Tape Archive File Parser");
} }
public HashMap<String, String> getSupportedMimeTypes() { public Set<String> supportedMimeTypes() {
return SUPPORTED_MIME_TYPES; return SUPPORTED_MIME_TYPES;
} }
public Set<String> supportedExtensions() {
return SUPPORTED_EXTENSIONS;
}
public Document parse(final yacyURL location, final String mimeType, final String charset, InputStream source) throws ParserException, InterruptedException { public Document parse(final yacyURL location, final String mimeType, final String charset, InputStream source) throws ParserException, InterruptedException {
long docTextLength = 0; long docTextLength = 0;

@ -33,8 +33,10 @@ import java.io.InputStream;
import java.io.InputStreamReader; import java.io.InputStreamReader;
import java.net.MalformedURLException; import java.net.MalformedURLException;
import java.util.HashMap; import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator; import java.util.Iterator;
import java.util.LinkedList; import java.util.LinkedList;
import java.util.Set;
import de.anomic.crawler.HTTPLoader; import de.anomic.crawler.HTTPLoader;
import de.anomic.document.AbstractParser; import de.anomic.document.AbstractParser;
@ -60,25 +62,31 @@ public class vcfParser extends AbstractParser implements Idiom {
* *
* TODO: support of x-mozilla-cpt and x-mozilla-html tags * TODO: support of x-mozilla-cpt and x-mozilla-html tags
*/ */
public static final HashMap<String, String> SUPPORTED_MIME_TYPES = new HashMap<String, String>(); public static final Set<String> SUPPORTED_MIME_TYPES = new HashSet<String>();
static { public static final Set<String> SUPPORTED_EXTENSIONS = new HashSet<String>();
SUPPORTED_MIME_TYPES.put("text/x-vcard","vcf"); static {
SUPPORTED_MIME_TYPES.put("application/vcard","vcf"); SUPPORTED_EXTENSIONS.add("vcf");
SUPPORTED_MIME_TYPES.put("text/anytext","vcf"); SUPPORTED_MIME_TYPES.add("text/x-vcard");
SUPPORTED_MIME_TYPES.put("text/directory","vcf"); SUPPORTED_MIME_TYPES.add("application/vcard");
SUPPORTED_MIME_TYPES.put("application/x-versit","vcf"); SUPPORTED_MIME_TYPES.add("text/anytext");
SUPPORTED_MIME_TYPES.put("text/x-versit","vcf"); SUPPORTED_MIME_TYPES.add("text/directory");
SUPPORTED_MIME_TYPES.put("text/x-vcalendar","vcf"); SUPPORTED_MIME_TYPES.add("application/x-versit");
SUPPORTED_MIME_TYPES.add("text/x-versit");
SUPPORTED_MIME_TYPES.add("text/x-vcalendar");
} }
public vcfParser() { public vcfParser() {
super("vCard Parser"); super("vCard Parser");
} }
public HashMap<String, String> getSupportedMimeTypes() { public Set<String> supportedMimeTypes() {
return SUPPORTED_MIME_TYPES; return SUPPORTED_MIME_TYPES;
} }
public Set<String> supportedExtensions() {
return SUPPORTED_EXTENSIONS;
}
public Document parse(final yacyURL url, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException { public Document parse(final yacyURL url, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
try { try {

@ -28,7 +28,9 @@
package de.anomic.document.parser; package de.anomic.document.parser;
import java.io.InputStream; import java.io.InputStream;
import java.util.HashMap; import java.util.HashSet;
import java.util.Set;
import de.anomic.document.AbstractParser; import de.anomic.document.AbstractParser;
import de.anomic.document.Idiom; import de.anomic.document.Idiom;
import de.anomic.document.ParserException; import de.anomic.document.ParserException;
@ -43,29 +45,32 @@ public class vsdParser extends AbstractParser implements Idiom {
* a list of mime types that are supported by this parser class * a list of mime types that are supported by this parser class
* @see #getSupportedMimeTypes() * @see #getSupportedMimeTypes()
*/ */
public static final HashMap<String, String> SUPPORTED_MIME_TYPES = new HashMap<String, String>(); public static final Set<String> SUPPORTED_MIME_TYPES = new HashSet<String>();
public static final Set<String> SUPPORTED_EXTENSIONS = new HashSet<String>();
static { static {
SUPPORTED_MIME_TYPES.put("application/visio","vsd"); SUPPORTED_EXTENSIONS.add("vsd");
SUPPORTED_MIME_TYPES.put("application/x-visio","vsd"); SUPPORTED_MIME_TYPES.add("application/visio");
SUPPORTED_MIME_TYPES.put("application/vnd.visio","vsd"); SUPPORTED_MIME_TYPES.add("application/x-visio");
SUPPORTED_MIME_TYPES.put("application/visio.drawing","vsd"); SUPPORTED_MIME_TYPES.add("application/vnd.visio");
SUPPORTED_MIME_TYPES.put("application/vsd","vsd"); SUPPORTED_MIME_TYPES.add("application/visio.drawing");
SUPPORTED_MIME_TYPES.put("application/x-vsd","vsd"); SUPPORTED_MIME_TYPES.add("application/vsd");
SUPPORTED_MIME_TYPES.put("image/x-vsd","vsd"); SUPPORTED_MIME_TYPES.add("application/x-vsd");
SUPPORTED_MIME_TYPES.put("zz-application/zz-winassoc-vsd","vsd"); SUPPORTED_MIME_TYPES.add("image/x-vsd");
SUPPORTED_MIME_TYPES.add("zz-application/zz-winassoc-vsd");
} }
public vsdParser() { public vsdParser() {
super("Microsoft Visio Parser"); super("Microsoft Visio Parser");
} }
/** public Set<String> supportedMimeTypes() {
* returns a hashtable containing the mimetypes that are supported by this class
*/
public HashMap<String, String> getSupportedMimeTypes() {
return SUPPORTED_MIME_TYPES; return SUPPORTED_MIME_TYPES;
} }
public Set<String> supportedExtensions() {
return SUPPORTED_EXTENSIONS;
}
/* /*
* parses the source documents and returns a plasmaParserDocument containing * parses the source documents and returns a plasmaParserDocument containing
* all extracted information about the parsed document * all extracted information about the parsed document

@ -28,7 +28,9 @@
package de.anomic.document.parser; package de.anomic.document.parser;
import java.io.InputStream; import java.io.InputStream;
import java.util.HashMap; import java.util.HashSet;
import java.util.Set;
import org.apache.poi.hssf.eventusermodel.HSSFEventFactory; import org.apache.poi.hssf.eventusermodel.HSSFEventFactory;
import org.apache.poi.hssf.eventusermodel.HSSFListener; import org.apache.poi.hssf.eventusermodel.HSSFListener;
import org.apache.poi.hssf.eventusermodel.HSSFRequest; import org.apache.poi.hssf.eventusermodel.HSSFRequest;
@ -56,17 +58,19 @@ public class xlsParser extends AbstractParser implements Idiom, HSSFListener {
* a list of mime types that are supported by this parser class * a list of mime types that are supported by this parser class
* @see #getSupportedMimeTypes() * @see #getSupportedMimeTypes()
*/ */
public static final HashMap<String, String> SUPPORTED_MIME_TYPES = new HashMap<String, String>(); public static final Set<String> SUPPORTED_MIME_TYPES = new HashSet<String>();
public static final Set<String> SUPPORTED_EXTENSIONS = new HashSet<String>();
static { static {
String ext = "xls,xlsx"; SUPPORTED_EXTENSIONS.add("xls");
SUPPORTED_MIME_TYPES.put("application/msexcel",ext); SUPPORTED_EXTENSIONS.add("xlsx");
SUPPORTED_MIME_TYPES.put("application/excel",ext); SUPPORTED_MIME_TYPES.add("application/msexcel");
SUPPORTED_MIME_TYPES.put("application/vnd.ms-excel",ext); SUPPORTED_MIME_TYPES.add("application/excel");
SUPPORTED_MIME_TYPES.put("application/x-excel",ext); SUPPORTED_MIME_TYPES.add("application/vnd.ms-excel");
SUPPORTED_MIME_TYPES.put("application/x-msexcel",ext); SUPPORTED_MIME_TYPES.add("application/x-excel");
SUPPORTED_MIME_TYPES.put("application/x-ms-excel",ext); SUPPORTED_MIME_TYPES.add("application/x-msexcel");
SUPPORTED_MIME_TYPES.put("application/x-dos_ms_excel",ext); SUPPORTED_MIME_TYPES.add("application/x-ms-excel");
SUPPORTED_MIME_TYPES.put("application/xls",ext); SUPPORTED_MIME_TYPES.add("application/x-dos_ms_excel");
SUPPORTED_MIME_TYPES.add("application/xls");
} }
public xlsParser(){ public xlsParser(){
@ -134,9 +138,13 @@ public class xlsParser extends AbstractParser implements Idiom, HSSFListener {
} }
} }
public HashMap<String, String> getSupportedMimeTypes() { public Set<String> supportedMimeTypes() {
return SUPPORTED_MIME_TYPES; return SUPPORTED_MIME_TYPES;
} }
public Set<String> supportedExtensions() {
return SUPPORTED_EXTENSIONS;
}
@Override @Override
public void reset(){ public void reset(){

@ -34,8 +34,10 @@ import java.io.InputStream;
import java.io.OutputStream; import java.io.OutputStream;
import java.util.Arrays; import java.util.Arrays;
import java.util.HashMap; import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList; import java.util.LinkedList;
import java.util.Map; import java.util.Map;
import java.util.Set;
import java.util.zip.ZipEntry; import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream; import java.util.zip.ZipInputStream;
@ -56,26 +58,31 @@ public class zipParser extends AbstractParser implements Idiom {
* a list of mime types that are supported by this parser class * a list of mime types that are supported by this parser class
* @see #getSupportedMimeTypes() * @see #getSupportedMimeTypes()
*/ */
public static final HashMap<String, String> SUPPORTED_MIME_TYPES = new HashMap<String, String>(); public static final Set<String> SUPPORTED_MIME_TYPES = new HashSet<String>();
static { public static final Set<String> SUPPORTED_EXTENSIONS = new HashSet<String>();
SUPPORTED_MIME_TYPES.put("application/zip","zip"); static {
SUPPORTED_MIME_TYPES.put("application/x-zip","zip"); SUPPORTED_EXTENSIONS.add("zip");
SUPPORTED_MIME_TYPES.put("application/x-zip-compressed","zip"); SUPPORTED_MIME_TYPES.add("application/zip");
SUPPORTED_MIME_TYPES.put("application/octet-stream","zip"); SUPPORTED_MIME_TYPES.add("application/x-zip");
SUPPORTED_MIME_TYPES.put("application/x-compress","zip"); SUPPORTED_MIME_TYPES.add("application/x-zip-compressed");
SUPPORTED_MIME_TYPES.put("application/x-compressed","zip"); SUPPORTED_MIME_TYPES.add("application/x-compress");
SUPPORTED_MIME_TYPES.put("multipart/x-zip","zip"); SUPPORTED_MIME_TYPES.add("application/x-compressed");
SUPPORTED_MIME_TYPES.put("application/java-archive","jar"); SUPPORTED_MIME_TYPES.add("multipart/x-zip");
SUPPORTED_MIME_TYPES.add("application/java-archive");
} }
public zipParser() { public zipParser() {
super("Compressed Archive File Parser"); super("ZIP File Parser");
} }
public HashMap<String, String> getSupportedMimeTypes() { public Set<String> supportedMimeTypes() {
return SUPPORTED_MIME_TYPES; return SUPPORTED_MIME_TYPES;
} }
public Set<String> supportedExtensions() {
return SUPPORTED_EXTENSIONS;
}
public Document parse(final yacyURL location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException { public Document parse(final yacyURL location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
long docTextLength = 0; long docTextLength = 0;

@ -528,7 +528,7 @@ public final class httpdProxyHandler {
final String storeError = cacheEntry.shallStoreCacheForProxy(); final String storeError = cacheEntry.shallStoreCacheForProxy();
final boolean storeHTCache = cacheEntry.profile().storeHTCache(); final boolean storeHTCache = cacheEntry.profile().storeHTCache();
final boolean isSupportedContent = Parser.supportsExtension(cacheEntry.url()) && Parser.supportsMime(cacheEntry.getMimeType()); final String supportError = Parser.supports(cacheEntry.url(), cacheEntry.getMimeType());
if ( if (
/* /*
* Now we store the response into the htcache directory if * Now we store the response into the htcache directory if
@ -539,7 +539,7 @@ public final class httpdProxyHandler {
* b) the user has configured to use the htcache OR * b) the user has configured to use the htcache OR
* c) the content should be indexed * c) the content should be indexed
*/ */
((storeHTCache) || (isSupportedContent)) ((storeHTCache) || (supportError != null))
) { ) {
// we don't write actually into a file, only to RAM, and schedule writing the file. // we don't write actually into a file, only to RAM, and schedule writing the file.
int l = res.getResponseHeader().size(); int l = res.getResponseHeader().size();
@ -580,7 +580,7 @@ public final class httpdProxyHandler {
if (theLogger.isFine()) theLogger.logFine(reqID +" "+ url.toString() + " not cached." + if (theLogger.isFine()) theLogger.logFine(reqID +" "+ url.toString() + " not cached." +
" StoreError=" + ((storeError==null)?"None":storeError) + " StoreError=" + ((storeError==null)?"None":storeError) +
" StoreHTCache=" + storeHTCache + " StoreHTCache=" + storeHTCache +
" SupportetContent=" + isSupportedContent); " SupportError=" + supportError);
FileUtils.copy(res.getDataAsStream(), outStream); FileUtils.copy(res.getDataAsStream(), outStream);

@ -42,7 +42,6 @@ import java.util.HashMap;
import java.util.Map; import java.util.Map;
import de.anomic.document.Classification; import de.anomic.document.Classification;
import de.anomic.document.Parser;
import de.anomic.http.httpResponseHeader; import de.anomic.http.httpResponseHeader;
import de.anomic.http.httpDocument; import de.anomic.http.httpDocument;
import de.anomic.kelondro.blob.ArrayStack; import de.anomic.kelondro.blob.ArrayStack;
@ -181,10 +180,6 @@ public final class plasmaHTCache {
return mimeType.toUpperCase().startsWith("IMAGE"); return mimeType.toUpperCase().startsWith("IMAGE");
} }
public static boolean isText(final String mimeType) {
return Parser.supportsMime(mimeType);
}
public static boolean noIndexingURL(final yacyURL url) { public static boolean noIndexingURL(final yacyURL url) {
if (url == null) return false; if (url == null) return false;
String urlString = url.toString().toLowerCase(); String urlString = url.toString().toLowerCase();

@ -1087,8 +1087,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
* *
* Testing if the content type is supported by the available parsers * Testing if the content type is supported by the available parsers
* ========================================================================= */ * ========================================================================= */
final boolean isSupportedContent = Parser.supportsExtension(entry.url()) && Parser.supportsMime(entry.getMimeType()); final String supportError = Parser.supports(entry.url(), entry.getMimeType());
if (log.isFinest()) log.logFinest("STORE "+ entry.url() +" content of type "+ entry.getMimeType() +" is supported: "+ isSupportedContent); if (log.isFinest()) log.logFinest("STORE "+ entry.url() +" content of type "+ entry.getMimeType() + " is supported: " + supportError);
/* ========================================================================= /* =========================================================================
* INDEX CONTROL HEADER * INDEX CONTROL HEADER
@ -1121,7 +1121,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
* a) the user has configured to use the htcache or * a) the user has configured to use the htcache or
* b) the content should be indexed * b) the content should be indexed
* ========================================================================= */ * ========================================================================= */
if (((entry.profile() != null) && (entry.profile().storeHTCache())) || (doIndexing && isSupportedContent)) { if (((entry.profile() != null) && (entry.profile().storeHTCache())) || (doIndexing && supportError == null)) {
// store response header // store response header
/* /*
if (entry.writeResourceInfo()) { if (entry.writeResourceInfo()) {
@ -1146,7 +1146,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
/* ========================================================================= /* =========================================================================
* INDEXING * INDEXING
* ========================================================================= */ * ========================================================================= */
if (doIndexing && isSupportedContent) { if (doIndexing && supportError == null) {
// enqueue for further crawling // enqueue for further crawling
enQueue(this.crawler.queuePreStack.newEntry( enQueue(this.crawler.queuePreStack.newEntry(

@ -865,17 +865,15 @@ public class SnippetCache {
} }
// STEP 3: if the metadata is still null try to guess the mimeType of the resource // STEP 3: if the metadata is still null try to guess the mimeType of the resource
if (responseHeader == null) { String supportError = Parser.supports(url, responseHeader == null ? null : responseHeader.mime());
if (Parser.supportsExtension(url)) { if (supportError != null) {
String supposedMime = Parser.mimeOf(url); log.logInfo("could not generate snippet for " + url.toNormalform(true, false) + ": " + supportError);
return Parser.parseSource(url, supposedMime, null, contentLength, resourceStream);
}
return null; return null;
}
if (Parser.supportsMime(responseHeader.mime())) {
return Parser.parseSource(url, responseHeader.mime(), responseHeader.getCharacterEncoding(), contentLength, resourceStream);
} }
return null; if (responseHeader == null) {
return Parser.parseSource(url, null, null, contentLength, resourceStream);
}
return Parser.parseSource(url, responseHeader.mime(), responseHeader.getCharacterEncoding(), contentLength, resourceStream);
} catch (final InterruptedException e) { } catch (final InterruptedException e) {
// interruption of thread detected // interruption of thread detected
return null; return null;

Loading…
Cancel
Save