enhanced document type recognition

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6209 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 16 years ago
parent aa38eb5a20
commit b2263bc720

@ -250,6 +250,7 @@ minimumGlobalDelta = 500
# the following mime-types are a blacklist for indexing:
# parser.mime.deny: specifies mime-types that shall not be indexed
parser.mime.deny=
parser.extensions.deny=
# Promotion Strings
# These strings appear in the Web Mask of the YACY search client

@ -49,7 +49,7 @@ public class ConfigParser {
post.remove("parserSettings");
for (Idiom parser: Parser.idioms()) {
for (String mimeType: parser.getSupportedMimeTypes().keySet()) {
for (String mimeType: parser.supportedMimeTypes()) {
Parser.grantMime(mimeType, post.get("mimename_" + mimeType, "").equals("on"));
}
}
@ -62,9 +62,9 @@ public class ConfigParser {
prop.put("parser_" + i + "_name", parser.getName());
int mimeIdx = 0;
for (String mimeType: parser.getSupportedMimeTypes().keySet()) {
for (String mimeType: parser.supportedMimeTypes()) {
prop.put("parser_" + i + "_mime_" + mimeIdx + "_mimetype", mimeType);
prop.put("parser_" + i + "_mime_" + mimeIdx + "_status", (Parser.supportsMime(mimeType)) ? 1 : 0);
prop.put("parser_" + i + "_mime_" + mimeIdx + "_status", (Parser.supportsMime(mimeType) == null) ? 1 : 0);
mimeIdx++;
}
prop.put("parser_" + i + "_mime", mimeIdx);

@ -224,16 +224,12 @@ public class FTPLoader {
// if the mimetype and file extension is supported we start to download
// the file
httpDocument htCache = null;
if (!Parser.supportsExtension(entryUrl)) {
// if the response has not the right file type then reject file
log.logInfo("REJECTED WRONG EXTENSION TYPE " + mimeType + " for URL " + entry.url().toString());
sb.crawlQueues.errorURL.newEntry(entry, this.sb.peers.mySeed().hash, new Date(), 1, "wrong extension");
throw new Exception("response has not the right extension type -> rejected");
} else if (!Parser.supportsMime(mimeType)) {
// if the response has not the right file type then reject file
log.logInfo("REJECTED WRONG MIME TYPE " + mimeType + " for URL " + entry.url().toString());
sb.crawlQueues.errorURL.newEntry(entry, this.sb.peers.mySeed().hash, new Date(), 1, "wrong mime type");
throw new Exception("response has not the right mime type -> rejected");
String supportError = Parser.supports(entryUrl, mimeType);
if (supportError != null) {
// reject file
log.logInfo("PARSER REJECTED URL " + entry.url().toString() + ": " + supportError);
sb.crawlQueues.errorURL.newEntry(entry, this.sb.peers.mySeed().hash, new Date(), 1, supportError);
throw new Exception(supportError);
} else {
// abort the download if content is too long
final int size = ftpClient.fileSize(path);

@ -120,8 +120,9 @@ public final class HTTPLoader {
if (port < 0) port = (ssl) ? 443 : 80;
// if not the right file type then reject file
if (!Parser.supportsExtension(entry.url())) {
sb.crawlQueues.errorURL.newEntry(entry, sb.peers.mySeed().hash, new Date(), 1, "wrong extension");
String supportError = Parser.supportsExtension(entry.url());
if (supportError != null) {
sb.crawlQueues.errorURL.newEntry(entry, sb.peers.mySeed().hash, new Date(), 1, supportError);
throw new IOException("REJECTED WRONG EXTENSION TYPE " + entry.url().getFileExtension()+ " for URL " + entry.url().toString());
}
@ -166,8 +167,9 @@ public final class HTTPLoader {
//try {
// if the response has not the right file type then reject file
if (!Parser.supportsMime(res.getResponseHeader().mime())) {
sb.crawlQueues.errorURL.newEntry(entry, sb.peers.mySeed().hash, new Date(), 1, "wrong mime type");
supportError = Parser.supports(entry.url(), res.getResponseHeader().mime());
if (supportError != null) {
sb.crawlQueues.errorURL.newEntry(entry, sb.peers.mySeed().hash, new Date(), 1, supportError);
throw new IOException("REJECTED WRONG MIME TYPE " + res.getResponseHeader().mime() + " for URL " + entry.url().toString());
}

@ -34,6 +34,7 @@ import java.util.Date;
import java.util.Iterator;
import java.util.concurrent.ConcurrentHashMap;
import de.anomic.document.Parser;
import de.anomic.http.httpHeader;
import de.anomic.http.httpResponseHeader;
import de.anomic.kelondro.index.Row;
@ -480,8 +481,9 @@ public class IndexingStack {
if (plasmaHTCache.isPicture(mimeType)) {
return "Media_Content_(Picture)";
}
if (!plasmaHTCache.isText(mimeType)) {
return "Media_Content_(not_text)";
String parserError = Parser.supportsMime(mimeType);
if (parserError != null) {
return "Media_Content, no parser: " + parserError;
}
// -if-modified-since in request
@ -598,7 +600,8 @@ public class IndexingStack {
if (responseHeader != null) {
final String mimeType = responseHeader.mime();
if (plasmaHTCache.isPicture(mimeType)) { return "Media_Content_(Picture)"; }
if (!plasmaHTCache.isText(mimeType)) { return "Media_Content_(not_text)"; }
String parserError = Parser.supportsMime(mimeType);
if (parserError != null) { return "Media_Content, parser error: " + parserError; }
}
if (plasmaHTCache.noIndexingURL(url())) { return "Media_Content_(forbidden)"; }

@ -27,8 +27,7 @@ package de.anomic.document;
import java.io.File;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Hashtable;
import java.util.Set;
import de.anomic.yacy.yacyURL;
@ -87,11 +86,15 @@ public interface Idiom {
/**
* Get the MimeType(s) that are supported by the parser
* @return a {@link Hashtable} containing a mapping from a mime type string
* to a comma-separated String of file extensions
* that are supported by the idiom parser
* @return a set of strings denoting the supported mime types
*/
public HashMap<String, String> getSupportedMimeTypes();
public Set<String> supportedMimeTypes();
/**
* Get the File extension(s) that are supported by the parser
* @return a set of strings denoting the supported file extensions
*/
public Set<String> supportedExtensions();
/**
* This function should be called before reusing the parser object.

@ -72,8 +72,9 @@ public final class Parser {
}
private static final Map<String, Idiom> mime2parser = new TreeMap<String, Idiom>(insensitiveCollator);
private static final Map<String, Set<String>> ext2mime = new TreeMap<String, Set<String>>(insensitiveCollator);
private static final Map<String, String> ext2mime = new TreeMap<String, String>(insensitiveCollator);
private static final Set<String> denyMime = new TreeSet<String>(insensitiveCollator);
private static final Set<String> denyExtension = new TreeSet<String>(insensitiveCollator);
static {
initParser(new bzipParser());
@ -103,21 +104,20 @@ public final class Parser {
}
private static void initParser(Idiom parser) {
for (Map.Entry<String, String> e: parser.getSupportedMimeTypes().entrySet()) {
String prototypeMime = null;
for (String mime: parser.supportedMimeTypes()) {
// process the mime types
final String mimeType = normalizeMimeType(e.getKey());
final String mimeType = normalizeMimeType(mime);
if (prototypeMime == null) prototypeMime = mimeType;
Idiom p0 = mime2parser.get(mimeType);
if (p0 != null) log.logSevere("parser for mime '" + mimeType + "' was set to '" + p0.getName() + "', overwriting with new parser '" + parser.getName() + "'.");
mime2parser.put(mimeType, parser);
Log.logInfo("PARSER", "Parser for mime type '" + mimeType + "': " + parser.getName());
// process the extensions
String[] exts = e.getValue().split(",");
for (String ext: exts) {
Set<String> s = ext2mime.get(ext);
if (s == null) s = new HashSet<String>();
s.add(mimeType);
ext2mime.put(ext, s);
if (prototypeMime != null) for (String ext: parser.supportedExtensions()) {
String s = ext2mime.get(ext);
if (s != null) log.logSevere("parser for extension '" + ext + "' was set to mime '" + s + "', overwriting with new mime '" + prototypeMime + "'.");
ext2mime.put(ext, prototypeMime);
}
}
}
@ -148,9 +148,12 @@ public final class Parser {
}
}
public static Document parseSource(final yacyURL location,
final String mimeType, final String charset,
final File sourceFile) throws InterruptedException, ParserException {
public static Document parseSource(
final yacyURL location,
final String mimeType,
final String charset,
final File sourceFile
) throws InterruptedException, ParserException {
BufferedInputStream sourceStream = null;
try {
@ -174,39 +177,33 @@ public final class Parser {
}
}
public static Document parseSource(final yacyURL location,
String mimeType, final String charset,
final long contentLength, final InputStream sourceStream)
throws InterruptedException, ParserException {
public static Document parseSource(
final yacyURL location,
String mimeType,
final String charset,
final long contentLength,
final InputStream sourceStream
) throws InterruptedException, ParserException {
try {
if (log.isFine()) log.logFine("Parsing '" + location + "' from stream");
mimeType = normalizeMimeType(mimeType);
final String fileExt = location.getFileExtension();
final String documentCharset = htmlParser.patchCharsetEncoding(charset);
if (!supportsMime(mimeType)) {
final String errorMsg = "No parser available to parse mimetype '" + mimeType + "'";
log.logInfo("Unable to parse '" + location + "'. " + errorMsg);
throw new ParserException(errorMsg, location);
}
if (!supportsExtension(location)) {
final String errorMsg = "No parser available to parse extension of url path";
Idiom parser = idiomParser(location, mimeType);
if (parser == null) {
final String errorMsg = "No parser available to parse extension '" + location.getFileExtension() + "' or mimetype '" + mimeType + "'";
log.logInfo("Unable to parse '" + location + "'. " + errorMsg);
throw new ParserException(errorMsg, location);
}
if (log.isFine()) log.logInfo("Parsing " + location + " with mimeType '" + mimeType + "' and file extension '" + fileExt + "'.");
Idiom parser = mime2parser.get(normalizeMimeType(mimeType));
Document doc = null;
if (parser != null) {
parser.setContentLength(contentLength);
doc = parser.parse(location, mimeType, documentCharset, sourceStream);
} else {
final String errorMsg = "No parser available to parse mimetype '" + mimeType + "' (2)";
log.logInfo("Unable to parse '" + location + "'. " + errorMsg);
throw new ParserException(errorMsg, location);
}
parser.setContentLength(contentLength);
Document doc = parser.parse(location, mimeType, documentCharset, sourceStream);
if (doc == null) {
final String errorMsg = "Unexpected error. Parser returned null.";
log.logInfo("Unable to parse '" + location + "'. " + errorMsg);
final String errorMsg = "Parsing content with file extension '" + location.getFileExtension() + "' and mimetype '" + mimeType + "' failed: document == null";
log.logWarning("Unable to parse '" + location + "'. " + errorMsg);
throw new ParserException(errorMsg, location);
}
return doc;
@ -219,15 +216,65 @@ public final class Parser {
}
}
public static boolean supportsMime(String mimeType) {
/**
 * Checks whether a parser can be selected for the given content.
 * @param url the location of the content
 * @param mimeType the mime type of the content
 * @return null if the content is supported; otherwise an error string describing why no parser is available
 */
public static String supports(final yacyURL url, String mimeType) {
    String problem = null;
    try {
        // only the outcome of the lookup matters here; the parser instance itself is discarded
        idiomParser(url, mimeType);
    } catch (ParserException e) {
        // no parser could be selected: hand the reason back as text instead of throwing
        problem = e.getMessage();
    }
    return problem;
}
/**
 * Selects the parser for a document: first by its mime type and, if no parser is
 * registered for that mime type, by the file extension of the url.
 * @param url the document location; supplies the file extension and is attached to thrown exceptions
 * @param mimeType the mime type of the document; the mime deny list is only consulted when this is not null
 * @return the selected parser, never null
 * @throws ParserException if the mime type is denied, or if the mime type selects no parser and
 *         the url has no file extension, the extension is denied, or no parser is registered for it
 */
private static Idiom idiomParser(final yacyURL url, String mimeType) throws ParserException {
    // mime type lookup; the deny list is only checked for a mime type that was actually given
    final boolean mimeGiven = mimeType != null;
    mimeType = normalizeMimeType(mimeType);
    if (mimeGiven && denyMime.contains(mimeType)) {
        throw new ParserException("mime type '" + mimeType + "' is denied", url);
    }
    final Idiom byMime = mime2parser.get(mimeType);
    if (byMime != null) {
        return byMime;
    }

    // fall back to the file extension of the url
    final String extension = url.getFileExtension();
    if (extension == null || extension.length() == 0) {
        throw new ParserException("no file extension", url);
    }
    if (denyExtension.contains(extension)) {
        throw new ParserException("file extension '" + extension + "' is denied", url);
    }
    final String extensionMime = ext2mime.get(extension);
    if (extensionMime == null) {
        throw new ParserException("no parser available", url);
    }

    // every mime type stored in ext2mime is expected to have a registered parser
    final Idiom byExtension = mime2parser.get(extensionMime);
    assert byExtension != null;
    if (byExtension == null) {
        throw new ParserException("no parser available (internal error!)", url);
    }
    return byExtension;
}
public static String supportsMime(String mimeType) {
if (mimeType == null) return null;
mimeType = normalizeMimeType(mimeType);
return !denyMime.contains(mimeType) && mime2parser.containsKey(normalizeMimeType(mimeType));
if (denyMime.contains(mimeType)) return "mime type '" + mimeType + "' is denied";
if (mime2parser.get(mimeType) == null) return "no parser for mime '" + mimeType + "' available";
return null;
}
public static boolean supportsExtension(final yacyURL url) {
public static String supportsExtension(final yacyURL url) {
String ext = url.getFileExtension();
if (ext.length() == 0) return true; // may be anything; thats ok if the mime type is ok
return ext2mime.containsKey(ext);
if (ext == null || ext.length() == 0) return null;
if (denyExtension.contains(ext)) return "file extension '" + ext + "' is denied";
String mimeType = ext2mime.get(ext);
if (mimeType == null) return "no parser available";
Idiom idiom = mime2parser.get(mimeType);
assert idiom != null;
if (idiom == null) return "no parser available (internal error!)";
return null;
}
public static String mimeOf(yacyURL url) {
@ -235,9 +282,7 @@ public final class Parser {
}
public static String mimeOf(String ext) {
Set<String> mimes = ext2mime.get(ext);
if (mimes == null) return null;
return mimes.iterator().next();
return ext2mime.get(ext);
}
private static String normalizeMimeType(String mimeType) {
@ -261,4 +306,20 @@ public final class Parser {
/**
 * Grants or denies parsing of a mime type by updating the mime deny list.
 * @param mime the mime type; it is normalized before the deny list is touched
 * @param grant true removes the mime type from the deny list, false adds it
 */
public static void grantMime(String mime, boolean grant) {
    final String normalized = normalizeMimeType(mime);
    if (grant) {
        denyMime.remove(normalized);
    } else {
        denyMime.add(normalized);
    }
}
/**
 * Replaces the extension deny list with the entries of a comma-separated list.
 * Entries are trimmed and empty entries are skipped, so an empty configuration
 * value results in an empty deny list instead of a list containing "".
 * @param denyList comma-separated file extensions; null is treated like an empty list
 */
public static void setDenyExtension(String denyList) {
    denyExtension.clear();
    if (denyList == null) return;
    for (String entry: denyList.split(",")) {
        // "".split(",") yields one empty token, and "a, b" yields " b": normalize both cases
        final String ext = entry.trim();
        if (ext.length() > 0) denyExtension.add(ext);
    }
}
/**
 * Renders the extension deny list as a comma-separated string.
 * @return the denied file extensions joined by ',' without a trailing comma;
 *         the empty string if no extension is denied
 */
public static String getDenyExtension() {
    // join with a leading separator on all but the first entry; unlike cutting off a
    // trailing comma afterwards, this cannot fail when the deny list is empty
    final StringBuilder list = new StringBuilder();
    boolean first = true;
    for (String ext: denyExtension) {
        if (!first) list.append(',');
        list.append(ext);
        first = false;
    }
    return list.toString();
}
/**
 * Grants or denies parsing of a file extension by updating the extension deny list.
 * @param ext the file extension
 * @param grant true removes the extension from the deny list, false adds it
 */
public static void grantExtension(String ext, boolean grant) {
    if (!grant) {
        denyExtension.add(ext);
        return;
    }
    denyExtension.remove(ext);
}
}

@ -30,7 +30,9 @@ package de.anomic.document.parser;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Set;
import org.apache.tools.bzip2.CBZip2InputStream;
import de.anomic.document.AbstractParser;
@ -47,24 +49,31 @@ public class bzipParser extends AbstractParser implements Idiom {
* a list of mime types that are supported by this parser class
* @see #supportedMimeTypes()
*/
public static final HashMap<String, String> SUPPORTED_MIME_TYPES = new HashMap<String, String>();
static final String fileExtensions = "bz2,tbz,tbz2";
public static final Set<String> SUPPORTED_MIME_TYPES = new HashSet<String>();
public static final Set<String> SUPPORTED_EXTENSIONS = new HashSet<String>();
static {
SUPPORTED_MIME_TYPES.put("application/x-bzip2",fileExtensions);
SUPPORTED_MIME_TYPES.put("application/bzip2", fileExtensions);
SUPPORTED_MIME_TYPES.put("application/x-bz2", fileExtensions);
SUPPORTED_MIME_TYPES.put("application/x-bzip",fileExtensions);
SUPPORTED_MIME_TYPES.put("application/x-stuffit",fileExtensions);
SUPPORTED_EXTENSIONS.add("bz2");
SUPPORTED_EXTENSIONS.add("tbz");
SUPPORTED_EXTENSIONS.add("tbz2");
SUPPORTED_MIME_TYPES.add("application/x-bzip2");
SUPPORTED_MIME_TYPES.add("application/bzip2");
SUPPORTED_MIME_TYPES.add("application/x-bz2");
SUPPORTED_MIME_TYPES.add("application/x-bzip");
SUPPORTED_MIME_TYPES.add("application/x-stuffit");
}
public bzipParser() {
super("Bzip 2 UNIX Compressed File Parser");
}
public HashMap<String, String> getSupportedMimeTypes() {
public Set<String> supportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}
public Set<String> supportedExtensions() {
return SUPPORTED_EXTENSIONS;
}
public Document parse(final yacyURL location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
File tempFile = null;

@ -30,7 +30,8 @@ package de.anomic.document.parser;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Set;
import de.anomic.document.AbstractParser;
import de.anomic.document.Idiom;
@ -45,18 +46,20 @@ public class docParser extends AbstractParser implements Idiom {
* a list of mime types that are supported by this parser class
* @see #supportedMimeTypes()
*/
public static final HashMap<String, String> SUPPORTED_MIME_TYPES = new HashMap<String, String>();
public static final Set<String> SUPPORTED_MIME_TYPES = new HashSet<String>();
public static final Set<String> SUPPORTED_EXTENSIONS = new HashSet<String>();
static {
String ext = "doc,docx";
SUPPORTED_MIME_TYPES.put("application/msword",ext);
SUPPORTED_MIME_TYPES.put("application/doc",ext);
SUPPORTED_MIME_TYPES.put("appl/text",ext);
SUPPORTED_MIME_TYPES.put("application/vnd.msword",ext);
SUPPORTED_MIME_TYPES.put("application/vnd.ms-word",ext);
SUPPORTED_MIME_TYPES.put("application/winword",ext);
SUPPORTED_MIME_TYPES.put("application/word",ext);
SUPPORTED_MIME_TYPES.put("application/x-msw6",ext);
SUPPORTED_MIME_TYPES.put("application/x-msword",ext);
SUPPORTED_EXTENSIONS.add("doc");
SUPPORTED_EXTENSIONS.add("docx");
SUPPORTED_MIME_TYPES.add("application/msword");
SUPPORTED_MIME_TYPES.add("application/doc");
SUPPORTED_MIME_TYPES.add("appl/text");
SUPPORTED_MIME_TYPES.add("application/vnd.msword");
SUPPORTED_MIME_TYPES.add("application/vnd.ms-word");
SUPPORTED_MIME_TYPES.add("application/winword");
SUPPORTED_MIME_TYPES.add("application/word");
SUPPORTED_MIME_TYPES.add("application/x-msw6");
SUPPORTED_MIME_TYPES.add("application/x-msword");
}
public docParser() {
@ -115,9 +118,13 @@ public class docParser extends AbstractParser implements Idiom {
return theDoc;
}
public HashMap<String, String> getSupportedMimeTypes() {
return docParser.SUPPORTED_MIME_TYPES;
}
public Set<String> supportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}
public Set<String> supportedExtensions() {
return SUPPORTED_EXTENSIONS;
}
@Override
public void reset() {

@ -30,7 +30,8 @@ package de.anomic.document.parser;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Set;
import java.util.zip.GZIPInputStream;
import de.anomic.document.AbstractParser;
@ -47,28 +48,31 @@ public class gzipParser extends AbstractParser implements Idiom {
* a list of mime types that are supported by this parser class
* @see #supportedMimeTypes()
*/
public static final HashMap<String, String> SUPPORTED_MIME_TYPES = new HashMap<String, String>();
static final String ext = "gz,tgz";
public static final Set<String> SUPPORTED_MIME_TYPES = new HashSet<String>();
public static final Set<String> SUPPORTED_EXTENSIONS = new HashSet<String>();
static {
SUPPORTED_MIME_TYPES.put("application/x-gzip",ext);
SUPPORTED_MIME_TYPES.put("application/gzip",ext);
SUPPORTED_MIME_TYPES.put("application/x-gunzip",ext);
SUPPORTED_MIME_TYPES.put("application/gzipped",ext);
SUPPORTED_MIME_TYPES.put("application/gzip-compressed",ext);
SUPPORTED_MIME_TYPES.put("application/x-compressed",ext);
SUPPORTED_MIME_TYPES.put("application/x-compress",ext);
SUPPORTED_MIME_TYPES.put("gzip/document",ext);
SUPPORTED_MIME_TYPES.put("application/octet-stream",ext);
SUPPORTED_EXTENSIONS.add("gz");
SUPPORTED_EXTENSIONS.add("tgz");
SUPPORTED_MIME_TYPES.add("application/x-gzip");
SUPPORTED_MIME_TYPES.add("application/gzip");
SUPPORTED_MIME_TYPES.add("application/x-gunzip");
SUPPORTED_MIME_TYPES.add("application/gzipped");
SUPPORTED_MIME_TYPES.add("application/gzip-compressed");
SUPPORTED_MIME_TYPES.add("gzip/document");
}
public gzipParser() {
super("GNU Zip Compressed Archive Parser");
}
public HashMap<String, String> getSupportedMimeTypes() {
public Set<String> supportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}
public Set<String> supportedExtensions() {
return SUPPORTED_EXTENSIONS;
}
public Document parse(final yacyURL location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
File tempFile = null;

@ -31,7 +31,9 @@ import java.io.InputStream;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Set;
import de.anomic.document.AbstractParser;
import de.anomic.document.Document;
import de.anomic.document.Idiom;
@ -48,17 +50,29 @@ public class htmlParser extends AbstractParser implements Idiom {
* a list of mime types that are supported by this parser class
* @see #supportedMimeTypes()
*/
public static final HashMap<String, String> SUPPORTED_MIME_TYPES = new HashMap<String, String>();
public static final Set<String> SUPPORTED_MIME_TYPES = new HashSet<String>();
public static final Set<String> SUPPORTED_EXTENSIONS = new HashSet<String>();
static {
String ext = "htm,html,shtml,xhtml,php,asp,aspx,txt,jsp,csv,pl,py";
SUPPORTED_MIME_TYPES.put("application/xhtml+xml", ext);
SUPPORTED_MIME_TYPES.put("text/html", ext);
SUPPORTED_MIME_TYPES.put("text/plain", ext);
SUPPORTED_MIME_TYPES.put("text/sgml",ext);
SUPPORTED_EXTENSIONS.add("htm");
SUPPORTED_EXTENSIONS.add("html");
SUPPORTED_EXTENSIONS.add("shtml");
SUPPORTED_EXTENSIONS.add("xhtml");
SUPPORTED_EXTENSIONS.add("php");
SUPPORTED_EXTENSIONS.add("asp");
SUPPORTED_EXTENSIONS.add("aspx");
SUPPORTED_EXTENSIONS.add("txt");
SUPPORTED_EXTENSIONS.add("jsp");
SUPPORTED_EXTENSIONS.add("csv");
SUPPORTED_EXTENSIONS.add("pl");
SUPPORTED_EXTENSIONS.add("py");
SUPPORTED_MIME_TYPES.add("application/xhtml+xml");
SUPPORTED_MIME_TYPES.add("text/html");
SUPPORTED_MIME_TYPES.add("text/plain");
SUPPORTED_MIME_TYPES.add("text/sgml");
}
public htmlParser() {
super("streaming html parser");
super("HTML Parser");
}
@Override
@ -213,9 +227,12 @@ public class htmlParser extends AbstractParser implements Idiom {
return encoding;
}
public HashMap<String, String> getSupportedMimeTypes() {
public Set<String> supportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}
public Set<String> supportedExtensions() {
return SUPPORTED_EXTENSIONS;
}
}

@ -35,7 +35,6 @@ import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.Charset;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Set;
import java.util.zip.ZipEntry;
@ -63,20 +62,30 @@ public class odtParser extends AbstractParser implements Idiom {
* a list of mime types that are supported by this parser class
* @see #supportedMimeTypes()
*/
public static final HashMap<String, String> SUPPORTED_MIME_TYPES = new HashMap<String, String>();
public static final Set<String> SUPPORTED_MIME_TYPES = new HashSet<String>();
public static final Set<String> SUPPORTED_EXTENSIONS = new HashSet<String>();
static {
SUPPORTED_MIME_TYPES.put("application/vnd.oasis.opendocument.text","odt,ods,odp");
SUPPORTED_MIME_TYPES.put("application/x-vnd.oasis.opendocument.text","odt,ods,odp");
SUPPORTED_EXTENSIONS.add("odt");
SUPPORTED_EXTENSIONS.add("ods");
SUPPORTED_EXTENSIONS.add("odp");
SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.text");
SUPPORTED_MIME_TYPES.add("application/x-vnd.oasis.opendocument.text");
SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.presentation");
SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.spreadsheet");
}
public odtParser() {
super("OASIS OpenDocument V2 Text Document Parser");
}
public HashMap<String, String> getSupportedMimeTypes() {
public Set<String> supportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}
public Set<String> supportedExtensions() {
return SUPPORTED_EXTENSIONS;
}
@Override
public Document parse(final yacyURL location, final String mimeType, final String charset, final File dest) throws ParserException, InterruptedException {

@ -33,7 +33,9 @@ import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Set;
import org.pdfbox.pdfparser.PDFParser;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.pdmodel.PDDocumentInformation;
@ -55,24 +57,30 @@ public class pdfParser extends AbstractParser implements Idiom {
* a list of mime types that are supported by this parser class
* @see #supportedMimeTypes()
*/
public static final HashMap<String, String> SUPPORTED_MIME_TYPES = new HashMap<String, String>();
public static final Set<String> SUPPORTED_MIME_TYPES = new HashSet<String>();
public static final Set<String> SUPPORTED_EXTENSIONS = new HashSet<String>();
static {
SUPPORTED_MIME_TYPES.put("application/pdf","pdf");
SUPPORTED_MIME_TYPES.put("application/x-pdf","pdf");
SUPPORTED_MIME_TYPES.put("application/acrobat","pdf");
SUPPORTED_MIME_TYPES.put("applications/vnd.pdf","pdf");
SUPPORTED_MIME_TYPES.put("text/pdf","pdf");
SUPPORTED_MIME_TYPES.put("text/x-pdf","pdf");
SUPPORTED_EXTENSIONS.add("pdf");
SUPPORTED_MIME_TYPES.add("application/pdf");
SUPPORTED_MIME_TYPES.add("application/x-pdf");
SUPPORTED_MIME_TYPES.add("application/acrobat");
SUPPORTED_MIME_TYPES.add("applications/vnd.pdf");
SUPPORTED_MIME_TYPES.add("text/pdf");
SUPPORTED_MIME_TYPES.add("text/x-pdf");
}
public pdfParser() {
super("Acrobat Portable Document Parser");
}
public HashMap<String, String> getSupportedMimeTypes() {
public Set<String> supportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}
public Set<String> supportedExtensions() {
return SUPPORTED_EXTENSIONS;
}
public Document parse(final yacyURL location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
PDDocument theDocument = null;

@ -29,7 +29,9 @@ package de.anomic.document.parser;
import java.io.BufferedInputStream;
import java.io.InputStream;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Set;
import org.apache.poi.hslf.extractor.PowerPointExtractor;
import de.anomic.document.AbstractParser;
@ -44,17 +46,20 @@ public class pptParser extends AbstractParser implements Idiom {
* a list of mime types that are supported by this parser class
* @see #supportedMimeTypes()
*/
public static final HashMap<String, String> SUPPORTED_MIME_TYPES = new HashMap<String, String>();
static final String ext = "ppt,pptx,pps";
public static final Set<String> SUPPORTED_MIME_TYPES = new HashSet<String>();
public static final Set<String> SUPPORTED_EXTENSIONS = new HashSet<String>();
static {
SUPPORTED_MIME_TYPES.put("application/mspowerpoint",ext);
SUPPORTED_MIME_TYPES.put("application/powerpoint",ext);
SUPPORTED_MIME_TYPES.put("application/vnd.ms-powerpoint",ext);
SUPPORTED_MIME_TYPES.put("application/ms-powerpoint",ext);
SUPPORTED_MIME_TYPES.put("application/mspowerpnt",ext);
SUPPORTED_MIME_TYPES.put("application/vnd-mspowerpoint",ext);
SUPPORTED_MIME_TYPES.put("application/x-powerpoint",ext);
SUPPORTED_MIME_TYPES.put("application/x-m",ext);
SUPPORTED_EXTENSIONS.add("ppt");
SUPPORTED_EXTENSIONS.add("pptx");
SUPPORTED_EXTENSIONS.add("pps");
SUPPORTED_MIME_TYPES.add("application/mspowerpoint");
SUPPORTED_MIME_TYPES.add("application/powerpoint");
SUPPORTED_MIME_TYPES.add("application/vnd.ms-powerpoint");
SUPPORTED_MIME_TYPES.add("application/ms-powerpoint");
SUPPORTED_MIME_TYPES.add("application/mspowerpnt");
SUPPORTED_MIME_TYPES.add("application/vnd-mspowerpoint");
SUPPORTED_MIME_TYPES.add("application/x-powerpoint");
SUPPORTED_MIME_TYPES.add("application/x-m");
}
public pptParser(){
@ -114,10 +119,14 @@ public class pptParser extends AbstractParser implements Idiom {
}
}
public HashMap<String, String> getSupportedMimeTypes() {
public Set<String> supportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}
public Set<String> supportedExtensions() {
return SUPPORTED_EXTENSIONS;
}
@Override
public void reset(){
//nothing to do

@ -34,7 +34,9 @@ import java.io.FileReader;
import java.io.FileWriter;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Set;
import de.anomic.document.AbstractParser;
import de.anomic.document.Idiom;
import de.anomic.document.ParserException;
@ -48,12 +50,14 @@ public class psParser extends AbstractParser implements Idiom {
* a list of mime types that are supported by this parser class
* @see #supportedMimeTypes()
*/
public static final HashMap<String, String> SUPPORTED_MIME_TYPES = new HashMap<String, String>();
public static final Set<String> SUPPORTED_MIME_TYPES = new HashSet<String>();
public static final Set<String> SUPPORTED_EXTENSIONS = new HashSet<String>();
static {
SUPPORTED_MIME_TYPES.put("application/ps","ps");
SUPPORTED_MIME_TYPES.put("application/x-postscript","ps");
SUPPORTED_MIME_TYPES.put("application/x-ps","ps");
SUPPORTED_MIME_TYPES.put("application/x-postscript-not-eps","ps");
SUPPORTED_EXTENSIONS.add("ps");
SUPPORTED_MIME_TYPES.add("application/ps");
SUPPORTED_MIME_TYPES.add("application/x-postscript");
SUPPORTED_MIME_TYPES.add("application/x-ps");
SUPPORTED_MIME_TYPES.add("application/x-postscript-not-eps");
}
private final static Object modeScan = new Object();
@ -69,10 +73,14 @@ public class psParser extends AbstractParser implements Idiom {
}
}
public HashMap<String, String> getSupportedMimeTypes() {
public Set<String> supportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}
public Set<String> supportedExtensions() {
return SUPPORTED_EXTENSIONS;
}
public boolean testForPs2Ascii() {
try {
String procOutputLine = null;

@ -31,6 +31,9 @@ import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.InputStream;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Set;
import com.jguild.jrpm.io.RPMFile;
import com.jguild.jrpm.io.datatype.DataTypeIf;
@ -55,21 +58,27 @@ public class rpmParser extends AbstractParser implements Idiom {
* a list of mime types that are supported by this parser class
* @see #supportedMimeTypes()
*/
public static final HashMap<String, String> SUPPORTED_MIME_TYPES = new HashMap<String, String>();
public static final Set<String> SUPPORTED_MIME_TYPES = new HashSet<String>();
public static final Set<String> SUPPORTED_EXTENSIONS = new HashSet<String>();
static {
SUPPORTED_MIME_TYPES.put("application/x-rpm","rpm");
SUPPORTED_MIME_TYPES.put("application/x-redhat packet manager","rpm");
SUPPORTED_MIME_TYPES.put("application/x-redhat-package-manager","rpm");
SUPPORTED_EXTENSIONS.add("rpm");
SUPPORTED_MIME_TYPES.add("application/x-rpm");
SUPPORTED_MIME_TYPES.add("application/x-redhat packet manager");
SUPPORTED_MIME_TYPES.add("application/x-redhat-package-manager");
}
public rpmParser() {
super("rpm Parser");
}
public HashMap<String, String> getSupportedMimeTypes() {
public Set<String> supportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}
public Set<String> supportedExtensions() {
return SUPPORTED_EXTENSIONS;
}
public Document parse(final yacyURL location, final String mimeType, final String charset,
final InputStream source) throws ParserException {
File dstFile = null;

@ -33,8 +33,10 @@ import java.io.InputStream;
import java.io.Writer;
import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.Map;
import java.util.Set;
import de.anomic.content.RSSMessage;
import de.anomic.document.AbstractParser;
@ -58,13 +60,16 @@ public class rssParser extends AbstractParser implements Idiom {
* a list of mime types that are supported by this parser class
* @see #supportedMimeTypes()
*/
public static final HashMap<String, String> SUPPORTED_MIME_TYPES = new HashMap<String, String>();
static final String fileExtensions = "xml,rss,rdf";
public static final Set<String> SUPPORTED_MIME_TYPES = new HashSet<String>();
public static final Set<String> SUPPORTED_EXTENSIONS = new HashSet<String>();
static {
SUPPORTED_MIME_TYPES.put("text/rss",fileExtensions);
SUPPORTED_MIME_TYPES.put("application/rdf+xml",fileExtensions);
SUPPORTED_MIME_TYPES.put("application/rss+xml",fileExtensions);
SUPPORTED_MIME_TYPES.put("application/atom+xml",fileExtensions);
SUPPORTED_EXTENSIONS.add("xml");
SUPPORTED_EXTENSIONS.add("rss");
SUPPORTED_EXTENSIONS.add("rdf");
SUPPORTED_MIME_TYPES.add("text/rss");
SUPPORTED_MIME_TYPES.add("application/rdf+xml");
SUPPORTED_MIME_TYPES.add("application/rss+xml");
SUPPORTED_MIME_TYPES.add("application/atom+xml");
}
public rssParser() {
@ -174,9 +179,13 @@ public class rssParser extends AbstractParser implements Idiom {
}
}
public HashMap<String, String> getSupportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}
/**
 * Returns the mime types this parser registers as supported.
 * The static SUPPORTED_MIME_TYPES set is returned directly, not as a copy.
 *
 * @return the shared set of supported mime types
 */
public Set<String> supportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}
/**
 * Returns the file extensions this parser registers as supported.
 * The static SUPPORTED_EXTENSIONS set is returned directly, not as a copy.
 *
 * @return the shared set of supported file extensions
 */
public Set<String> supportedExtensions() {
return SUPPORTED_EXTENSIONS;
}
@Override
public void reset() {

@ -28,7 +28,9 @@
package de.anomic.document.parser;
import java.io.InputStream;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Set;
import javax.swing.text.DefaultStyledDocument;
import javax.swing.text.rtf.RTFEditorKit;
@ -44,13 +46,15 @@ public class rtfParser extends AbstractParser implements Idiom {
* a list of mime types that are supported by this parser class
* @see #supportedMimeTypes()
*/
public static final HashMap<String, String> SUPPORTED_MIME_TYPES = new HashMap<String, String>();
public static final Set<String> SUPPORTED_MIME_TYPES = new HashSet<String>();
public static final Set<String> SUPPORTED_EXTENSIONS = new HashSet<String>();
static {
SUPPORTED_MIME_TYPES.put("application/rtf","rtf");
SUPPORTED_MIME_TYPES.put("text/rtf","rtf");
SUPPORTED_MIME_TYPES.put("application/x-rtf","rtf");
SUPPORTED_MIME_TYPES.put("text/richtext","rtf");
SUPPORTED_MIME_TYPES.put("application/x-soffice","rtf");
SUPPORTED_EXTENSIONS.add("rtf");
SUPPORTED_MIME_TYPES.add("application/rtf");
SUPPORTED_MIME_TYPES.add("text/rtf");
SUPPORTED_MIME_TYPES.add("application/x-rtf");
SUPPORTED_MIME_TYPES.add("text/richtext");
SUPPORTED_MIME_TYPES.add("application/x-soffice");
}
public rtfParser() {
@ -96,9 +100,13 @@ public class rtfParser extends AbstractParser implements Idiom {
}
}
public HashMap<String, String> getSupportedMimeTypes() {
return rtfParser.SUPPORTED_MIME_TYPES;
}
/**
 * Returns the mime types this parser registers as supported.
 * The static SUPPORTED_MIME_TYPES set is returned directly, not as a copy.
 *
 * @return the shared set of supported mime types
 */
public Set<String> supportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}
/**
 * Returns the file extensions this parser registers as supported.
 * The static SUPPORTED_EXTENSIONS set is returned directly, not as a copy.
 *
 * @return the shared set of supported file extensions
 */
public Set<String> supportedExtensions() {
return SUPPORTED_EXTENSIONS;
}
public void reset() {
// Nothing todo here at the moment

@ -32,7 +32,9 @@ import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Set;
import SevenZip.ArchiveExtractCallback;
import SevenZip.IInStream;
import SevenZip.MyRandomAccessFile;
@ -55,9 +57,11 @@ public class sevenzipParser extends AbstractParser implements Idiom {
* a list of mime types that are supported by this parser class
* @see #supportedMimeTypes()
*/
public static final HashMap<String, String> SUPPORTED_MIME_TYPES = new HashMap<String, String>();
public static final Set<String> SUPPORTED_MIME_TYPES = new HashSet<String>();
public static final Set<String> SUPPORTED_EXTENSIONS = new HashSet<String>();
static {
SUPPORTED_MIME_TYPES.put("application/x-7z-compressed", "7z");
SUPPORTED_EXTENSIONS.add("7z");
SUPPORTED_MIME_TYPES.add("application/x-7z-compressed");
}
public sevenzipParser() {
@ -124,10 +128,14 @@ public class sevenzipParser extends AbstractParser implements Idiom {
}
}
public HashMap<String, String> getSupportedMimeTypes() {
public Set<String> supportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}
/**
 * Returns the file extensions this parser registers as supported.
 * The static SUPPORTED_EXTENSIONS set is returned directly, not as a copy.
 *
 * @return the shared set of supported file extensions
 */
public Set<String> supportedExtensions() {
return SUPPORTED_EXTENSIONS;
}
// wrapper class to redirect output of standard ArchiveExtractCallback to serverLog
// and parse the extracted content

@ -29,6 +29,9 @@ package de.anomic.document.parser;
import java.io.InputStream;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Set;
import pt.tumba.parser.swf.SWF2HTML;
import de.anomic.document.AbstractParser;
import de.anomic.document.Idiom;
@ -42,25 +45,28 @@ public class swfParser extends AbstractParser implements Idiom {
* a list of mime types that are supported by this parser class
* @see #supportedMimeTypes()
*/
public static final HashMap<String, String> SUPPORTED_MIME_TYPES = new HashMap<String, String>();
public static final Set<String> SUPPORTED_MIME_TYPES = new HashSet<String>();
public static final Set<String> SUPPORTED_EXTENSIONS = new HashSet<String>();
static {
SUPPORTED_MIME_TYPES.put("application/x-shockwave-flash","swf");
SUPPORTED_MIME_TYPES.put("application/x-shockwave-flash2-preview","swf");
SUPPORTED_MIME_TYPES.put("application/futuresplash","swf");
SUPPORTED_MIME_TYPES.put("image/vnd.rn-realflash","swf");
SUPPORTED_EXTENSIONS.add("swf");
SUPPORTED_MIME_TYPES.add("application/x-shockwave-flash");
SUPPORTED_MIME_TYPES.add("application/x-shockwave-flash2-preview");
SUPPORTED_MIME_TYPES.add("application/futuresplash");
SUPPORTED_MIME_TYPES.add("image/vnd.rn-realflash");
}
public swfParser() {
super("Adobe Flash Parser");
}
/**
* returns the set of mime types that are supported by this class
*/
public HashMap<String, String> getSupportedMimeTypes() {
public Set<String> supportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}
/**
 * Returns the file extensions this parser registers as supported.
 * The static SUPPORTED_EXTENSIONS set is returned directly, not as a copy.
 *
 * @return the shared set of supported file extensions
 */
public Set<String> supportedExtensions() {
return SUPPORTED_EXTENSIONS;
}
/*
* parses the source documents and returns a plasmaParserDocument containing
* all extracted information about the parsed document

@ -34,8 +34,10 @@ import java.io.InputStream;
import java.io.OutputStream;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.Map;
import java.util.Set;
import java.util.zip.GZIPInputStream;
import com.ice.tar.TarEntry;
@ -58,22 +60,28 @@ public class tarParser extends AbstractParser implements Idiom {
* a list of mime types that are supported by this parser class
* @see #supportedMimeTypes()
*/
public static final HashMap<String, String> SUPPORTED_MIME_TYPES = new HashMap<String, String>();
public static final Set<String> SUPPORTED_MIME_TYPES = new HashSet<String>();
public static final Set<String> SUPPORTED_EXTENSIONS = new HashSet<String>();
static {
SUPPORTED_MIME_TYPES.put("application/x-tar","tar");
SUPPORTED_MIME_TYPES.put("application/tar","tar");
SUPPORTED_MIME_TYPES.put("applicaton/x-gtar","tar");
SUPPORTED_MIME_TYPES.put("multipart/x-tar","tar");
SUPPORTED_EXTENSIONS.add("tar");
SUPPORTED_MIME_TYPES.add("application/x-tar");
SUPPORTED_MIME_TYPES.add("application/tar");
// GNU tar archives; the mime type was previously misspelled ("applicaton") and so never matched
SUPPORTED_MIME_TYPES.add("application/x-gtar");
SUPPORTED_MIME_TYPES.add("multipart/x-tar");
}
public tarParser() {
super("Tape Archive File Parser");
}
public HashMap<String, String> getSupportedMimeTypes() {
public Set<String> supportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}
/**
 * Returns the file extensions this parser registers as supported.
 * The static SUPPORTED_EXTENSIONS set is returned directly, not as a copy.
 *
 * @return the shared set of supported file extensions
 */
public Set<String> supportedExtensions() {
return SUPPORTED_EXTENSIONS;
}
public Document parse(final yacyURL location, final String mimeType, final String charset, InputStream source) throws ParserException, InterruptedException {
long docTextLength = 0;

@ -33,8 +33,10 @@ import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Set;
import de.anomic.crawler.HTTPLoader;
import de.anomic.document.AbstractParser;
@ -60,25 +62,31 @@ public class vcfParser extends AbstractParser implements Idiom {
*
* TODO: support of x-mozilla-cpt and x-mozilla-html tags
*/
public static final HashMap<String, String> SUPPORTED_MIME_TYPES = new HashMap<String, String>();
public static final Set<String> SUPPORTED_MIME_TYPES = new HashSet<String>();
public static final Set<String> SUPPORTED_EXTENSIONS = new HashSet<String>();
static {
SUPPORTED_MIME_TYPES.put("text/x-vcard","vcf");
SUPPORTED_MIME_TYPES.put("application/vcard","vcf");
SUPPORTED_MIME_TYPES.put("text/anytext","vcf");
SUPPORTED_MIME_TYPES.put("text/directory","vcf");
SUPPORTED_MIME_TYPES.put("application/x-versit","vcf");
SUPPORTED_MIME_TYPES.put("text/x-versit","vcf");
SUPPORTED_MIME_TYPES.put("text/x-vcalendar","vcf");
SUPPORTED_EXTENSIONS.add("vcf");
SUPPORTED_MIME_TYPES.add("text/x-vcard");
SUPPORTED_MIME_TYPES.add("application/vcard");
SUPPORTED_MIME_TYPES.add("text/anytext");
SUPPORTED_MIME_TYPES.add("text/directory");
SUPPORTED_MIME_TYPES.add("application/x-versit");
SUPPORTED_MIME_TYPES.add("text/x-versit");
SUPPORTED_MIME_TYPES.add("text/x-vcalendar");
}
public vcfParser() {
super("vCard Parser");
}
public HashMap<String, String> getSupportedMimeTypes() {
public Set<String> supportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}
/**
 * Returns the file extensions this parser registers as supported.
 * The static SUPPORTED_EXTENSIONS set is returned directly, not as a copy.
 *
 * @return the shared set of supported file extensions
 */
public Set<String> supportedExtensions() {
return SUPPORTED_EXTENSIONS;
}
public Document parse(final yacyURL url, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
try {

@ -28,7 +28,9 @@
package de.anomic.document.parser;
import java.io.InputStream;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Set;
import de.anomic.document.AbstractParser;
import de.anomic.document.Idiom;
import de.anomic.document.ParserException;
@ -43,29 +45,32 @@ public class vsdParser extends AbstractParser implements Idiom {
* a list of mime types that are supported by this parser class
* @see #supportedMimeTypes()
*/
public static final HashMap<String, String> SUPPORTED_MIME_TYPES = new HashMap<String, String>();
public static final Set<String> SUPPORTED_MIME_TYPES = new HashSet<String>();
public static final Set<String> SUPPORTED_EXTENSIONS = new HashSet<String>();
static {
SUPPORTED_MIME_TYPES.put("application/visio","vsd");
SUPPORTED_MIME_TYPES.put("application/x-visio","vsd");
SUPPORTED_MIME_TYPES.put("application/vnd.visio","vsd");
SUPPORTED_MIME_TYPES.put("application/visio.drawing","vsd");
SUPPORTED_MIME_TYPES.put("application/vsd","vsd");
SUPPORTED_MIME_TYPES.put("application/x-vsd","vsd");
SUPPORTED_MIME_TYPES.put("image/x-vsd","vsd");
SUPPORTED_MIME_TYPES.put("zz-application/zz-winassoc-vsd","vsd");
SUPPORTED_EXTENSIONS.add("vsd");
SUPPORTED_MIME_TYPES.add("application/visio");
SUPPORTED_MIME_TYPES.add("application/x-visio");
SUPPORTED_MIME_TYPES.add("application/vnd.visio");
SUPPORTED_MIME_TYPES.add("application/visio.drawing");
SUPPORTED_MIME_TYPES.add("application/vsd");
SUPPORTED_MIME_TYPES.add("application/x-vsd");
SUPPORTED_MIME_TYPES.add("image/x-vsd");
SUPPORTED_MIME_TYPES.add("zz-application/zz-winassoc-vsd");
}
public vsdParser() {
super("Microsoft Visio Parser");
}
/**
* returns the set of mime types that are supported by this class
*/
public HashMap<String, String> getSupportedMimeTypes() {
public Set<String> supportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}
/**
 * Returns the file extensions this parser registers as supported.
 * The static SUPPORTED_EXTENSIONS set is returned directly, not as a copy.
 *
 * @return the shared set of supported file extensions
 */
public Set<String> supportedExtensions() {
return SUPPORTED_EXTENSIONS;
}
/*
* parses the source documents and returns a plasmaParserDocument containing
* all extracted information about the parsed document

@ -28,7 +28,9 @@
package de.anomic.document.parser;
import java.io.InputStream;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Set;
import org.apache.poi.hssf.eventusermodel.HSSFEventFactory;
import org.apache.poi.hssf.eventusermodel.HSSFListener;
import org.apache.poi.hssf.eventusermodel.HSSFRequest;
@ -56,17 +58,19 @@ public class xlsParser extends AbstractParser implements Idiom, HSSFListener {
* a list of mime types that are supported by this parser class
* @see #supportedMimeTypes()
*/
public static final HashMap<String, String> SUPPORTED_MIME_TYPES = new HashMap<String, String>();
public static final Set<String> SUPPORTED_MIME_TYPES = new HashSet<String>();
public static final Set<String> SUPPORTED_EXTENSIONS = new HashSet<String>();
static {
String ext = "xls,xlsx";
SUPPORTED_MIME_TYPES.put("application/msexcel",ext);
SUPPORTED_MIME_TYPES.put("application/excel",ext);
SUPPORTED_MIME_TYPES.put("application/vnd.ms-excel",ext);
SUPPORTED_MIME_TYPES.put("application/x-excel",ext);
SUPPORTED_MIME_TYPES.put("application/x-msexcel",ext);
SUPPORTED_MIME_TYPES.put("application/x-ms-excel",ext);
SUPPORTED_MIME_TYPES.put("application/x-dos_ms_excel",ext);
SUPPORTED_MIME_TYPES.put("application/xls",ext);
SUPPORTED_EXTENSIONS.add("xls");
SUPPORTED_EXTENSIONS.add("xlsx");
SUPPORTED_MIME_TYPES.add("application/msexcel");
SUPPORTED_MIME_TYPES.add("application/excel");
SUPPORTED_MIME_TYPES.add("application/vnd.ms-excel");
SUPPORTED_MIME_TYPES.add("application/x-excel");
SUPPORTED_MIME_TYPES.add("application/x-msexcel");
SUPPORTED_MIME_TYPES.add("application/x-ms-excel");
SUPPORTED_MIME_TYPES.add("application/x-dos_ms_excel");
SUPPORTED_MIME_TYPES.add("application/xls");
}
public xlsParser(){
@ -134,10 +138,14 @@ public class xlsParser extends AbstractParser implements Idiom, HSSFListener {
}
}
public HashMap<String, String> getSupportedMimeTypes() {
public Set<String> supportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}
/**
 * Returns the file extensions this parser registers as supported.
 * The static SUPPORTED_EXTENSIONS set is returned directly, not as a copy.
 *
 * @return the shared set of supported file extensions
 */
public Set<String> supportedExtensions() {
return SUPPORTED_EXTENSIONS;
}
@Override
public void reset(){
//nothing to do

@ -34,8 +34,10 @@ import java.io.InputStream;
import java.io.OutputStream;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.Map;
import java.util.Set;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
@ -56,26 +58,31 @@ public class zipParser extends AbstractParser implements Idiom {
* a list of mime types that are supported by this parser class
* @see #supportedMimeTypes()
*/
public static final HashMap<String, String> SUPPORTED_MIME_TYPES = new HashMap<String, String>();
public static final Set<String> SUPPORTED_MIME_TYPES = new HashSet<String>();
public static final Set<String> SUPPORTED_EXTENSIONS = new HashSet<String>();
static {
SUPPORTED_MIME_TYPES.put("application/zip","zip");
SUPPORTED_MIME_TYPES.put("application/x-zip","zip");
SUPPORTED_MIME_TYPES.put("application/x-zip-compressed","zip");
SUPPORTED_MIME_TYPES.put("application/octet-stream","zip");
SUPPORTED_MIME_TYPES.put("application/x-compress","zip");
SUPPORTED_MIME_TYPES.put("application/x-compressed","zip");
SUPPORTED_MIME_TYPES.put("multipart/x-zip","zip");
SUPPORTED_MIME_TYPES.put("application/java-archive","jar");
SUPPORTED_EXTENSIONS.add("zip");
// application/java-archive is registered as a mime type below; keep its "jar" extension,
// which the former mime-type-to-extension map carried, so .jar URLs are still recognized
SUPPORTED_EXTENSIONS.add("jar");
SUPPORTED_MIME_TYPES.add("application/zip");
SUPPORTED_MIME_TYPES.add("application/x-zip");
SUPPORTED_MIME_TYPES.add("application/x-zip-compressed");
SUPPORTED_MIME_TYPES.add("application/x-compress");
SUPPORTED_MIME_TYPES.add("application/x-compressed");
SUPPORTED_MIME_TYPES.add("multipart/x-zip");
SUPPORTED_MIME_TYPES.add("application/java-archive");
}
public zipParser() {
super("Compressed Archive File Parser");
super("ZIP File Parser");
}
public HashMap<String, String> getSupportedMimeTypes() {
public Set<String> supportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}
/**
 * Returns the file extensions this parser registers as supported.
 * The static SUPPORTED_EXTENSIONS set is returned directly, not as a copy.
 *
 * @return the shared set of supported file extensions
 */
public Set<String> supportedExtensions() {
return SUPPORTED_EXTENSIONS;
}
public Document parse(final yacyURL location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
long docTextLength = 0;

@ -528,7 +528,7 @@ public final class httpdProxyHandler {
final String storeError = cacheEntry.shallStoreCacheForProxy();
final boolean storeHTCache = cacheEntry.profile().storeHTCache();
final boolean isSupportedContent = Parser.supportsExtension(cacheEntry.url()) && Parser.supportsMime(cacheEntry.getMimeType());
final String supportError = Parser.supports(cacheEntry.url(), cacheEntry.getMimeType());
if (
/*
* Now we store the response into the htcache directory if
@ -539,7 +539,7 @@ public final class httpdProxyHandler {
* b) the user has configured to use the htcache OR
* c) the content should be indexed
*/
((storeHTCache) || (isSupportedContent))
// supportError is null when URL and mime type are supported by a parser,
// so "the content should be indexed" corresponds to supportError == null
((storeHTCache) || (supportError == null))
) {
// we don't write actually into a file, only to RAM, and schedule writing the file.
int l = res.getResponseHeader().size();
@ -580,7 +580,7 @@ public final class httpdProxyHandler {
if (theLogger.isFine()) theLogger.logFine(reqID +" "+ url.toString() + " not cached." +
" StoreError=" + ((storeError==null)?"None":storeError) +
" StoreHTCache=" + storeHTCache +
" SupportetContent=" + isSupportedContent);
" SupportError=" + supportError);
FileUtils.copy(res.getDataAsStream(), outStream);

@ -42,7 +42,6 @@ import java.util.HashMap;
import java.util.Map;
import de.anomic.document.Classification;
import de.anomic.document.Parser;
import de.anomic.http.httpResponseHeader;
import de.anomic.http.httpDocument;
import de.anomic.kelondro.blob.ArrayStack;
@ -181,10 +180,6 @@ public final class plasmaHTCache {
return mimeType.toUpperCase().startsWith("IMAGE");
}
public static boolean isText(final String mimeType) {
return Parser.supportsMime(mimeType);
}
public static boolean noIndexingURL(final yacyURL url) {
if (url == null) return false;
String urlString = url.toString().toLowerCase();

@ -1087,8 +1087,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
*
* Testing if the content type is supported by the available parsers
* ========================================================================= */
final boolean isSupportedContent = Parser.supportsExtension(entry.url()) && Parser.supportsMime(entry.getMimeType());
if (log.isFinest()) log.logFinest("STORE "+ entry.url() +" content of type "+ entry.getMimeType() +" is supported: "+ isSupportedContent);
final String supportError = Parser.supports(entry.url(), entry.getMimeType());
// supportError is null when the content is supported; otherwise it holds the rejection reason
if (log.isFinest()) log.logFinest("STORE "+ entry.url() +" content of type "+ entry.getMimeType() + " is supported: " + ((supportError == null) ? "true" : "false (" + supportError + ")"));
/* =========================================================================
* INDEX CONTROL HEADER
@ -1121,7 +1121,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
* a) the user has configured to use the htcache or
* b) the content should be indexed
* ========================================================================= */
if (((entry.profile() != null) && (entry.profile().storeHTCache())) || (doIndexing && isSupportedContent)) {
if (((entry.profile() != null) && (entry.profile().storeHTCache())) || (doIndexing && supportError == null)) {
// store response header
/*
if (entry.writeResourceInfo()) {
@ -1146,7 +1146,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
/* =========================================================================
* INDEXING
* ========================================================================= */
if (doIndexing && isSupportedContent) {
if (doIndexing && supportError == null) {
// enqueue for further crawling
enQueue(this.crawler.queuePreStack.newEntry(

@ -865,17 +865,15 @@ public class SnippetCache {
}
// STEP 3: if the metadata is still null try to guess the mimeType of the resource
if (responseHeader == null) {
if (Parser.supportsExtension(url)) {
String supposedMime = Parser.mimeOf(url);
return Parser.parseSource(url, supposedMime, null, contentLength, resourceStream);
}
String supportError = Parser.supports(url, responseHeader == null ? null : responseHeader.mime());
if (supportError != null) {
log.logInfo("could not generate snippet for " + url.toNormalform(true, false) + ": " + supportError);
return null;
}
if (Parser.supportsMime(responseHeader.mime())) {
return Parser.parseSource(url, responseHeader.mime(), responseHeader.getCharacterEncoding(), contentLength, resourceStream);
if (responseHeader == null) {
return Parser.parseSource(url, null, null, contentLength, resourceStream);
}
return null;
return Parser.parseSource(url, responseHeader.mime(), responseHeader.getCharacterEncoding(), contentLength, resourceStream);
} catch (final InterruptedException e) {
// interruption of thread detected
return null;

Loading…
Cancel
Save