From 43c8defd7932d5c0ce9d9ec137d328409c62d4d7 Mon Sep 17 00:00:00 2001 From: orbiter Date: Tue, 14 Jul 2009 13:32:53 +0000 Subject: [PATCH] enhanced parser with more extension + mime attributes git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6214 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- .classpath | 2 +- source/de/anomic/crawler/HTTPLoader.java | 4 ++-- source/de/anomic/document/parser/htmlParser.java | 1 + source/de/anomic/document/parser/odtParser.java | 4 ++++ source/de/anomic/document/parser/psParser.java | 1 + source/de/anomic/document/parser/rtfParser.java | 4 ++-- source/de/anomic/document/parser/vsdParser.java | 3 +++ 7 files changed, 14 insertions(+), 5 deletions(-) diff --git a/.classpath b/.classpath index b4f58cd80..1b443b512 100644 --- a/.classpath +++ b/.classpath @@ -18,7 +18,6 @@ - @@ -37,5 +36,6 @@ + diff --git a/source/de/anomic/crawler/HTTPLoader.java b/source/de/anomic/crawler/HTTPLoader.java index acdadf150..94f17361b 100644 --- a/source/de/anomic/crawler/HTTPLoader.java +++ b/source/de/anomic/crawler/HTTPLoader.java @@ -123,7 +123,7 @@ public final class HTTPLoader { String supportError = Parser.supportsExtension(entry.url()); if (supportError != null) { sb.crawlQueues.errorURL.newEntry(entry, sb.peers.mySeed().hash, new Date(), 1, supportError); - throw new IOException("REJECTED WRONG EXTENSION TYPE " + entry.url().getFileExtension()+ " for URL " + entry.url().toString()); + throw new IOException("REJECTED WRONG EXTENSION TYPE: " + supportError); } // check if url is in blacklist @@ -170,7 +170,7 @@ public final class HTTPLoader { supportError = Parser.supports(entry.url(), res.getResponseHeader().mime()); if (supportError != null) { sb.crawlQueues.errorURL.newEntry(entry, sb.peers.mySeed().hash, new Date(), 1, supportError); - throw new IOException("REJECTED WRONG MIME TYPE " + res.getResponseHeader().mime() + " for URL " + entry.url().toString()); + throw new IOException("REJECTED WRONG MIME TYPE: " + supportError); } /* diff --git a/source/de/anomic/document/parser/htmlParser.java b/source/de/anomic/document/parser/htmlParser.java index 3e686eb93..2c6529fbc 100644 --- a/source/de/anomic/document/parser/htmlParser.java +++ b/source/de/anomic/document/parser/htmlParser.java @@ -69,6 +69,7 @@ public class htmlParser extends AbstractParser implements Idiom { SUPPORTED_MIME_TYPES.add("text/html"); SUPPORTED_MIME_TYPES.add("text/plain"); SUPPORTED_MIME_TYPES.add("text/sgml"); + SUPPORTED_MIME_TYPES.add("text/csv"); } public htmlParser() { diff --git a/source/de/anomic/document/parser/odtParser.java b/source/de/anomic/document/parser/odtParser.java index a65244f99..8f918fc33 100644 --- a/source/de/anomic/document/parser/odtParser.java +++ b/source/de/anomic/document/parser/odtParser.java @@ -68,10 +68,14 @@ public class odtParser extends AbstractParser implements Idiom { SUPPORTED_EXTENSIONS.add("odt"); SUPPORTED_EXTENSIONS.add("ods"); SUPPORTED_EXTENSIONS.add("odp"); + SUPPORTED_EXTENSIONS.add("sxw"); // Star Office Writer file format + SUPPORTED_EXTENSIONS.add("sxc"); // Star Office Calc file format SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.text"); SUPPORTED_MIME_TYPES.add("application/x-vnd.oasis.opendocument.text"); SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.presentation"); SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.spreadsheet"); + SUPPORTED_MIME_TYPES.add("application/OOo-calc"); + SUPPORTED_MIME_TYPES.add("application/OOo-writer"); } public odtParser() { diff --git a/source/de/anomic/document/parser/psParser.java b/source/de/anomic/document/parser/psParser.java index c6c8fdf09..53f965d7d 100644 --- a/source/de/anomic/document/parser/psParser.java +++ b/source/de/anomic/document/parser/psParser.java @@ -54,6 +54,7 @@ public class psParser extends AbstractParser implements Idiom { public static final Set SUPPORTED_EXTENSIONS = new HashSet(); static { SUPPORTED_EXTENSIONS.add("ps"); + SUPPORTED_MIME_TYPES.add("application/postscript"); SUPPORTED_MIME_TYPES.add("application/ps"); SUPPORTED_MIME_TYPES.add("application/x-postscript"); SUPPORTED_MIME_TYPES.add("application/x-ps"); diff --git a/source/de/anomic/document/parser/rtfParser.java b/source/de/anomic/document/parser/rtfParser.java index f795904c2..63de790d8 100644 --- a/source/de/anomic/document/parser/rtfParser.java +++ b/source/de/anomic/document/parser/rtfParser.java @@ -50,10 +50,10 @@ public class rtfParser extends AbstractParser implements Idiom { public static final Set SUPPORTED_EXTENSIONS = new HashSet(); static { SUPPORTED_EXTENSIONS.add("rtf"); - SUPPORTED_MIME_TYPES.add("application/rtf"); SUPPORTED_MIME_TYPES.add("text/rtf"); - SUPPORTED_MIME_TYPES.add("application/x-rtf"); SUPPORTED_MIME_TYPES.add("text/richtext"); + SUPPORTED_MIME_TYPES.add("application/rtf"); + SUPPORTED_MIME_TYPES.add("application/x-rtf"); SUPPORTED_MIME_TYPES.add("application/x-soffice"); } diff --git a/source/de/anomic/document/parser/vsdParser.java b/source/de/anomic/document/parser/vsdParser.java index e50604532..e133e1146 100644 --- a/source/de/anomic/document/parser/vsdParser.java +++ b/source/de/anomic/document/parser/vsdParser.java @@ -49,6 +49,9 @@ public class vsdParser extends AbstractParser implements Idiom { public static final Set SUPPORTED_EXTENSIONS = new HashSet(); static { SUPPORTED_EXTENSIONS.add("vsd"); + SUPPORTED_EXTENSIONS.add("vst"); + SUPPORTED_EXTENSIONS.add("vdx"); + SUPPORTED_EXTENSIONS.add("vtx"); SUPPORTED_MIME_TYPES.add("application/visio"); SUPPORTED_MIME_TYPES.add("application/x-visio"); SUPPORTED_MIME_TYPES.add("application/vnd.visio");