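Enhanced the document parsers with additional supported file extensions and MIME types, and made the HTTPLoader rejection messages report the parser's own support error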

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6214 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 16 years ago
parent aee35bff6f
commit 43c8defd79

@ -18,7 +18,6 @@
<classpathentry exported="true" kind="lib" path="lib/commons-fileupload-1.2.1.jar"/> <classpathentry exported="true" kind="lib" path="lib/commons-fileupload-1.2.1.jar"/>
<classpathentry exported="true" kind="lib" path="lib/servlet-api.jar"/> <classpathentry exported="true" kind="lib" path="lib/servlet-api.jar"/>
<classpathentry exported="true" kind="lib" path="lib/commons-jxpath-1.1.jar"/> <classpathentry exported="true" kind="lib" path="lib/commons-jxpath-1.1.jar"/>
<classpathentry exported="true" kind="lib" path="lib/sbbi-upnplib-1.0.4.jar"/>
<classpathentry kind="lib" path="lib/xerces.jar"/> <classpathentry kind="lib" path="lib/xerces.jar"/>
<classpathentry kind="lib" path="lib/bzip2.jar"/> <classpathentry kind="lib" path="lib/bzip2.jar"/>
<classpathentry kind="lib" path="lib/mysql-connector-java-5.1.7-bin.jar"/> <classpathentry kind="lib" path="lib/mysql-connector-java-5.1.7-bin.jar"/>
@ -37,5 +36,6 @@
<classpathentry kind="lib" path="lib/odf_utils_05_11_29.jar"/> <classpathentry kind="lib" path="lib/odf_utils_05_11_29.jar"/>
<classpathentry kind="lib" path="lib/jrpm-SNAPSHOT.jar"/> <classpathentry kind="lib" path="lib/jrpm-SNAPSHOT.jar"/>
<classpathentry kind="lib" path="lib/activation.jar"/> <classpathentry kind="lib" path="lib/activation.jar"/>
<classpathentry kind="lib" path="lib/sbbi-upnplib-1.0.4.jar"/>
<classpathentry kind="output" path="gen"/> <classpathentry kind="output" path="gen"/>
</classpath> </classpath>

@ -123,7 +123,7 @@ public final class HTTPLoader {
String supportError = Parser.supportsExtension(entry.url()); String supportError = Parser.supportsExtension(entry.url());
if (supportError != null) { if (supportError != null) {
sb.crawlQueues.errorURL.newEntry(entry, sb.peers.mySeed().hash, new Date(), 1, supportError); sb.crawlQueues.errorURL.newEntry(entry, sb.peers.mySeed().hash, new Date(), 1, supportError);
throw new IOException("REJECTED WRONG EXTENSION TYPE " + entry.url().getFileExtension()+ " for URL " + entry.url().toString()); throw new IOException("REJECTED WRONG EXTENSION TYPE: " + supportError);
} }
// check if url is in blacklist // check if url is in blacklist
@ -170,7 +170,7 @@ public final class HTTPLoader {
supportError = Parser.supports(entry.url(), res.getResponseHeader().mime()); supportError = Parser.supports(entry.url(), res.getResponseHeader().mime());
if (supportError != null) { if (supportError != null) {
sb.crawlQueues.errorURL.newEntry(entry, sb.peers.mySeed().hash, new Date(), 1, supportError); sb.crawlQueues.errorURL.newEntry(entry, sb.peers.mySeed().hash, new Date(), 1, supportError);
throw new IOException("REJECTED WRONG MIME TYPE " + res.getResponseHeader().mime() + " for URL " + entry.url().toString()); throw new IOException("REJECTED WRONG MIME TYPE: " + supportError);
} }
/* /*

@ -69,6 +69,7 @@ public class htmlParser extends AbstractParser implements Idiom {
SUPPORTED_MIME_TYPES.add("text/html"); SUPPORTED_MIME_TYPES.add("text/html");
SUPPORTED_MIME_TYPES.add("text/plain"); SUPPORTED_MIME_TYPES.add("text/plain");
SUPPORTED_MIME_TYPES.add("text/sgml"); SUPPORTED_MIME_TYPES.add("text/sgml");
SUPPORTED_MIME_TYPES.add("text/csv");
} }
public htmlParser() { public htmlParser() {

@ -68,10 +68,14 @@ public class odtParser extends AbstractParser implements Idiom {
SUPPORTED_EXTENSIONS.add("odt"); SUPPORTED_EXTENSIONS.add("odt");
SUPPORTED_EXTENSIONS.add("ods"); SUPPORTED_EXTENSIONS.add("ods");
SUPPORTED_EXTENSIONS.add("odp"); SUPPORTED_EXTENSIONS.add("odp");
SUPPORTED_EXTENSIONS.add("sxw"); // Star Office Writer file format
SUPPORTED_EXTENSIONS.add("sxc"); // Star Office Calc file format
SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.text"); SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.text");
SUPPORTED_MIME_TYPES.add("application/x-vnd.oasis.opendocument.text"); SUPPORTED_MIME_TYPES.add("application/x-vnd.oasis.opendocument.text");
SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.presentation"); SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.presentation");
SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.spreadsheet"); SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.spreadsheet");
SUPPORTED_MIME_TYPES.add("application/OOo-calc");
SUPPORTED_MIME_TYPES.add("application/OOo-writer");
} }
public odtParser() { public odtParser() {

@ -54,6 +54,7 @@ public class psParser extends AbstractParser implements Idiom {
public static final Set<String> SUPPORTED_EXTENSIONS = new HashSet<String>(); public static final Set<String> SUPPORTED_EXTENSIONS = new HashSet<String>();
static { static {
SUPPORTED_EXTENSIONS.add("ps"); SUPPORTED_EXTENSIONS.add("ps");
SUPPORTED_MIME_TYPES.add("application/postscript");
SUPPORTED_MIME_TYPES.add("application/ps"); SUPPORTED_MIME_TYPES.add("application/ps");
SUPPORTED_MIME_TYPES.add("application/x-postscript"); SUPPORTED_MIME_TYPES.add("application/x-postscript");
SUPPORTED_MIME_TYPES.add("application/x-ps"); SUPPORTED_MIME_TYPES.add("application/x-ps");

@ -50,10 +50,10 @@ public class rtfParser extends AbstractParser implements Idiom {
public static final Set<String> SUPPORTED_EXTENSIONS = new HashSet<String>(); public static final Set<String> SUPPORTED_EXTENSIONS = new HashSet<String>();
static { static {
SUPPORTED_EXTENSIONS.add("rtf"); SUPPORTED_EXTENSIONS.add("rtf");
SUPPORTED_MIME_TYPES.add("application/rtf");
SUPPORTED_MIME_TYPES.add("text/rtf"); SUPPORTED_MIME_TYPES.add("text/rtf");
SUPPORTED_MIME_TYPES.add("application/x-rtf");
SUPPORTED_MIME_TYPES.add("text/richtext"); SUPPORTED_MIME_TYPES.add("text/richtext");
SUPPORTED_MIME_TYPES.add("application/rtf");
SUPPORTED_MIME_TYPES.add("application/x-rtf");
SUPPORTED_MIME_TYPES.add("application/x-soffice"); SUPPORTED_MIME_TYPES.add("application/x-soffice");
} }

@ -49,6 +49,9 @@ public class vsdParser extends AbstractParser implements Idiom {
public static final Set<String> SUPPORTED_EXTENSIONS = new HashSet<String>(); public static final Set<String> SUPPORTED_EXTENSIONS = new HashSet<String>();
static { static {
SUPPORTED_EXTENSIONS.add("vsd"); SUPPORTED_EXTENSIONS.add("vsd");
SUPPORTED_EXTENSIONS.add("vst");
SUPPORTED_EXTENSIONS.add("vdx");
SUPPORTED_EXTENSIONS.add("vtx");
SUPPORTED_MIME_TYPES.add("application/visio"); SUPPORTED_MIME_TYPES.add("application/visio");
SUPPORTED_MIME_TYPES.add("application/x-visio"); SUPPORTED_MIME_TYPES.add("application/x-visio");
SUPPORTED_MIME_TYPES.add("application/vnd.visio"); SUPPORTED_MIME_TYPES.add("application/vnd.visio");

Loading…
Cancel
Save