From 1432a817dd2fd94bea3a6a3f145d7efe968ee727 Mon Sep 17 00:00:00 2001 From: reger Date: Thu, 8 May 2014 22:21:24 +0200 Subject: [PATCH] respect "index media" switched off in CrawlStartExpert.html fix http://mantis.tokeek.de/view.php?id=64 --- source/net/yacy/search/Switchboard.java | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 3b335fafe..5c5071a3c 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -2655,6 +2655,19 @@ public final class Switchboard extends serverSwitch { if (!profile.indexText() && !profile.indexMedia()) { if (this.log.isInfo()) this.log.info("Not Condensed Resource '" + urls + "': indexing of this media type not wanted by crawl profile"); return new IndexingQueueEntry(in.queueEntry, in.documents, null); + } else if (!profile.indexMedia()) { // text indexing is enabled but media indexing is disabled: skip media resources + // first check: recognize media by the file extension of the URL + if (Classification.isMediaExtension(MultiProtocolURL.getFileExtension(in.queueEntry.url().getFileName()))) { + if (this.log.isInfo()) this.log.info("Not Condensed Resource '" + urls + "': indexing of media files not wanted by crawl profile"); + return new IndexingQueueEntry(in.queueEntry, in.documents, null); + } + // second check: recognize media by MIME type, in case the URL has no file extension + Classification.ContentDomain cd = Classification.getContentDomainFromMime(in.queueEntry.getMimeType()); + // ContentDomain.APP (derived from the MIME type) is deliberately not excluded, so that PDF, Word etc. documents are still indexed
+ if (cd == Classification.ContentDomain.IMAGE || cd == Classification.ContentDomain.VIDEO || cd == Classification.ContentDomain.AUDIO) { + if (this.log.isInfo()) this.log.info("Not Condensed Resource '" + urls + "': indexing of media not wanted by crawl profile"); + return new IndexingQueueEntry(in.queueEntry, in.documents, null); + } } if (!(profile.indexUrlMustMatchPattern() == CrawlProfile.MATCH_ALL_PATTERN || profile.indexUrlMustMatchPattern().matcher(urls).matches()) || (profile.indexUrlMustNotMatchPattern() != CrawlProfile.MATCH_NEVER_PATTERN && profile.indexUrlMustNotMatchPattern().matcher(urls).matches())) {