diff --git a/defaults/yacy.init b/defaults/yacy.init index 7a51c6ced..4671e41b7 100644 --- a/defaults/yacy.init +++ b/defaults/yacy.init @@ -799,7 +799,7 @@ search.verify.delete = true # images may be treated either as documents that are shown in search results or as objects # that are only visible in special search environments, like image search search.excludeintext.image = true -crawler.load.image = true; +crawler.load.image = true # remote search details remotesearch.maxcount = 10 diff --git a/htroot/CrawlStartExpert_p.java b/htroot/CrawlStartExpert_p.java index 32aaef62e..adb2de9f4 100644 --- a/htroot/CrawlStartExpert_p.java +++ b/htroot/CrawlStartExpert_p.java @@ -31,14 +31,13 @@ import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.RequestHeader; import net.yacy.crawler.data.CrawlProfile; import net.yacy.search.Switchboard; -import net.yacy.search.SwitchboardConstants; import net.yacy.search.schema.CollectionSchema; import net.yacy.server.serverObjects; import net.yacy.server.serverSwitch; public class CrawlStartExpert_p { - public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, @SuppressWarnings("unused") final serverObjects post, final serverSwitch env) { + public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, final serverObjects post, final serverSwitch env) { // return variable that accumulates replacements final Switchboard sb = (Switchboard) env; final serverObjects prop = new serverObjects(); diff --git a/source/net/yacy/cora/document/analysis/Classification.java b/source/net/yacy/cora/document/analysis/Classification.java index 470c1e98f..30fe7b8b3 100644 --- a/source/net/yacy/cora/document/analysis/Classification.java +++ b/source/net/yacy/cora/document/analysis/Classification.java @@ -149,7 +149,13 @@ public class Classification { return textExtSet.contains(ext) || mediaExtSet.contains(ext) || ctrlExtSet.contains(ext); } - public static ContentDomain getContentDomain(final String ext) { + /** + * Get the content domain of a document according to the file extension. + * This can produce wrong results because the extension is a weak hint for the content domain. + * If possible, use the mime type, call Classification.getContentDomainFromMime() + * @return the content domain which classifies the content type + */ + public static ContentDomain getContentDomainFromExt(final String ext) { if (isTextExtension(ext)) return ContentDomain.TEXT; if (isImageExtension(ext)) return ContentDomain.IMAGE; if (isAudioExtension(ext)) return ContentDomain.AUDIO; @@ -159,6 +165,19 @@ public class Classification { return ContentDomain.ALL; } + /** + * Get the content domain of a document according to the mime type. + * @return the content domain which classifies the content type + */ + public static ContentDomain getContentDomainFromMime(final String mime) { + if (mime.startsWith("text/")) return ContentDomain.TEXT; + if (mime.startsWith("image/")) return ContentDomain.IMAGE; + if (mime.startsWith("audio/")) return ContentDomain.AUDIO; + if (mime.startsWith("video/")) return ContentDomain.VIDEO; + if (mime.startsWith("application/")) return ContentDomain.APP; + return ContentDomain.ALL; + } + public static boolean isPictureMime(final String mimeType) { if (mimeType == null) return false; return mimeType.toUpperCase().startsWith("IMAGE"); diff --git a/source/net/yacy/cora/document/id/MultiProtocolURL.java b/source/net/yacy/cora/document/id/MultiProtocolURL.java index 86400e504..de91810f7 100644 --- a/source/net/yacy/cora/document/id/MultiProtocolURL.java +++ b/source/net/yacy/cora/document/id/MultiProtocolURL.java @@ -269,9 +269,15 @@ public class MultiProtocolURL implements Serializable, Comparable= 0 && entry.size() > maxFileSize) || - entry.url().getContentDomain() == ContentDomain.APP || - (!loadImages && entry.url().getContentDomain() == ContentDomain.IMAGE) || - entry.url().getContentDomain() == ContentDomain.AUDIO || - entry.url().getContentDomain() == ContentDomain.VIDEO || - entry.url().getContentDomain() == ContentDomain.CTRL) { + contentDomain == ContentDomain.APP || + (!loadImages && contentDomain == ContentDomain.IMAGE) || + contentDomain == ContentDomain.AUDIO || + contentDomain == ContentDomain.VIDEO || + contentDomain == ContentDomain.CTRL) { warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.NOLOAD, entry, profile, this.robots); //if (warning != null && this.log.isFine()) this.log.logFine("CrawlStacker.stackCrawl of URL " + entry.url().toNormalform(true, false) + " - not pushed: " + warning); return null; diff --git a/source/net/yacy/data/ymark/YMarkMetadata.java b/source/net/yacy/data/ymark/YMarkMetadata.java index 3bc88dd7b..955eb80e3 100644 --- a/source/net/yacy/data/ymark/YMarkMetadata.java +++ b/source/net/yacy/data/ymark/YMarkMetadata.java @@ -99,7 +99,8 @@ public class YMarkMetadata { if(this.document == null) { Response response = null; response = loader.load(loader.request(this.uri, true, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, null, agent); - this.document = Document.mergeDocuments(response.url(), response.getMimeType(), response.parse()); + Document[] docs = response.parse(); + this.document = Document.mergeDocuments(response.url(), response.getMimeType(), docs); } return this.document; } diff --git a/source/net/yacy/document/Condenser.java b/source/net/yacy/document/Condenser.java index 80591758a..21e2ab26e 100644 --- a/source/net/yacy/document/Condenser.java +++ b/source/net/yacy/document/Condenser.java @@ -102,10 +102,11 @@ public final class Condenser { this.RESULT_FLAGS = new Bitfield(4); // construct flag set for document - if (document.dc_source().getContentDomain() == ContentDomain.IMAGE || !document.getImages().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasimage, true); - if (document.dc_source().getContentDomain() == ContentDomain.AUDIO || !document.getAudiolinks().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasaudio, true); - if (document.dc_source().getContentDomain() == ContentDomain.VIDEO || !document.getVideolinks().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasvideo, true); - if (document.dc_source().getContentDomain() == ContentDomain.APP || !document.getApplinks().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasapp, true); + ContentDomain contentDomain = document.getContentDomain(); + if (contentDomain == ContentDomain.IMAGE || !document.getImages().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasimage, true); + if (contentDomain == ContentDomain.AUDIO || !document.getAudiolinks().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasaudio, true); + if (contentDomain == ContentDomain.VIDEO || !document.getVideolinks().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasvideo, true); + if (contentDomain == ContentDomain.APP || !document.getApplinks().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasapp, true); if (document.lat() != 0.0 && document.lon() != 0.0) this.RESULT_FLAGS.set(flag_cat_haslocation, true); this.languageIdentificator = new Identificator(); diff --git a/source/net/yacy/document/Document.java b/source/net/yacy/document/Document.java index af82bbb56..6ef0b6262 100644 --- a/source/net/yacy/document/Document.java +++ b/source/net/yacy/document/Document.java @@ -50,6 +50,7 @@ import java.util.TreeSet; import net.yacy.cora.date.ISO8601Formatter; import net.yacy.cora.document.analysis.Classification; +import net.yacy.cora.document.analysis.Classification.ContentDomain; import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.document.id.AnchorURL; import net.yacy.cora.document.id.DigestURL; @@ -145,6 +146,17 @@ public class Document { this.date = date == null ? new Date() : date; } + /** + * Get the content domain of a document. This tries to get the content domain from the mime type + * and if this fails it uses alternatively the content domain from the file extension. + * @return the content domain which classifies the content type + */ + public ContentDomain getContentDomain() { + ContentDomain contentDomain = Classification.getContentDomainFromMime(this.mimeType); + if (contentDomain != ContentDomain.ALL) return contentDomain; + return this.dc_source().getContentDomainFromExt(); + } + public Object getParserObject() { return this.parserObject; } @@ -480,7 +492,7 @@ dc_rights this.applinks = new LinkedHashMap(); this.emaillinks = new LinkedHashMap(); final Map collectedImages = new HashMap(); // this is a set that is collected now and joined later to the imagelinks - for (final Map.Entry entry: collectedImages.entrySet()) { + for (final Map.Entry entry: this.images.entrySet()) { if (entry.getKey().getHost().equals(thishost)) this.inboundlinks.put(entry.getKey(), "image"); else this.outboundlinks.put(entry.getKey(), "image"); } for (final AnchorURL url: this.anchors) { diff --git a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java index 3e3c73b42..e24acd7ce 100644 --- a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java +++ b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java @@ -38,7 +38,6 @@ import net.yacy.cora.document.analysis.Classification.ContentDomain; import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.document.id.DigestURL; -import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.federate.solr.SolrType; import net.yacy.cora.lod.vocabulary.Tagging; import net.yacy.cora.order.Base64Order; @@ -92,6 +91,20 @@ public class URIMetadataNode { this.word = searchedWord; this.ranking = ranking; } + + /** + * Get the content domain of a document. This tries to get the content domain from the mime type + * and if this fails it uses alternatively the content domain from the file extension. + * @return the content domain which classifies the content type + */ + public ContentDomain getContentDomain() { + if (this.doc == null) return this.url.getContentDomainFromExt(); + String mime = mime(); + if (mime == null) return this.url.getContentDomainFromExt(); + ContentDomain contentDomain = Classification.getContentDomainFromMime(mime); + if (contentDomain != ContentDomain.ALL) return contentDomain; + return this.url.getContentDomainFromExt(); + } public SolrDocument getDocument() { return this.doc; @@ -183,6 +196,11 @@ public class URIMetadataNode { return Response.docType(a.get(0)); } + public String mime() { + ArrayList mime = getStringList(CollectionSchema.content_type); + return mime == null || mime.size() == 0 ? null : mime.get(0); + } + public byte[] language() { String language = getString(CollectionSchema.language_s); if (language == null || language.length() == 0) return ASCII.getBytes("en"); @@ -203,7 +221,7 @@ public class URIMetadataNode { if (flags == null) { this.flags = new Bitfield(); if (dc_subject() != null && dc_subject().indexOf("indexof") >= 0) this.flags.set(Condenser.flag_cat_indexof, true); - ContentDomain cd = Classification.getContentDomain(MultiProtocolURL.getFileExtension(this.url().getFileName())); + ContentDomain cd = getContentDomain(); if (lon() != 0.0d || lat() != 0.0d) this.flags.set(Condenser.flag_cat_haslocation, true); if (cd == ContentDomain.IMAGE || limage() > 0) this.flags.set(Condenser.flag_cat_hasimage, true); if (cd == ContentDomain.AUDIO || laudio() > 0) this.flags.set(Condenser.flag_cat_hasaudio, true); diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 9ad3c8804..e5a36a242 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -2553,7 +2553,7 @@ public final class Switchboard extends serverSwitch { ) { // get the hyperlinks final Map hl = Document.getHyperlinks(documents); - boolean loadImages = getConfigBool("crawler.load.image", true); + boolean loadImages = getConfigBool(SwitchboardConstants.CRAWLER_LOAD_IMAGE, true); if (loadImages) hl.putAll(Document.getImagelinks(documents)); // add all media links also to the crawl stack. They will be re-sorted to the NOLOAD queue and indexed afterwards as pure links diff --git a/source/net/yacy/search/SwitchboardConstants.java b/source/net/yacy/search/SwitchboardConstants.java index d6dfd24c8..a4ac06708 100644 --- a/source/net/yacy/search/SwitchboardConstants.java +++ b/source/net/yacy/search/SwitchboardConstants.java @@ -323,6 +323,7 @@ public final class SwitchboardConstants { *

public static final String CRAWLER_THREADS_ACTIVE_MAX = "crawler.MaxActiveThreads"

*

Name of the setting how many active crawler-threads may maximal be running on the same time

*/ + public static final String CRAWLER_LOAD_IMAGE = "crawler.load.image"; public static final String CRAWLER_THREADS_ACTIVE_MAX = "crawler.MaxActiveThreads"; public static final String CRAWLER_FOLLOW_REDIRECTS = "crawler.http.FollowRedirects"; // ignore the target url and follow to the redirect public static final String CRAWLER_RECORD_REDIRECTS = "crawler.http.RecordRedirects"; // record the ignored redirected page to the index store diff --git a/source/net/yacy/search/query/SearchEvent.java b/source/net/yacy/search/query/SearchEvent.java index bc1a5a584..73c794b55 100644 --- a/source/net/yacy/search/query/SearchEvent.java +++ b/source/net/yacy/search/query/SearchEvent.java @@ -1040,12 +1040,13 @@ public final class SearchEvent { } // check content domain + ContentDomain contentDomain = page.getContentDomain(); if (this.query.contentdom.getCode() > 0 && ( - (this.query.contentdom == Classification.ContentDomain.IMAGE && page.url().getContentDomain() != Classification.ContentDomain.IMAGE) || - (this.query.contentdom == Classification.ContentDomain.AUDIO && page.url().getContentDomain() != Classification.ContentDomain.AUDIO) || - (this.query.contentdom == Classification.ContentDomain.VIDEO && page.url().getContentDomain() != Classification.ContentDomain.VIDEO) || - (this.query.contentdom == Classification.ContentDomain.APP && page.url().getContentDomain() != Classification.ContentDomain.APP)) && this.query.urlMask_isCatchall) { - if (log.isFine()) log.fine("dropped RWI: wrong contentdom = " + this.query.contentdom + ", domain = " + page.url().getContentDomain()); + (this.query.contentdom == Classification.ContentDomain.IMAGE && contentDomain != Classification.ContentDomain.IMAGE) || + (this.query.contentdom == Classification.ContentDomain.AUDIO && contentDomain != Classification.ContentDomain.AUDIO) || + (this.query.contentdom == Classification.ContentDomain.VIDEO && contentDomain != Classification.ContentDomain.VIDEO) || + (this.query.contentdom == Classification.ContentDomain.APP && contentDomain != Classification.ContentDomain.APP)) && this.query.urlMask_isCatchall) { + if (log.isFine()) log.fine("dropped RWI: wrong contentdom = " + this.query.contentdom + ", domain = " + contentDomain); if (page.word().local()) this.local_rwi_available.decrementAndGet(); else this.remote_rwi_available.decrementAndGet(); continue; } @@ -1321,7 +1322,8 @@ public final class SearchEvent { } // load snippet - if (page.url().getContentDomain() == Classification.ContentDomain.TEXT || page.url().getContentDomain() == Classification.ContentDomain.ALL) { + ContentDomain contentDomain = page.getContentDomain(); + if (contentDomain == Classification.ContentDomain.TEXT || contentDomain == Classification.ContentDomain.ALL) { // attach text snippet long startTime = System.currentTimeMillis(); final TextSnippet snippet = new TextSnippet(