From 1b4fa2947d365a8ed35f60516f08e2eea68c0b3a Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Wed, 23 Oct 2013 00:16:54 +0200 Subject: [PATCH] - fixed a problem which occurred when a document was not recognized with the right content domain (i.e. identifying that it is an image, text etc.) because it used the file extension and not an existing mime type assignment. - fixed the new setting that images shall be loaded for a better image search. - both fixes together now make it possible to crawl commons.wikimedia.org which makes use of 'funny' document names (i.e. ending with .jpg while the document is html) --- defaults/yacy.init | 2 +- htroot/CrawlStartExpert_p.java | 3 +-- .../document/analysis/Classification.java | 21 +++++++++++++++++- .../cora/document/id/MultiProtocolURL.java | 10 +++++++-- source/net/yacy/crawler/CrawlStacker.java | 20 ++++++++++++----- source/net/yacy/data/ymark/YMarkMetadata.java | 3 ++- source/net/yacy/document/Condenser.java | 9 ++++---- source/net/yacy/document/Document.java | 14 +++++++++++- .../kelondro/data/meta/URIMetadataNode.java | 22 +++++++++++++++++-- source/net/yacy/search/Switchboard.java | 2 +- .../net/yacy/search/SwitchboardConstants.java | 1 + source/net/yacy/search/query/SearchEvent.java | 14 +++++++----- 12 files changed, 94 insertions(+), 27 deletions(-) diff --git a/defaults/yacy.init b/defaults/yacy.init index 7a51c6ced..4671e41b7 100644 --- a/defaults/yacy.init +++ b/defaults/yacy.init @@ -799,7 +799,7 @@ search.verify.delete = true # images may be treated either as documents that are shown in search results or as objects # that are only visible in special search environments, like image search search.excludeintext.image = true -crawler.load.image = true; +crawler.load.image = true # remote search details remotesearch.maxcount = 10 diff --git a/htroot/CrawlStartExpert_p.java b/htroot/CrawlStartExpert_p.java index 32aaef62e..adb2de9f4 100644 --- a/htroot/CrawlStartExpert_p.java +++ 
b/htroot/CrawlStartExpert_p.java @@ -31,14 +31,13 @@ import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.RequestHeader; import net.yacy.crawler.data.CrawlProfile; import net.yacy.search.Switchboard; -import net.yacy.search.SwitchboardConstants; import net.yacy.search.schema.CollectionSchema; import net.yacy.server.serverObjects; import net.yacy.server.serverSwitch; public class CrawlStartExpert_p { - public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, @SuppressWarnings("unused") final serverObjects post, final serverSwitch env) { + public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, final serverObjects post, final serverSwitch env) { // return variable that accumulates replacements final Switchboard sb = (Switchboard) env; final serverObjects prop = new serverObjects(); diff --git a/source/net/yacy/cora/document/analysis/Classification.java b/source/net/yacy/cora/document/analysis/Classification.java index 470c1e98f..30fe7b8b3 100644 --- a/source/net/yacy/cora/document/analysis/Classification.java +++ b/source/net/yacy/cora/document/analysis/Classification.java @@ -149,7 +149,13 @@ public class Classification { return textExtSet.contains(ext) || mediaExtSet.contains(ext) || ctrlExtSet.contains(ext); } - public static ContentDomain getContentDomain(final String ext) { + /** + * Get the content domain of a document according to the file extension. + * This can produce wrong results because the extension is a weak hint for the content domain. 
+ * If possible, use the mime type, call Classification.getContentDomainFromMime() + * @return the content domain which classifies the content type + */ + public static ContentDomain getContentDomainFromExt(final String ext) { if (isTextExtension(ext)) return ContentDomain.TEXT; if (isImageExtension(ext)) return ContentDomain.IMAGE; if (isAudioExtension(ext)) return ContentDomain.AUDIO; @@ -159,6 +165,19 @@ public class Classification { return ContentDomain.ALL; } + /** + * Get the content domain of a document according to the mime type. + * @return the content domain which classifies the content type + */ + public static ContentDomain getContentDomainFromMime(final String mime) { + if (mime.startsWith("text/")) return ContentDomain.TEXT; + if (mime.startsWith("image/")) return ContentDomain.IMAGE; + if (mime.startsWith("audio/")) return ContentDomain.AUDIO; + if (mime.startsWith("video/")) return ContentDomain.VIDEO; + if (mime.startsWith("application/")) return ContentDomain.APP; + return ContentDomain.ALL; + } + public static boolean isPictureMime(final String mimeType) { if (mimeType == null) return false; return mimeType.toUpperCase().startsWith("IMAGE"); diff --git a/source/net/yacy/cora/document/id/MultiProtocolURL.java b/source/net/yacy/cora/document/id/MultiProtocolURL.java index 86400e504..de91810f7 100644 --- a/source/net/yacy/cora/document/id/MultiProtocolURL.java +++ b/source/net/yacy/cora/document/id/MultiProtocolURL.java @@ -269,9 +269,15 @@ public class MultiProtocolURL implements Serializable, Comparable= 0 && entry.size() > maxFileSize) || - entry.url().getContentDomain() == ContentDomain.APP || - (!loadImages && entry.url().getContentDomain() == ContentDomain.IMAGE) || - entry.url().getContentDomain() == ContentDomain.AUDIO || - entry.url().getContentDomain() == ContentDomain.VIDEO || - entry.url().getContentDomain() == ContentDomain.CTRL) { + contentDomain == ContentDomain.APP || + (!loadImages && contentDomain == ContentDomain.IMAGE) || + 
contentDomain == ContentDomain.AUDIO || + contentDomain == ContentDomain.VIDEO || + contentDomain == ContentDomain.CTRL) { warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.NOLOAD, entry, profile, this.robots); //if (warning != null && this.log.isFine()) this.log.logFine("CrawlStacker.stackCrawl of URL " + entry.url().toNormalform(true, false) + " - not pushed: " + warning); return null; diff --git a/source/net/yacy/data/ymark/YMarkMetadata.java b/source/net/yacy/data/ymark/YMarkMetadata.java index 3bc88dd7b..955eb80e3 100644 --- a/source/net/yacy/data/ymark/YMarkMetadata.java +++ b/source/net/yacy/data/ymark/YMarkMetadata.java @@ -99,7 +99,8 @@ public class YMarkMetadata { if(this.document == null) { Response response = null; response = loader.load(loader.request(this.uri, true, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, null, agent); - this.document = Document.mergeDocuments(response.url(), response.getMimeType(), response.parse()); + Document[] docs = response.parse(); + this.document = Document.mergeDocuments(response.url(), response.getMimeType(), docs); } return this.document; } diff --git a/source/net/yacy/document/Condenser.java b/source/net/yacy/document/Condenser.java index 80591758a..21e2ab26e 100644 --- a/source/net/yacy/document/Condenser.java +++ b/source/net/yacy/document/Condenser.java @@ -102,10 +102,11 @@ public final class Condenser { this.RESULT_FLAGS = new Bitfield(4); // construct flag set for document - if (document.dc_source().getContentDomain() == ContentDomain.IMAGE || !document.getImages().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasimage, true); - if (document.dc_source().getContentDomain() == ContentDomain.AUDIO || !document.getAudiolinks().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasaudio, true); - if (document.dc_source().getContentDomain() == ContentDomain.VIDEO || !document.getVideolinks().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasvideo, true); - if (document.dc_source().getContentDomain() == ContentDomain.APP 
|| !document.getApplinks().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasapp, true); + ContentDomain contentDomain = document.getContentDomain(); + if (contentDomain == ContentDomain.IMAGE || !document.getImages().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasimage, true); + if (contentDomain == ContentDomain.AUDIO || !document.getAudiolinks().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasaudio, true); + if (contentDomain == ContentDomain.VIDEO || !document.getVideolinks().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasvideo, true); + if (contentDomain == ContentDomain.APP || !document.getApplinks().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasapp, true); if (document.lat() != 0.0 && document.lon() != 0.0) this.RESULT_FLAGS.set(flag_cat_haslocation, true); this.languageIdentificator = new Identificator(); diff --git a/source/net/yacy/document/Document.java b/source/net/yacy/document/Document.java index af82bbb56..6ef0b6262 100644 --- a/source/net/yacy/document/Document.java +++ b/source/net/yacy/document/Document.java @@ -50,6 +50,7 @@ import java.util.TreeSet; import net.yacy.cora.date.ISO8601Formatter; import net.yacy.cora.document.analysis.Classification; +import net.yacy.cora.document.analysis.Classification.ContentDomain; import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.document.id.AnchorURL; import net.yacy.cora.document.id.DigestURL; @@ -145,6 +146,17 @@ public class Document { this.date = date == null ? new Date() : date; } + /** + * Get the content domain of a document. This tries to get the content domain from the mime type + * and if this fails it uses alternatively the content domain from the file extension. 
+ * @return the content domain which classifies the content type + */ + public ContentDomain getContentDomain() { + ContentDomain contentDomain = Classification.getContentDomainFromMime(this.mimeType); + if (contentDomain != ContentDomain.ALL) return contentDomain; + return this.dc_source().getContentDomainFromExt(); + } + public Object getParserObject() { return this.parserObject; } @@ -480,7 +492,7 @@ dc_rights this.applinks = new LinkedHashMap(); this.emaillinks = new LinkedHashMap(); final Map collectedImages = new HashMap(); // this is a set that is collected now and joined later to the imagelinks - for (final Map.Entry entry: collectedImages.entrySet()) { + for (final Map.Entry entry: this.images.entrySet()) { if (entry.getKey().getHost().equals(thishost)) this.inboundlinks.put(entry.getKey(), "image"); else this.outboundlinks.put(entry.getKey(), "image"); } for (final AnchorURL url: this.anchors) { diff --git a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java index 3e3c73b42..e24acd7ce 100644 --- a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java +++ b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java @@ -38,7 +38,6 @@ import net.yacy.cora.document.analysis.Classification.ContentDomain; import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.document.id.DigestURL; -import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.federate.solr.SolrType; import net.yacy.cora.lod.vocabulary.Tagging; import net.yacy.cora.order.Base64Order; @@ -92,6 +91,20 @@ public class URIMetadataNode { this.word = searchedWord; this.ranking = ranking; } + + /** + * Get the content domain of a document. This tries to get the content domain from the mime type + * and if this fails it uses alternatively the content domain from the file extension. 
+ * @return the content domain which classifies the content type + */ + public ContentDomain getContentDomain() { + if (this.doc == null) return this.url.getContentDomainFromExt(); + String mime = mime(); + if (mime == null) return this.url.getContentDomainFromExt(); + ContentDomain contentDomain = Classification.getContentDomainFromMime(mime); + if (contentDomain != ContentDomain.ALL) return contentDomain; + return this.url.getContentDomainFromExt(); + } public SolrDocument getDocument() { return this.doc; @@ -183,6 +196,11 @@ public class URIMetadataNode { return Response.docType(a.get(0)); } + public String mime() { + ArrayList mime = getStringList(CollectionSchema.content_type); + return mime == null || mime.size() == 0 ? null : mime.get(0); + } + public byte[] language() { String language = getString(CollectionSchema.language_s); if (language == null || language.length() == 0) return ASCII.getBytes("en"); @@ -203,7 +221,7 @@ public class URIMetadataNode { if (flags == null) { this.flags = new Bitfield(); if (dc_subject() != null && dc_subject().indexOf("indexof") >= 0) this.flags.set(Condenser.flag_cat_indexof, true); - ContentDomain cd = Classification.getContentDomain(MultiProtocolURL.getFileExtension(this.url().getFileName())); + ContentDomain cd = getContentDomain(); if (lon() != 0.0d || lat() != 0.0d) this.flags.set(Condenser.flag_cat_haslocation, true); if (cd == ContentDomain.IMAGE || limage() > 0) this.flags.set(Condenser.flag_cat_hasimage, true); if (cd == ContentDomain.AUDIO || laudio() > 0) this.flags.set(Condenser.flag_cat_hasaudio, true); diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 9ad3c8804..e5a36a242 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -2553,7 +2553,7 @@ public final class Switchboard extends serverSwitch { ) { // get the hyperlinks final Map hl = Document.getHyperlinks(documents); - boolean loadImages = 
getConfigBool("crawler.load.image", true); + boolean loadImages = getConfigBool(SwitchboardConstants.CRAWLER_LOAD_IMAGE, true); if (loadImages) hl.putAll(Document.getImagelinks(documents)); // add all media links also to the crawl stack. They will be re-sorted to the NOLOAD queue and indexed afterwards as pure links diff --git a/source/net/yacy/search/SwitchboardConstants.java b/source/net/yacy/search/SwitchboardConstants.java index d6dfd24c8..a4ac06708 100644 --- a/source/net/yacy/search/SwitchboardConstants.java +++ b/source/net/yacy/search/SwitchboardConstants.java @@ -323,6 +323,7 @@ public final class SwitchboardConstants { *

public static final String CRAWLER_THREADS_ACTIVE_MAX = "crawler.MaxActiveThreads"

*

Name of the setting for the maximum number of crawler threads that may be active at the same time

*/ + public static final String CRAWLER_LOAD_IMAGE = "crawler.load.image"; public static final String CRAWLER_THREADS_ACTIVE_MAX = "crawler.MaxActiveThreads"; public static final String CRAWLER_FOLLOW_REDIRECTS = "crawler.http.FollowRedirects"; // ignore the target url and follow to the redirect public static final String CRAWLER_RECORD_REDIRECTS = "crawler.http.RecordRedirects"; // record the ignored redirected page to the index store diff --git a/source/net/yacy/search/query/SearchEvent.java b/source/net/yacy/search/query/SearchEvent.java index bc1a5a584..73c794b55 100644 --- a/source/net/yacy/search/query/SearchEvent.java +++ b/source/net/yacy/search/query/SearchEvent.java @@ -1040,12 +1040,13 @@ public final class SearchEvent { } // check content domain + ContentDomain contentDomain = page.getContentDomain(); if (this.query.contentdom.getCode() > 0 && ( - (this.query.contentdom == Classification.ContentDomain.IMAGE && page.url().getContentDomain() != Classification.ContentDomain.IMAGE) || - (this.query.contentdom == Classification.ContentDomain.AUDIO && page.url().getContentDomain() != Classification.ContentDomain.AUDIO) || - (this.query.contentdom == Classification.ContentDomain.VIDEO && page.url().getContentDomain() != Classification.ContentDomain.VIDEO) || - (this.query.contentdom == Classification.ContentDomain.APP && page.url().getContentDomain() != Classification.ContentDomain.APP)) && this.query.urlMask_isCatchall) { - if (log.isFine()) log.fine("dropped RWI: wrong contentdom = " + this.query.contentdom + ", domain = " + page.url().getContentDomain()); + (this.query.contentdom == Classification.ContentDomain.IMAGE && contentDomain != Classification.ContentDomain.IMAGE) || + (this.query.contentdom == Classification.ContentDomain.AUDIO && contentDomain != Classification.ContentDomain.AUDIO) || + (this.query.contentdom == Classification.ContentDomain.VIDEO && contentDomain != Classification.ContentDomain.VIDEO) || + (this.query.contentdom == 
Classification.ContentDomain.APP && contentDomain != Classification.ContentDomain.APP)) && this.query.urlMask_isCatchall) { + if (log.isFine()) log.fine("dropped RWI: wrong contentdom = " + this.query.contentdom + ", domain = " + contentDomain); if (page.word().local()) this.local_rwi_available.decrementAndGet(); else this.remote_rwi_available.decrementAndGet(); continue; } @@ -1321,7 +1322,8 @@ public final class SearchEvent { } // load snippet - if (page.url().getContentDomain() == Classification.ContentDomain.TEXT || page.url().getContentDomain() == Classification.ContentDomain.ALL) { + ContentDomain contentDomain = page.getContentDomain(); + if (contentDomain == Classification.ContentDomain.TEXT || contentDomain == Classification.ContentDomain.ALL) { // attach text snippet long startTime = System.currentTimeMillis(); final TextSnippet snippet = new TextSnippet(