From 1b4fa2947d365a8ed35f60516f08e2eea68c0b3a Mon Sep 17 00:00:00 2001
From: Michael Peter Christen <mc@yacy.net>
Date: Wed, 23 Oct 2013 00:16:54 +0200
Subject: [PATCH] - fixed a problem which occurred when a document was not
 recognized with the right content domain (i.e. identifying that it is an
 image, text etc.) because it used the file extension and not an existing mime
 type assignment. - fixed the new setting that images shall be loaded for a
 better image search. - both fixes together now make it possible to crawl
 commons.wikimedia.org which makes use of 'funny' document names (e.g. ending
 with .jpg while the document is html)

---
 defaults/yacy.init                            |  2 +-
 htroot/CrawlStartExpert_p.java                |  3 +--
 .../document/analysis/Classification.java     | 21 +++++++++++++++++-
 .../cora/document/id/MultiProtocolURL.java    | 10 +++++++--
 source/net/yacy/crawler/CrawlStacker.java     | 20 ++++++++++++-----
 source/net/yacy/data/ymark/YMarkMetadata.java |  3 ++-
 source/net/yacy/document/Condenser.java       |  9 ++++----
 source/net/yacy/document/Document.java        | 14 +++++++++++-
 .../kelondro/data/meta/URIMetadataNode.java   | 22 +++++++++++++++++--
 source/net/yacy/search/Switchboard.java       |  2 +-
 .../net/yacy/search/SwitchboardConstants.java |  1 +
 source/net/yacy/search/query/SearchEvent.java | 14 +++++++-----
 12 files changed, 94 insertions(+), 27 deletions(-)

diff --git a/defaults/yacy.init b/defaults/yacy.init
index 7a51c6ced..4671e41b7 100644
--- a/defaults/yacy.init
+++ b/defaults/yacy.init
@@ -799,7 +799,7 @@ search.verify.delete = true
 # images may be treated either as documents that are shown in search results or as objects
 # that are only visible in special search environments, like image search
 search.excludeintext.image = true
-crawler.load.image = true;
+crawler.load.image = true
 
 # remote search details
 remotesearch.maxcount = 10
diff --git a/htroot/CrawlStartExpert_p.java b/htroot/CrawlStartExpert_p.java
index 32aaef62e..adb2de9f4 100644
--- a/htroot/CrawlStartExpert_p.java
+++ b/htroot/CrawlStartExpert_p.java
@@ -31,14 +31,13 @@ import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.crawler.data.CrawlProfile;
 import net.yacy.search.Switchboard;
-import net.yacy.search.SwitchboardConstants;
 import net.yacy.search.schema.CollectionSchema;
 import net.yacy.server.serverObjects;
 import net.yacy.server.serverSwitch;
 
 public class CrawlStartExpert_p {
 
-    public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, @SuppressWarnings("unused") final serverObjects post, final serverSwitch env) {
+    public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, final serverObjects post, final serverSwitch env) {
         // return variable that accumulates replacements
         final Switchboard sb = (Switchboard) env;
         final serverObjects prop = new serverObjects();
diff --git a/source/net/yacy/cora/document/analysis/Classification.java b/source/net/yacy/cora/document/analysis/Classification.java
index 470c1e98f..30fe7b8b3 100644
--- a/source/net/yacy/cora/document/analysis/Classification.java
+++ b/source/net/yacy/cora/document/analysis/Classification.java
@@ -149,7 +149,13 @@ public class Classification {
         return textExtSet.contains(ext) || mediaExtSet.contains(ext) || ctrlExtSet.contains(ext);
     }
 
-    public static ContentDomain getContentDomain(final String ext) {
+    /**
+     * Get the content domain of a document according to the file extension.
+     * This can produce wrong results because the extension is a weak hint for the content domain.
+     * If possible, use the mime type instead: call Classification.getContentDomainFromMime().
+     * @return the content domain which classifies the content type
+     */
+    public static ContentDomain getContentDomainFromExt(final String ext) {
         if (isTextExtension(ext)) return ContentDomain.TEXT;
         if (isImageExtension(ext)) return ContentDomain.IMAGE;
         if (isAudioExtension(ext)) return ContentDomain.AUDIO;
@@ -159,6 +165,19 @@ public class Classification {
         return ContentDomain.ALL;
     }
 
+    /**
+     * Get the content domain of a document according to the mime type.
+     * @return the content domain for the mime type, ContentDomain.ALL if it is null or not classifiable
+     */
+    public static ContentDomain getContentDomainFromMime(final String mime) {
+        if (mime == null) return ContentDomain.ALL; // no mime type known: same result as an unclassified one
+        if (mime.startsWith("text/")) return ContentDomain.TEXT;
+        if (mime.startsWith("image/")) return ContentDomain.IMAGE;
+        if (mime.startsWith("audio/")) return ContentDomain.AUDIO;
+        if (mime.startsWith("video/")) return ContentDomain.VIDEO;
+        return mime.startsWith("application/") ? ContentDomain.APP : ContentDomain.ALL;
+    }
+
     public static boolean isPictureMime(final String mimeType) {
         if (mimeType == null) return false;
         return mimeType.toUpperCase().startsWith("IMAGE");
diff --git a/source/net/yacy/cora/document/id/MultiProtocolURL.java b/source/net/yacy/cora/document/id/MultiProtocolURL.java
index 86400e504..de91810f7 100644
--- a/source/net/yacy/cora/document/id/MultiProtocolURL.java
+++ b/source/net/yacy/cora/document/id/MultiProtocolURL.java
@@ -269,9 +269,15 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
     public final boolean isFile()  { return this.protocol.equals("file"); }
     public final boolean isSMB()   { return this.protocol.equals("smb"); }
 
-    public final ContentDomain getContentDomain() {
+    /**
+     * Get the content domain of a document according to the extension.
+     * This can produce wrong results because the extension is a weak hint for the content domain.
+     * If possible, use the mime type instead: call Classification.getContentDomainFromMime().
+     * @return the content domain which classifies the content type
+     */
+    public final ContentDomain getContentDomainFromExt() {
         if (this.contentDomain == null) {
-            this.contentDomain = Classification.getContentDomain(getFileExtension(this.getFileName()));
+            this.contentDomain = Classification.getContentDomainFromExt(getFileExtension(this.getFileName()));
         }
         return this.contentDomain;
     }
diff --git a/source/net/yacy/crawler/CrawlStacker.java b/source/net/yacy/crawler/CrawlStacker.java
index d38af935d..c9a5a0a50 100644
--- a/source/net/yacy/crawler/CrawlStacker.java
+++ b/source/net/yacy/crawler/CrawlStacker.java
@@ -60,6 +60,7 @@ import net.yacy.peers.SeedDB;
 import net.yacy.repository.Blacklist.BlacklistType;
 import net.yacy.repository.FilterEngine;
 import net.yacy.search.Switchboard;
+import net.yacy.search.SwitchboardConstants;
 import net.yacy.search.index.Segment;
 import net.yacy.search.schema.CollectionConfiguration;
 
@@ -335,13 +336,20 @@ public final class CrawlStacker {
 
         // check availability of parser and maxfilesize
         String warning = null;
-        boolean loadImages = Switchboard.getSwitchboard().getConfigBool("crawler.load.image", true);
+        boolean loadImages = Switchboard.getSwitchboard().getConfigBool(SwitchboardConstants.CRAWLER_LOAD_IMAGE, true);
+        if (!loadImages && Switchboard.getSwitchboard().getConfig(SwitchboardConstants.CRAWLER_LOAD_IMAGE, "").equals("true;")) {
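+            // older yacy.init files stored this value as "true;" (stray semicolon), which is not read as true
+            // TODO: remove this one-time repair once existing configurations have been corrected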
+            Switchboard.getSwitchboard().setConfig(SwitchboardConstants.CRAWLER_LOAD_IMAGE, true);
+            loadImages = true;
+        }
+        ContentDomain contentDomain = entry.url().getContentDomainFromExt();
         if ((maxFileSize >= 0 && entry.size() > maxFileSize) ||
-            entry.url().getContentDomain() == ContentDomain.APP  ||
-            (!loadImages && entry.url().getContentDomain() == ContentDomain.IMAGE) ||
-            entry.url().getContentDomain() == ContentDomain.AUDIO  ||
-            entry.url().getContentDomain() == ContentDomain.VIDEO ||
-            entry.url().getContentDomain() == ContentDomain.CTRL) {
+            contentDomain == ContentDomain.APP  ||
+            (!loadImages && contentDomain == ContentDomain.IMAGE) ||
+            contentDomain == ContentDomain.AUDIO  ||
+            contentDomain == ContentDomain.VIDEO ||
+            contentDomain == ContentDomain.CTRL) {
             warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.NOLOAD, entry, profile, this.robots);
             //if (warning != null && this.log.isFine()) this.log.logFine("CrawlStacker.stackCrawl of URL " + entry.url().toNormalform(true, false) + " - not pushed: " + warning);
             return null;
diff --git a/source/net/yacy/data/ymark/YMarkMetadata.java b/source/net/yacy/data/ymark/YMarkMetadata.java
index 3bc88dd7b..955eb80e3 100644
--- a/source/net/yacy/data/ymark/YMarkMetadata.java
+++ b/source/net/yacy/data/ymark/YMarkMetadata.java
@@ -99,7 +99,8 @@ public class YMarkMetadata {
 		if(this.document == null) {
 			Response response = null;
 			response = loader.load(loader.request(this.uri, true, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, null, agent);
-			this.document = Document.mergeDocuments(response.url(), response.getMimeType(), response.parse());
+			Document[] docs = response.parse();
+			this.document = Document.mergeDocuments(response.url(), response.getMimeType(), docs);
 		}
 		return this.document;
 	}
diff --git a/source/net/yacy/document/Condenser.java b/source/net/yacy/document/Condenser.java
index 80591758a..21e2ab26e 100644
--- a/source/net/yacy/document/Condenser.java
+++ b/source/net/yacy/document/Condenser.java
@@ -102,10 +102,11 @@ public final class Condenser {
         this.RESULT_FLAGS = new Bitfield(4);
 
         // construct flag set for document
-        if (document.dc_source().getContentDomain() == ContentDomain.IMAGE || !document.getImages().isEmpty())     this.RESULT_FLAGS.set(flag_cat_hasimage, true);
-        if (document.dc_source().getContentDomain() == ContentDomain.AUDIO || !document.getAudiolinks().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasaudio, true);
-        if (document.dc_source().getContentDomain() == ContentDomain.VIDEO || !document.getVideolinks().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasvideo, true);
-        if (document.dc_source().getContentDomain() == ContentDomain.APP   || !document.getApplinks().isEmpty())   this.RESULT_FLAGS.set(flag_cat_hasapp,   true);
+        ContentDomain contentDomain = document.getContentDomain();
+        if (contentDomain == ContentDomain.IMAGE || !document.getImages().isEmpty())     this.RESULT_FLAGS.set(flag_cat_hasimage, true);
+        if (contentDomain == ContentDomain.AUDIO || !document.getAudiolinks().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasaudio, true);
+        if (contentDomain == ContentDomain.VIDEO || !document.getVideolinks().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasvideo, true);
+        if (contentDomain == ContentDomain.APP   || !document.getApplinks().isEmpty())   this.RESULT_FLAGS.set(flag_cat_hasapp,   true);
         if (document.lat() != 0.0 && document.lon() != 0.0) this.RESULT_FLAGS.set(flag_cat_haslocation, true);
 
         this.languageIdentificator = new Identificator();
diff --git a/source/net/yacy/document/Document.java b/source/net/yacy/document/Document.java
index af82bbb56..6ef0b6262 100644
--- a/source/net/yacy/document/Document.java
+++ b/source/net/yacy/document/Document.java
@@ -50,6 +50,7 @@ import java.util.TreeSet;
 
 import net.yacy.cora.date.ISO8601Formatter;
 import net.yacy.cora.document.analysis.Classification;
+import net.yacy.cora.document.analysis.Classification.ContentDomain;
 import net.yacy.cora.document.encoding.UTF8;
 import net.yacy.cora.document.id.AnchorURL;
 import net.yacy.cora.document.id.DigestURL;
@@ -145,6 +146,17 @@ public class Document {
         this.date = date == null ? new Date() : date;
     }
 
+    /**
+     * Classify the content of this document. The mime type is the primary source; only when it
+     * yields no specific domain (ContentDomain.ALL) is the file extension of dc_source() used.
+     * @return the content domain which classifies the content type
+     */
+    public ContentDomain getContentDomain() {
+        final ContentDomain fromMime = Classification.getContentDomainFromMime(this.mimeType);
+        // ALL means the mime type gave no specific classification, so use the weaker extension hint
+        return fromMime == ContentDomain.ALL ? this.dc_source().getContentDomainFromExt() : fromMime;
+    }
+    
     public Object getParserObject() {
         return this.parserObject;
     }
@@ -480,7 +492,7 @@ dc_rights
             this.applinks   = new LinkedHashMap<AnchorURL, String>();
             this.emaillinks = new LinkedHashMap<String, String>();
             final Map<AnchorURL, ImageEntry> collectedImages = new HashMap<AnchorURL, ImageEntry>(); // this is a set that is collected now and joined later to the imagelinks
-            for (final Map.Entry<AnchorURL, ImageEntry> entry: collectedImages.entrySet()) {
+            for (final Map.Entry<AnchorURL, ImageEntry> entry: this.images.entrySet()) {
                 if (entry.getKey().getHost().equals(thishost)) this.inboundlinks.put(entry.getKey(), "image"); else this.outboundlinks.put(entry.getKey(), "image");
             }
             for (final AnchorURL url: this.anchors) {
diff --git a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java
index 3e3c73b42..e24acd7ce 100644
--- a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java
+++ b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java
@@ -38,7 +38,6 @@ import net.yacy.cora.document.analysis.Classification.ContentDomain;
 import net.yacy.cora.document.encoding.ASCII;
 import net.yacy.cora.document.encoding.UTF8;
 import net.yacy.cora.document.id.DigestURL;
-import net.yacy.cora.document.id.MultiProtocolURL;
 import net.yacy.cora.federate.solr.SolrType;
 import net.yacy.cora.lod.vocabulary.Tagging;
 import net.yacy.cora.order.Base64Order;
@@ -92,6 +91,20 @@ public class URIMetadataNode {
         this.word = searchedWord;
         this.ranking = ranking;
     }
+
+    /**
+     * Classify the content of this entry. The stored mime type is preferred; the file extension
+     * of the url is used when there is no document, no mime type, or the mime type is unspecific.
+     * @return the content domain which classifies the content type
+     */
+    public ContentDomain getContentDomain() {
+        // the mime type is only looked up when a document is attached to this node
+        final String mimeType = this.doc == null ? null : mime();
+        final ContentDomain fromMime = mimeType == null ? ContentDomain.ALL : Classification.getContentDomainFromMime(mimeType);
+        // ALL means "not classified by the mime type": fall back to the weaker extension hint
+        if (fromMime != ContentDomain.ALL) return fromMime;
+        return this.url.getContentDomainFromExt();
+    }
     
     public SolrDocument getDocument() {
         return this.doc;
@@ -183,6 +196,11 @@ public class URIMetadataNode {
         return Response.docType(a.get(0));
     }
 
+    public String mime() { // first content_type value, or null if there is none
+        final ArrayList<String> types = getStringList(CollectionSchema.content_type);
+        return (types == null || types.isEmpty()) ? null : types.get(0);
+    }
+
     public byte[] language() {
         String language = getString(CollectionSchema.language_s);
         if (language == null || language.length() == 0) return ASCII.getBytes("en");
@@ -203,7 +221,7 @@ public class URIMetadataNode {
         if (flags == null) {
             this.flags = new Bitfield();
             if (dc_subject() != null && dc_subject().indexOf("indexof") >= 0) this.flags.set(Condenser.flag_cat_indexof, true);
-            ContentDomain cd = Classification.getContentDomain(MultiProtocolURL.getFileExtension(this.url().getFileName()));
+            ContentDomain cd = getContentDomain();
             if (lon() != 0.0d || lat() != 0.0d) this.flags.set(Condenser.flag_cat_haslocation, true);
             if (cd == ContentDomain.IMAGE || limage() > 0) this.flags.set(Condenser.flag_cat_hasimage, true);
             if (cd == ContentDomain.AUDIO || laudio() > 0) this.flags.set(Condenser.flag_cat_hasaudio, true);
diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java
index 9ad3c8804..e5a36a242 100644
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@@ -2553,7 +2553,7 @@ public final class Switchboard extends serverSwitch {
            ) {
             // get the hyperlinks
             final Map<DigestURL, String> hl = Document.getHyperlinks(documents);
-            boolean loadImages = getConfigBool("crawler.load.image", true);
+            boolean loadImages = getConfigBool(SwitchboardConstants.CRAWLER_LOAD_IMAGE, true);
             if (loadImages) hl.putAll(Document.getImagelinks(documents));
             
             // add all media links also to the crawl stack. They will be re-sorted to the NOLOAD queue and indexed afterwards as pure links
diff --git a/source/net/yacy/search/SwitchboardConstants.java b/source/net/yacy/search/SwitchboardConstants.java
index d6dfd24c8..a4ac06708 100644
--- a/source/net/yacy/search/SwitchboardConstants.java
+++ b/source/net/yacy/search/SwitchboardConstants.java
@@ -323,6 +323,7 @@ public final class SwitchboardConstants {
      * <p><code>public static final String <strong>CRAWLER_THREADS_ACTIVE_MAX</strong> = "crawler.MaxActiveThreads"</code></p>
      * <p>Name of the setting how many active crawler-threads may maximal be running on the same time</p>
      */
+    public static final String CRAWLER_LOAD_IMAGE               = "crawler.load.image";
     public static final String CRAWLER_THREADS_ACTIVE_MAX       = "crawler.MaxActiveThreads";
     public static final String CRAWLER_FOLLOW_REDIRECTS         = "crawler.http.FollowRedirects"; // ignore the target url and follow to the redirect
     public static final String CRAWLER_RECORD_REDIRECTS         = "crawler.http.RecordRedirects"; // record the ignored redirected page to the index store
diff --git a/source/net/yacy/search/query/SearchEvent.java b/source/net/yacy/search/query/SearchEvent.java
index bc1a5a584..73c794b55 100644
--- a/source/net/yacy/search/query/SearchEvent.java
+++ b/source/net/yacy/search/query/SearchEvent.java
@@ -1040,12 +1040,13 @@ public final class SearchEvent {
             }
 
             // check content domain
+            ContentDomain contentDomain = page.getContentDomain();
             if (this.query.contentdom.getCode() > 0 && (
-                (this.query.contentdom == Classification.ContentDomain.IMAGE && page.url().getContentDomain() != Classification.ContentDomain.IMAGE) ||
-                (this.query.contentdom == Classification.ContentDomain.AUDIO && page.url().getContentDomain() != Classification.ContentDomain.AUDIO) ||
-                (this.query.contentdom == Classification.ContentDomain.VIDEO && page.url().getContentDomain() != Classification.ContentDomain.VIDEO) ||
-                (this.query.contentdom == Classification.ContentDomain.APP && page.url().getContentDomain() != Classification.ContentDomain.APP)) && this.query.urlMask_isCatchall) {
-                if (log.isFine()) log.fine("dropped RWI: wrong contentdom = " + this.query.contentdom + ", domain = " + page.url().getContentDomain());
+                (this.query.contentdom == Classification.ContentDomain.IMAGE && contentDomain != Classification.ContentDomain.IMAGE) ||
+                (this.query.contentdom == Classification.ContentDomain.AUDIO && contentDomain != Classification.ContentDomain.AUDIO) ||
+                (this.query.contentdom == Classification.ContentDomain.VIDEO && contentDomain != Classification.ContentDomain.VIDEO) ||
+                (this.query.contentdom == Classification.ContentDomain.APP && contentDomain != Classification.ContentDomain.APP)) && this.query.urlMask_isCatchall) {
+                if (log.isFine()) log.fine("dropped RWI: wrong contentdom = " + this.query.contentdom + ", domain = " + contentDomain);
                 if (page.word().local()) this.local_rwi_available.decrementAndGet(); else this.remote_rwi_available.decrementAndGet();
                 continue;
             }
@@ -1321,7 +1322,8 @@ public final class SearchEvent {
         }
 
         // load snippet
-        if (page.url().getContentDomain() == Classification.ContentDomain.TEXT || page.url().getContentDomain() == Classification.ContentDomain.ALL) {
+        ContentDomain contentDomain = page.getContentDomain();
+        if (contentDomain == Classification.ContentDomain.TEXT || contentDomain == Classification.ContentDomain.ALL) {
             // attach text snippet
             long startTime = System.currentTimeMillis();
             final TextSnippet snippet = new TextSnippet(