- fixed a problem which occurred when a document was not recognized with

the right content domain (i.e. identifying that it is an image, text etc.) because the file extension was used instead of an existing mime type assignment. - fixed the new setting that images shall be loaded for a better image search. - both fixes together now make it possible to crawl commons.wikimedia.org, which makes use of 'funny' document names (e.g. ending with .jpg while the document is html)
12 years ago · 1b4fa2947d
parent 82621bead0
commit 1b4fa2947d
12 changed files with 94 additions and 27 deletions
--- a/defaults/yacy.init
+++ b/defaults/yacy.init
@ -799,7 +799,7 @@ search.verify.delete = true
 # images may be treated either as documents that are shown in search results or as objects
 # that are only visible in special search environments, like image search
 search.excludeintext.image = true
-crawler.load.image = true;
+crawler.load.image = true

 # remote search details
 remotesearch.maxcount = 10
--- a/htroot/CrawlStartExpert_p.java
+++ b/htroot/CrawlStartExpert_p.java
@ -31,14 +31,13 @@ import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.crawler.data.CrawlProfile;
 import net.yacy.search.Switchboard;
-import net.yacy.search.SwitchboardConstants;
 import net.yacy.search.schema.CollectionSchema;
 import net.yacy.server.serverObjects;
 import net.yacy.server.serverSwitch;

 public class CrawlStartExpert_p {

-    public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, @SuppressWarnings("unused") final serverObjects post, final serverSwitch env) {
+    public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, final serverObjects post, final serverSwitch env) {
        // return variable that accumulates replacements
        final Switchboard sb = (Switchboard) env;
        final serverObjects prop = new serverObjects();
--- a/source/net/yacy/cora/document/analysis/Classification.java
+++ b/source/net/yacy/cora/document/analysis/Classification.java
@ -149,7 +149,13 @@ public class Classification {
        return textExtSet.contains(ext) || mediaExtSet.contains(ext) || ctrlExtSet.contains(ext);
    }

-    public static ContentDomain getContentDomain(final String ext) {
+    /**
+     * Get the content domain of a document according to the file extension.
+     * This can produce wrong results because the extension is a weak hint for the content domain.
+     * If possible, use the mime type, call Classification.getContentDomainFromMime()
+     * @return the content domain which classifies the content type
+     */
+    public static ContentDomain getContentDomainFromExt(final String ext) {
        if (isTextExtension(ext)) return ContentDomain.TEXT;
        if (isImageExtension(ext)) return ContentDomain.IMAGE;
        if (isAudioExtension(ext)) return ContentDomain.AUDIO;
@ -159,6 +165,19 @@ public class Classification {
        return ContentDomain.ALL;
    }

+    /**
+     * Get the content domain of a document according to the mime type.
+     * @return the content domain for the mime type; ContentDomain.ALL if mime is null or has none of the known top-level types
+     */
+    public static ContentDomain getContentDomainFromMime(final String mime) {
+        if (mime == null) return ContentDomain.ALL; // unknown mime type: no NPE, callers can fall back to the file extension
+        if (mime.startsWith("text/")) return ContentDomain.TEXT;
+        if (mime.startsWith("image/")) return ContentDomain.IMAGE;
+        if (mime.startsWith("audio/")) return ContentDomain.AUDIO;
+        if (mime.startsWith("video/")) return ContentDomain.VIDEO;
+        return mime.startsWith("application/") ? ContentDomain.APP : ContentDomain.ALL;
+    }
+
    public static boolean isPictureMime(final String mimeType) {
        if (mimeType == null) return false;
        return mimeType.toUpperCase().startsWith("IMAGE");
--- a/source/net/yacy/cora/document/id/MultiProtocolURL.java
+++ b/source/net/yacy/cora/document/id/MultiProtocolURL.java
@ -269,9 +269,15 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
    public final boolean isFile()  { return this.protocol.equals("file"); }
    public final boolean isSMB()   { return this.protocol.equals("smb"); }

-    public final ContentDomain getContentDomain() {
+    /**
+     * Get the content domain of a document according to the extension.
+     * This can produce wrong results because the extension is a weak hint for the content domain.
+     * If possible, use the mime type, call Classification.getContentDomainFromMime()
+     * @return the content domain which classifies the content type
+     */
+    public final ContentDomain getContentDomainFromExt() {
        if (this.contentDomain == null) {
-            this.contentDomain = Classification.getContentDomain(getFileExtension(this.getFileName()));
+            this.contentDomain = Classification.getContentDomainFromExt(getFileExtension(this.getFileName()));
        }
        return this.contentDomain;
    }
--- a/source/net/yacy/crawler/CrawlStacker.java
+++ b/source/net/yacy/crawler/CrawlStacker.java
@ -60,6 +60,7 @@ import net.yacy.peers.SeedDB;
 import net.yacy.repository.Blacklist.BlacklistType;
 import net.yacy.repository.FilterEngine;
 import net.yacy.search.Switchboard;
+import net.yacy.search.SwitchboardConstants;
 import net.yacy.search.index.Segment;
 import net.yacy.search.schema.CollectionConfiguration;

@ -335,13 +336,20 @@ public final class CrawlStacker {

        // check availability of parser and maxfilesize
        String warning = null;
-        boolean loadImages = Switchboard.getSwitchboard().getConfigBool("crawler.load.image", true);
+        boolean loadImages = Switchboard.getSwitchboard().getConfigBool(SwitchboardConstants.CRAWLER_LOAD_IMAGE, true);
+        if (!loadImages && Switchboard.getSwitchboard().getConfig(SwitchboardConstants.CRAWLER_LOAD_IMAGE, "").equals("true;")) {
+            // dammit semicolon
+            // TODO: remove this shit later
+            Switchboard.getSwitchboard().setConfig(SwitchboardConstants.CRAWLER_LOAD_IMAGE, true);
+            loadImages = true;
+        }
+        ContentDomain contentDomain = entry.url().getContentDomainFromExt();
        if ((maxFileSize >= 0 && entry.size() > maxFileSize) ||
-            entry.url().getContentDomain() == ContentDomain.APP  ||
-            (!loadImages && entry.url().getContentDomain() == ContentDomain.IMAGE) ||
-            entry.url().getContentDomain() == ContentDomain.AUDIO  ||
-            entry.url().getContentDomain() == ContentDomain.VIDEO ||
-            entry.url().getContentDomain() == ContentDomain.CTRL) {
+            contentDomain == ContentDomain.APP  ||
+            (!loadImages && contentDomain == ContentDomain.IMAGE) ||
+            contentDomain == ContentDomain.AUDIO  ||
+            contentDomain == ContentDomain.VIDEO ||
+            contentDomain == ContentDomain.CTRL) {
            warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.NOLOAD, entry, profile, this.robots);
            //if (warning != null && this.log.isFine()) this.log.logFine("CrawlStacker.stackCrawl of URL " + entry.url().toNormalform(true, false) + " - not pushed: " + warning);
            return null;
--- a/source/net/yacy/data/ymark/YMarkMetadata.java
+++ b/source/net/yacy/data/ymark/YMarkMetadata.java
@ -99,7 +99,8 @@ public class YMarkMetadata {
 		if(this.document == null) {
 			Response response = null;
 			response = loader.load(loader.request(this.uri, true, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, null, agent);
-			this.document = Document.mergeDocuments(response.url(), response.getMimeType(), response.parse());
+			Document[] docs = response.parse();
+			this.document = Document.mergeDocuments(response.url(), response.getMimeType(), docs);
 		}
 		return this.document;
 	}
--- a/source/net/yacy/document/Condenser.java
+++ b/source/net/yacy/document/Condenser.java
@ -102,10 +102,11 @@ public final class Condenser {
        this.RESULT_FLAGS = new Bitfield(4);

        // construct flag set for document
-        if (document.dc_source().getContentDomain() == ContentDomain.IMAGE || !document.getImages().isEmpty())     this.RESULT_FLAGS.set(flag_cat_hasimage, true);
-        if (document.dc_source().getContentDomain() == ContentDomain.AUDIO || !document.getAudiolinks().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasaudio, true);
-        if (document.dc_source().getContentDomain() == ContentDomain.VIDEO || !document.getVideolinks().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasvideo, true);
-        if (document.dc_source().getContentDomain() == ContentDomain.APP   || !document.getApplinks().isEmpty())   this.RESULT_FLAGS.set(flag_cat_hasapp,   true);
+        ContentDomain contentDomain = document.getContentDomain();
+        if (contentDomain == ContentDomain.IMAGE || !document.getImages().isEmpty())     this.RESULT_FLAGS.set(flag_cat_hasimage, true);
+        if (contentDomain == ContentDomain.AUDIO || !document.getAudiolinks().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasaudio, true);
+        if (contentDomain == ContentDomain.VIDEO || !document.getVideolinks().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasvideo, true);
+        if (contentDomain == ContentDomain.APP   || !document.getApplinks().isEmpty())   this.RESULT_FLAGS.set(flag_cat_hasapp,   true);
        if (document.lat() != 0.0 && document.lon() != 0.0) this.RESULT_FLAGS.set(flag_cat_haslocation, true);

        this.languageIdentificator = new Identificator();
--- a/source/net/yacy/document/Document.java
+++ b/source/net/yacy/document/Document.java
@ -50,6 +50,7 @@ import java.util.TreeSet;

 import net.yacy.cora.date.ISO8601Formatter;
 import net.yacy.cora.document.analysis.Classification;
+import net.yacy.cora.document.analysis.Classification.ContentDomain;
 import net.yacy.cora.document.encoding.UTF8;
 import net.yacy.cora.document.id.AnchorURL;
 import net.yacy.cora.document.id.DigestURL;
@ -145,6 +146,17 @@ public class Document {
        this.date = date == null ? new Date() : date;
    }

+    /**
+     * Classify this document by content domain. The mime type is consulted first; only when
+     * it yields no specific domain (ALL) the file extension of dc_source() is used instead.
+     * @return the content domain which classifies the content type
+     */
+    public ContentDomain getContentDomain() {
+        final ContentDomain fromMime = Classification.getContentDomainFromMime(this.mimeType);
+        if (fromMime == ContentDomain.ALL) return this.dc_source().getContentDomainFromExt();
+        return fromMime;
+    }
+    
    public Object getParserObject() {
        return this.parserObject;
    }
@ -480,7 +492,7 @@ dc_rights
            this.applinks   = new LinkedHashMap<AnchorURL, String>();
            this.emaillinks = new LinkedHashMap<String, String>();
            final Map<AnchorURL, ImageEntry> collectedImages = new HashMap<AnchorURL, ImageEntry>(); // this is a set that is collected now and joined later to the imagelinks
-            for (final Map.Entry<AnchorURL, ImageEntry> entry: collectedImages.entrySet()) {
+            for (final Map.Entry<AnchorURL, ImageEntry> entry: this.images.entrySet()) {
                if (entry.getKey().getHost().equals(thishost)) this.inboundlinks.put(entry.getKey(), "image"); else this.outboundlinks.put(entry.getKey(), "image");
            }
            for (final AnchorURL url: this.anchors) {
--- a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java
+++ b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java
@ -38,7 +38,6 @@ import net.yacy.cora.document.analysis.Classification.ContentDomain;
 import net.yacy.cora.document.encoding.ASCII;
 import net.yacy.cora.document.encoding.UTF8;
 import net.yacy.cora.document.id.DigestURL;
-import net.yacy.cora.document.id.MultiProtocolURL;
 import net.yacy.cora.federate.solr.SolrType;
 import net.yacy.cora.lod.vocabulary.Tagging;
 import net.yacy.cora.order.Base64Order;
@ -92,6 +91,20 @@ public class URIMetadataNode {
        this.word = searchedWord;
        this.ranking = ranking;
    }
+
+    /**
+     * Classify this entry by content domain. The stored mime type decides if a document is
+     * attached and the mime type maps to a specific domain; otherwise the url extension is used.
+     * @return the content domain which classifies the content type
+     */
+    public ContentDomain getContentDomain() {
+        final String contentType = this.doc == null ? null : mime();
+        if (contentType != null) {
+            final ContentDomain fromMime = Classification.getContentDomainFromMime(contentType);
+            if (fromMime != ContentDomain.ALL) return fromMime;
+        }
+        return this.url.getContentDomainFromExt();
+    }
    
    public SolrDocument getDocument() {
        return this.doc;
@ -183,6 +196,11 @@ public class URIMetadataNode {
        return Response.docType(a.get(0));
    }

+    public String mime() { // first value of the content_type field, or null if the field is missing or empty
+        final ArrayList<String> contentTypes = getStringList(CollectionSchema.content_type);
+        return (contentTypes != null && !contentTypes.isEmpty()) ? contentTypes.get(0) : null;
+    }
+
    public byte[] language() {
        String language = getString(CollectionSchema.language_s);
        if (language == null || language.length() == 0) return ASCII.getBytes("en");
@ -203,7 +221,7 @@ public class URIMetadataNode {
        if (flags == null) {
            this.flags = new Bitfield();
            if (dc_subject() != null && dc_subject().indexOf("indexof") >= 0) this.flags.set(Condenser.flag_cat_indexof, true);
-            ContentDomain cd = Classification.getContentDomain(MultiProtocolURL.getFileExtension(this.url().getFileName()));
+            ContentDomain cd = getContentDomain();
            if (lon() != 0.0d || lat() != 0.0d) this.flags.set(Condenser.flag_cat_haslocation, true);
            if (cd == ContentDomain.IMAGE || limage() > 0) this.flags.set(Condenser.flag_cat_hasimage, true);
            if (cd == ContentDomain.AUDIO || laudio() > 0) this.flags.set(Condenser.flag_cat_hasaudio, true);
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@ -2553,7 +2553,7 @@ public final class Switchboard extends serverSwitch {
           ) {
            // get the hyperlinks
            final Map<DigestURL, String> hl = Document.getHyperlinks(documents);
-            boolean loadImages = getConfigBool("crawler.load.image", true);
+            boolean loadImages = getConfigBool(SwitchboardConstants.CRAWLER_LOAD_IMAGE, true);
            if (loadImages) hl.putAll(Document.getImagelinks(documents));
            
            // add all media links also to the crawl stack. They will be re-sorted to the NOLOAD queue and indexed afterwards as pure links
--- a/source/net/yacy/search/SwitchboardConstants.java
+++ b/source/net/yacy/search/SwitchboardConstants.java
@ -323,6 +323,7 @@ public final class SwitchboardConstants {
     * <p><code>public static final String <strong>CRAWLER_THREADS_ACTIVE_MAX</strong> = "crawler.MaxActiveThreads"</code></p>
     * <p>Name of the setting how many active crawler-threads may maximal be running on the same time</p>
     */
+    public static final String CRAWLER_LOAD_IMAGE               = "crawler.load.image";
    public static final String CRAWLER_THREADS_ACTIVE_MAX       = "crawler.MaxActiveThreads";
    public static final String CRAWLER_FOLLOW_REDIRECTS         = "crawler.http.FollowRedirects"; // ignore the target url and follow to the redirect
    public static final String CRAWLER_RECORD_REDIRECTS         = "crawler.http.RecordRedirects"; // record the ignored redirected page to the index store
--- a/source/net/yacy/search/query/SearchEvent.java
+++ b/source/net/yacy/search/query/SearchEvent.java
@ -1040,12 +1040,13 @@ public final class SearchEvent {
            }

            // check content domain
+            ContentDomain contentDomain = page.getContentDomain();
            if (this.query.contentdom.getCode() > 0 && (
-                (this.query.contentdom == Classification.ContentDomain.IMAGE && page.url().getContentDomain() != Classification.ContentDomain.IMAGE) ||
-                (this.query.contentdom == Classification.ContentDomain.AUDIO && page.url().getContentDomain() != Classification.ContentDomain.AUDIO) ||
-                (this.query.contentdom == Classification.ContentDomain.VIDEO && page.url().getContentDomain() != Classification.ContentDomain.VIDEO) ||
-                (this.query.contentdom == Classification.ContentDomain.APP && page.url().getContentDomain() != Classification.ContentDomain.APP)) && this.query.urlMask_isCatchall) {
-                if (log.isFine()) log.fine("dropped RWI: wrong contentdom = " + this.query.contentdom + ", domain = " + page.url().getContentDomain());
+                (this.query.contentdom == Classification.ContentDomain.IMAGE && contentDomain != Classification.ContentDomain.IMAGE) ||
+                (this.query.contentdom == Classification.ContentDomain.AUDIO && contentDomain != Classification.ContentDomain.AUDIO) ||
+                (this.query.contentdom == Classification.ContentDomain.VIDEO && contentDomain != Classification.ContentDomain.VIDEO) ||
+                (this.query.contentdom == Classification.ContentDomain.APP && contentDomain != Classification.ContentDomain.APP)) && this.query.urlMask_isCatchall) {
+                if (log.isFine()) log.fine("dropped RWI: wrong contentdom = " + this.query.contentdom + ", domain = " + contentDomain);
                if (page.word().local()) this.local_rwi_available.decrementAndGet(); else this.remote_rwi_available.decrementAndGet();
                continue;
            }
@ -1321,7 +1322,8 @@ public final class SearchEvent {
        }

        // load snippet
-        if (page.url().getContentDomain() == Classification.ContentDomain.TEXT || page.url().getContentDomain() == Classification.ContentDomain.ALL) {
+        ContentDomain contentDomain = page.getContentDomain();
+        if (contentDomain == Classification.ContentDomain.TEXT || contentDomain == Classification.ContentDomain.ALL) {
            // attach text snippet
            long startTime = System.currentTimeMillis();
            final TextSnippet snippet = new TextSnippet(