Added an option to put image links into the crawl queue and handle them

like normal documents. With this option enabled (it is on by default at the
moment, but this might change soon), EXIF data can be written into the
search index and used in image search.
pull/1/head
Michael Peter Christen 12 years ago
parent e8e558a9b7
commit 69f85265e1

@ -796,6 +796,11 @@ search.excludehosth=
# the cases of nocache, iffresh and ifexist causes an index deletion # the cases of nocache, iffresh and ifexist causes an index deletion
search.verify.delete = true search.verify.delete = true
# images may be treated either as documents that are shown in search results or as objects
# that are only visible in special search environments, like image search
search.excludeintext.image = true
crawler.load.image = true
# remote search details # remote search details
remotesearch.maxcount = 10 remotesearch.maxcount = 10
remotesearch.maxtime = 3000 remotesearch.maxtime = 3000

@ -336,9 +336,10 @@ public final class CrawlStacker {
// check availability of parser and maxfilesize // check availability of parser and maxfilesize
String warning = null; String warning = null;
boolean loadImages = Switchboard.getSwitchboard().getConfigBool("crawler.load.image", true);
if ((maxFileSize >= 0 && entry.size() > maxFileSize) || if ((maxFileSize >= 0 && entry.size() > maxFileSize) ||
entry.url().getContentDomain() == ContentDomain.APP || entry.url().getContentDomain() == ContentDomain.APP ||
entry.url().getContentDomain() == ContentDomain.IMAGE || (!loadImages && entry.url().getContentDomain() == ContentDomain.IMAGE) ||
entry.url().getContentDomain() == ContentDomain.AUDIO || entry.url().getContentDomain() == ContentDomain.AUDIO ||
entry.url().getContentDomain() == ContentDomain.VIDEO || entry.url().getContentDomain() == ContentDomain.VIDEO ||
entry.url().getContentDomain() == ContentDomain.CTRL) { entry.url().getContentDomain() == ContentDomain.CTRL) {

@ -2512,10 +2512,12 @@ public final class Switchboard extends serverSwitch {
) { ) {
// get the hyperlinks // get the hyperlinks
final Map<DigestURI, String> hl = Document.getHyperlinks(documents); final Map<DigestURI, String> hl = Document.getHyperlinks(documents);
boolean loadImages = getConfigBool("crawler.load.image", true);
if (loadImages) hl.putAll(Document.getImagelinks(documents));
// add all media links also to the crawl stack. They will be re-sorted to the NOLOAD queue and indexed afterwards as pure links // add all media links also to the crawl stack. They will be re-sorted to the NOLOAD queue and indexed afterwards as pure links
if (response.profile().directDocByURL()) { if (response.profile().directDocByURL()) {
hl.putAll(Document.getImagelinks(documents)); if (!loadImages) hl.putAll(Document.getImagelinks(documents));
hl.putAll(Document.getApplinks(documents)); hl.putAll(Document.getApplinks(documents));
hl.putAll(Document.getVideolinks(documents)); hl.putAll(Document.getVideolinks(documents));
hl.putAll(Document.getAudiolinks(documents)); hl.putAll(Document.getAudiolinks(documents));

Loading…
Cancel
Save