Load images only if their parser flag is activated

pull/1/head
Michael Peter Christen 12 years ago
parent b2c329929f
commit 234a974955

@ -55,6 +55,7 @@ import net.yacy.crawler.retrieval.HTTPLoader;
import net.yacy.crawler.retrieval.Request;
import net.yacy.crawler.retrieval.SMBLoader;
import net.yacy.crawler.robots.RobotsTxt;
import net.yacy.document.TextParser;
import net.yacy.kelondro.data.citation.CitationReference;
import net.yacy.kelondro.rwi.IndexCell;
import net.yacy.kelondro.workflow.WorkflowProcessor;
@ -347,17 +348,10 @@ public final class CrawlStacker {
// check availability of parser and maxfilesize
String warning = null;
boolean loadImages = Switchboard.getSwitchboard().getConfigBool(SwitchboardConstants.CRAWLER_LOAD_IMAGE, true);
if (!loadImages && Switchboard.getSwitchboard().getConfig(SwitchboardConstants.CRAWLER_LOAD_IMAGE, "").equals("true;")) {
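// workaround: the setting may have been stored as "true;" (stray trailing semicolon), which does not read as a boolean true; rewrite it as a proper boolean
// TODO: remove this workaround later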
Switchboard.getSwitchboard().setConfig(SwitchboardConstants.CRAWLER_LOAD_IMAGE, true);
loadImages = true;
}
ContentDomain contentDomain = entry.url().getContentDomainFromExt();
if ((maxFileSize >= 0 && entry.size() > maxFileSize) ||
contentDomain == ContentDomain.APP ||
(!loadImages && contentDomain == ContentDomain.IMAGE) ||
(contentDomain == ContentDomain.IMAGE && TextParser.supportsExtension(entry.url()) != null) ||
contentDomain == ContentDomain.AUDIO ||
contentDomain == ContentDomain.VIDEO ||
contentDomain == ContentDomain.CTRL) {

@ -2556,12 +2556,16 @@ public final class Switchboard extends serverSwitch {
) {
// get the hyperlinks
final Map<DigestURL, String> hl = Document.getHyperlinks(documents);
boolean loadImages = getConfigBool(SwitchboardConstants.CRAWLER_LOAD_IMAGE, true);
if (loadImages) hl.putAll(Document.getImagelinks(documents));
for (Map.Entry<DigestURL, String> entry: Document.getImagelinks(documents).entrySet()) {
if (TextParser.supportsExtension(entry.getKey()) == null) hl.put(entry.getKey(), entry.getValue());
}
// add all media links also to the crawl stack. They will be re-sorted to the NOLOAD queue and indexed afterwards as pure links
if (response.profile().directDocByURL()) {
if (!loadImages) hl.putAll(Document.getImagelinks(documents));
for (Map.Entry<DigestURL, String> entry: Document.getImagelinks(documents).entrySet()) {
if (TextParser.supportsExtension(entry.getKey()) != null) hl.put(entry.getKey(), entry.getValue());
}
hl.putAll(Document.getApplinks(documents));
hl.putAll(Document.getVideolinks(documents));
hl.putAll(Document.getAudiolinks(documents));

@ -323,7 +323,6 @@ public final class SwitchboardConstants {
* <p><code>public static final String <strong>CRAWLER_THREADS_ACTIVE_MAX</strong> = "crawler.MaxActiveThreads"</code></p>
* <p>Name of the setting for the maximum number of crawler threads that may be active at the same time</p>
*/
public static final String CRAWLER_LOAD_IMAGE = "crawler.load.image"; // read as a boolean (default true) by the crawler code to decide whether image links are loaded
public static final String CRAWLER_THREADS_ACTIVE_MAX = "crawler.MaxActiveThreads"; // maximum number of crawler threads that may be active at the same time
public static final String CRAWLER_FOLLOW_REDIRECTS = "crawler.http.FollowRedirects"; // ignore the target url and follow to the redirect
public static final String CRAWLER_RECORD_REDIRECTS = "crawler.http.RecordRedirects"; // record the ignored redirected page to the index store

Loading…
Cancel
Save