- apply directDocByURL to NOLOAD Queue

- make pushing to the NOLOAD queue the default for site crawls
pull/1/head
Michael Peter Christen 13 years ago
parent 5c66880be2
commit 19efbf1b0f

@@ -95,7 +95,7 @@
<dt><label>Dynamic URLs</label></dt> <dt><label>Dynamic URLs</label></dt>
<dd> <dd>
<input type="checkbox" name="crawlingQ" id="crawlingQ" #(crawlingQChecked)#::checked="checked"#(/crawlingQChecked)# /> allow <a href="http://en.wikipedia.org/wiki/Query_string">query-strings</a> (urls with a '?' in the path) <input type="checkbox" name="crawlingQ" id="crawlingQ" #(crawlingQChecked)#::checked="checked"#(/crawlingQChecked)# /> allow <a href="http://en.wikipedia.org/wiki/Query_string">query-strings</a> (urls with a '?' in the path)
<input type="hidden" name="directDocByURL" id="directDocByURL" value="on" />
<input type="hidden" name="storeHTCache" id="storeHTCache" value="on" /> <input type="hidden" name="storeHTCache" id="storeHTCache" value="on" />
<input type="hidden" name="cachePolicy" id="cachePolicy" value="iffresh" /> <input type="hidden" name="cachePolicy" id="cachePolicy" value="iffresh" />
<input type="hidden" name="indexText" id="indexText" value="on" /> <input type="hidden" name="indexText" id="indexText" value="on" />

@@ -184,7 +184,7 @@ public class Crawler_p {
env.setConfig("crawlingDepth", Integer.toString(newcrawlingdepth)); env.setConfig("crawlingDepth", Integer.toString(newcrawlingdepth));
if ((crawlOrder) && (newcrawlingdepth > 8)) newcrawlingdepth = 8; if ((crawlOrder) && (newcrawlingdepth > 8)) newcrawlingdepth = 8;
final boolean directDocByURL = "on".equals(post.get("directDocByURL", "off")); final boolean directDocByURL = "on".equals(post.get("directDocByURL", "on")); // catch also all linked media documents without loading them
env.setConfig("crawlingDirectDocByURL", directDocByURL); env.setConfig("crawlingDirectDocByURL", directDocByURL);
// recrawl // recrawl

@@ -2355,10 +2355,12 @@ public final class Switchboard extends serverSwitch
final Map<MultiProtocolURI, String> hl = Document.getHyperlinks(documents); final Map<MultiProtocolURI, String> hl = Document.getHyperlinks(documents);
// add all media links also to the crawl stack. They will be re-sorted to the NOLOAD queue and indexed afterwards as pure links // add all media links also to the crawl stack. They will be re-sorted to the NOLOAD queue and indexed afterwards as pure links
if (response.profile().directDocByURL()) {
hl.putAll(Document.getImagelinks(documents)); hl.putAll(Document.getImagelinks(documents));
hl.putAll(Document.getApplinks(documents)); hl.putAll(Document.getApplinks(documents));
hl.putAll(Document.getVideolinks(documents)); hl.putAll(Document.getVideolinks(documents));
hl.putAll(Document.getAudiolinks(documents)); hl.putAll(Document.getAudiolinks(documents));
}
// insert those hyperlinks to the crawler // insert those hyperlinks to the crawler
MultiProtocolURI nextUrl; MultiProtocolURI nextUrl;

Loading…
Cancel
Save