fixed a number of small bugs:

- better crawl start for file paths and smb paths
- added a time-out wrapper for dns resolving and reverse resolving to prevent blocking (a sketch of the pattern follows this list)
- fixed intranet scanner result list checkboxes
- prevented htcache usage for file and smb crawling (not necessary, documents are available locally)
- fixed rss feed loader
- fixed sitemap loader, which had not been restricted to single files (crawl depth must be zero)
- crawl result lists are now cleared when a network switch is done
- raised the maximum file size for the crawler
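
The time-out wrapper mentioned above boils down to running the blocking JDK lookup inside a single-thread executor and giving up after a fixed deadline; the actual change is in Domains.java further down in this diff. A minimal standalone sketch of the pattern (class and method names here are illustrative, not the YaCy API):

    import java.net.InetAddress;
    import java.util.concurrent.Callable;
    import java.util.concurrent.ExecutorService;
    import java.util.concurrent.Executors;
    import java.util.concurrent.Future;
    import java.util.concurrent.TimeUnit;

    public class ReverseLookupWithTimeout {
        public static String hostNameOf(final InetAddress address) {
            // run the potentially blocking reverse DNS lookup in its own thread
            final ExecutorService service = Executors.newSingleThreadExecutor();
            final Future<String> task = service.submit(new Callable<String>() {
                public String call() { return address.getHostName(); }
            });
            service.shutdown();
            try {
                // wait at most 500 ms, the same deadline the patch below uses
                return task.get(500, TimeUnit.MILLISECONDS);
            } catch (Exception e) {
                // timeout, interruption or lookup failure: fall back to the numeric address
                task.cancel(true);
                return address.getHostAddress();
            }
        }
    }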

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7214 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 15 years ago
parent f6eebb6f99
commit 2c549ae341

@@ -674,10 +674,10 @@ crawler.clientTimeout=9000
crawler.http.acceptEncoding=gzip
crawler.http.acceptLanguage=en-us,en;q=0.5
crawler.http.acceptCharset=ISO-8859-1,utf-8;q=0.7,*;q=0.7
-crawler.http.maxFileSize=1048576
+crawler.http.maxFileSize=10485760
# ftp crawler specific settings; size in bytes
-crawler.ftp.maxFileSize=1048576
+crawler.ftp.maxFileSize=10485760
# smb crawler specific settings: maximum size
crawler.smb.maxFileSize=100000000

@@ -92,7 +92,6 @@ public class CrawlProfileEditor_p {
labels.add(new eentry(CrawlProfile.INDEX_TEXT, "Index Text", false, eentry.BOOLEAN));
labels.add(new eentry(CrawlProfile.INDEX_MEDIA, "Index Media", false, eentry.BOOLEAN));
labels.add(new eentry(CrawlProfile.STORE_HTCACHE, "Store in HTCache", false, eentry.BOOLEAN));
-labels.add(new eentry(CrawlProfile.STORE_TXCACHE, "Store in TXCache", false, eentry.BOOLEAN));
labels.add(new eentry(CrawlProfile.REMOTE_INDEXING, "Remote Indexing", false, eentry.BOOLEAN));
labels.add(new eentry(CrawlProfile.XSSTOPW, "Static stop-words", false, eentry.BOOLEAN));
labels.add(new eentry(CrawlProfile.XDSTOPW, "Dynamic stop-words", false, eentry.BOOLEAN));

@@ -188,15 +188,16 @@ public class CrawlResults {
try {
urle = sb.indexSegments.urlMetadata(Segments.Process.LOCALCRAWLING).load(entry.getKey().getBytes(), null, 0);
if (urle == null) {
-Log.logWarning("PLASMA", "CrawlResults: URL not in index with url hash "+ entry.getKey());
+Log.logWarning("PLASMA", "CrawlResults: URL not in index with url hash " + entry.getKey());
urlstr = null;
urltxt = null;
metadata = null;
-} else {
-metadata = urle.metadata();
-urlstr = metadata.url().toNormalform(false, true);
-urltxt = nxTools.shortenURLString(urlstr, 72); // shorten the string text like a URL
+continue;
}
+metadata = urle.metadata();
+urlstr = metadata.url().toNormalform(false, true);
+urltxt = nxTools.shortenURLString(urlstr, 72); // shorten the string text like a URL
initiatorSeed = entry.getValue() == null || entry.getValue().initiatorHash == null ? null : sb.peers.getConnected(new String(entry.getValue().initiatorHash));
executorSeed = entry.getValue() == null || entry.getValue().executorHash == null ? null : sb.peers.getConnected(new String(entry.getValue().executorHash));

@@ -36,7 +36,7 @@
#(/notintranet)#
#(servertable)#::
-<form id="servertable" action="CrawlStartIntranet_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8" ><fieldset>
+<form id="servertable" name="servertable" action="CrawlStartIntranet_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8" ><fieldset>
<legend><label for="servertable">Available Intranet Server</label></legend>
<table class="sortable" border="0" cellpadding="2" cellspacing="1">
<tr class="TableHeader" valign="bottom">

@@ -45,8 +45,7 @@
<span id="robotsOK"></span><img align="top" src="/env/grafics/empty.gif" name="ajax" alt="empty" />
</td>
</tr><tr>
-<td><input type="radio" name="crawlingMode" id="sitelist" value="sitelist" disabled="disabled"
-onmousedown="document.getElementById('rangeDomain').disabled=true;document.getElementById('rangeSubpath').disabled=true;document.getElementById('crawlingDomMaxCheck').disabled=true;document.getElementById('crawlingDomMaxPages').disabled=true;document.getElementById('crawlingQ').disabled=true;"/>Link-List of URL</td>
+<td><input type="radio" name="crawlingMode" id="sitelist" value="sitelist" disabled="disabled" />Link-List of URL</td>
<td><div id="sitelistURLs"></div></td>
</tr><tr>
<td><input type="radio" name="crawlingMode" id="sitemap" value="sitemap" disabled="disabled"

@@ -138,14 +138,13 @@ public class Crawler_p {
final boolean fullDomain = post.get("range", "wide").equals("domain"); // special property in simple crawl start
final boolean subPath = post.get("range", "wide").equals("subpath"); // special property in simple crawl start
// set the crawl filter
String newcrawlingMustMatch = post.get("mustmatch", CrawlProfile.MATCH_ALL);
String newcrawlingMustNotMatch = post.get("mustnotmatch", CrawlProfile.MATCH_NEVER);
if (newcrawlingMustMatch.length() < 2) newcrawlingMustMatch = CrawlProfile.MATCH_ALL; // avoid that all urls are filtered out if bad value was submitted
// special cases:
if (crawlingStartURL!= null && fullDomain) {
-newcrawlingMustMatch = ".*" + crawlingStartURL.getHost() + ".*";
+newcrawlingMustMatch = crawlingStartURL.isFile() ? "file:///.*" : crawlingStartURL.isSMB() ? "smb://.*" : ".*" + crawlingStartURL.getHost() + ".*";
}
if (crawlingStart!= null && subPath && (pos = crawlingStart.lastIndexOf('/')) > 0) {
newcrawlingMustMatch = crawlingStart.substring(0, pos + 1) + ".*";
@@ -203,7 +202,8 @@ public class Crawler_p {
final boolean indexMedia = post.get("indexMedia", "off").equals("on");
env.setConfig("indexMedia", (indexMedia) ? "true" : "false");
-final boolean storeHTCache = post.get("storeHTCache", "off").equals("on");
+boolean storeHTCache = post.get("storeHTCache", "off").equals("on");
+if (crawlingStartURL.isFile() || crawlingStartURL.isSMB()) storeHTCache = false;
env.setConfig("storeHTCache", (storeHTCache) ? "true" : "false");
final String cachePolicyString = post.get("cachePolicy", "iffresh");
@@ -247,15 +247,21 @@ public class Crawler_p {
// stack url
sb.crawler.profilesPassiveCrawls.remove(crawlingStartURL.hash()); // if there is an old entry, delete it
final CrawlProfile pe = new CrawlProfile(
-(crawlingStartURL.getHost() == null) ? Long.toHexString(System.currentTimeMillis()) : crawlingStartURL.getHost(),
+(crawlingStartURL.getHost() == null) ? crawlingStartURL.toNormalform(true, false) : crawlingStartURL.getHost(),
crawlingStartURL,
newcrawlingMustMatch,
newcrawlingMustNotMatch,
newcrawlingdepth,
-crawlingIfOlder, crawlingDomMaxPages,
+crawlingIfOlder,
+crawlingDomMaxPages,
crawlingQ,
indexText, indexMedia,
-storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw, cachePolicy);
+storeHTCache,
+crawlOrder,
+xsstopw,
+xdstopw,
+xpstopw,
+cachePolicy);
sb.crawler.profilesActiveCrawls.put(pe.handle().getBytes(), pe);
final String reasonString = sb.crawlStacker.stackCrawl(new Request(
sb.peers.mySeed().hash.getBytes(),
@@ -352,7 +358,8 @@ public class Crawler_p {
final Map<MultiProtocolURI, String> hyperlinks = scraper.getAnchors();
final DigestURI crawlURL = new DigestURI("file://" + file.toString(), null);
final CrawlProfile profile = new CrawlProfile(
-fileName, crawlURL,
+fileName,
+crawlURL,
newcrawlingMustMatch,
CrawlProfile.MATCH_NEVER,
newcrawlingdepth,
@@ -362,9 +369,10 @@ public class Crawler_p {
indexText,
indexMedia,
storeHTCache,
-true,
crawlOrder,
-xsstopw, xdstopw, xpstopw,
+xsstopw,
+xdstopw,
+xpstopw,
cachePolicy);
sb.crawler.profilesActiveCrawls.put(profile.handle().getBytes(), profile);
sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
@@ -405,15 +413,21 @@ public class Crawler_p {
try {
final DigestURI sitemapURL = new DigestURI(sitemapURLStr, null);
final CrawlProfile pe = new CrawlProfile(
-sitemapURLStr, sitemapURL,
-newcrawlingMustMatch,
+sitemapURLStr,
+sitemapURL,
+CrawlProfile.MATCH_ALL,
CrawlProfile.MATCH_NEVER,
-newcrawlingdepth,
-crawlingIfOlder, crawlingDomMaxPages,
-crawlingQ,
-indexText, indexMedia,
-storeHTCache, true, crawlOrder,
-xsstopw, xdstopw, xpstopw,
+0,
+crawlingIfOlder,
+crawlingDomMaxPages,
+true,
+indexText,
+indexMedia,
+storeHTCache,
+crawlOrder,
+xsstopw,
+xdstopw,
+xpstopw,
cachePolicy);
sb.crawler.profilesActiveCrawls.put(pe.handle().getBytes(), pe);
final SitemapImporter importer = new SitemapImporter(sb, sitemapURL, pe);
@@ -431,7 +445,7 @@ public class Crawler_p {
// download document
ContentScraper scraper = null;
scraper = sb.loader.parseResource(sitelistURL, CrawlProfile.CacheStrategy.IFFRESH);
-String title = scraper.getTitle();
+// String title = scraper.getTitle();
// String description = scraper.getDescription();
// get links and generate filter
@@ -444,7 +458,7 @@ public class Crawler_p {
// put links onto crawl queue
final CrawlProfile profile = new CrawlProfile(
-title == null || title.length() == 0 ? sitelistURL.getHost() : title,
+sitelistURL.getHost(),
sitelistURL,
newcrawlingMustMatch,
CrawlProfile.MATCH_NEVER,
@@ -455,9 +469,10 @@ public class Crawler_p {
indexText,
indexMedia,
storeHTCache,
-true,
crawlOrder,
-xsstopw, xdstopw, xpstopw,
+xsstopw,
+xdstopw,
+xpstopw,
cachePolicy);
sb.crawler.profilesActiveCrawls.put(profile.handle().getBytes(), profile);
sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);

@@ -157,7 +157,6 @@ public class QuickCrawlLink_p {
indexText,
indexMedia,
storeHTCache,
-true,
remoteIndexing,
xsstopw,
xdstopw,

@@ -36,6 +36,7 @@ import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import net.yacy.cora.document.MultiProtocolURI;
+import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.kelondro.order.Base64Order;
import net.yacy.kelondro.order.Digest;
@@ -111,7 +112,7 @@ public class SettingsAck_p {
final serverCore theServerCore = (serverCore) env.getThread("10_httpd");
try {
final InetSocketAddress theNewAddress = theServerCore.generateSocketAddress(port);
-final String hostName = theNewAddress.getHostName();
+final String hostName = Domains.getHostName(theNewAddress.getAddress());
prop.put("info_restart", "1");
prop.put("info_restart_ip",(hostName.equals("0.0.0.0"))? "localhost" : hostName);
prop.put("info_restart_port", theNewAddress.getPort());

@@ -26,11 +26,13 @@ public class getpageinfo_p {
prop.put("robots-allowed", "3"); //unknown
prop.put("sitemap", "");
prop.put("favicon","");
+prop.put("sitelist", "");
+prop.put("filter", ".*");
// default actions
String actions="title,robots";
-if(post!=null && post.containsKey("url")){
+if (post != null && post.containsKey("url")) {
if(post.containsKey("actions"))
actions=post.get("actions");
String url=post.get("url");
@@ -97,7 +99,7 @@ public class getpageinfo_p {
prop.putXML("filter", filter.length() > 0 ? filter.substring(1) : ".*");
}
}
-if(actions.indexOf("robots")>=0){
+if (actions.indexOf("robots")>=0) {
try {
final DigestURI theURL = new DigestURI(url, null);

@@ -48,14 +48,14 @@ function handleResponse(){
sitemap=response.getElementsByTagName("sitemap")[0].firstChild.nodeValue;
}
document.getElementsByName("sitemapURL")[0].value=sitemap;
-document.getElementById("sitemap").disabled=false;
+if (sitemap) document.getElementById("sitemap").disabled=false;
}
sitelist="";
if (response.getElementsByTagName("sitelist")[0].firstChild!=null){
sitelist=response.getElementsByTagName("sitelist")[0].firstChild.nodeValue;
}
document.getElementById("sitelistURLs").innerHTML = sitelist;
-document.getElementById("sitelist").disabled=false;
+if (sitelist) document.getElementById("sitelist").disabled=false;
// clear the ajax image
document.getElementsByName("ajax")[0].setAttribute("src", AJAX_OFF);

@@ -53,7 +53,6 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
public static final String INDEX_TEXT = "indexText";
public static final String INDEX_MEDIA = "indexMedia";
public static final String STORE_HTCACHE = "storeHTCache";
-public static final String STORE_TXCACHE = "storeTXCache";
public static final String REMOTE_INDEXING = "remoteIndexing";
public static final String XSSTOPW = "xsstopw";
public static final String XDSTOPW = "xdstopw";
@@ -64,17 +63,22 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
private Pattern mustmatch = null, mustnotmatch = null;
-public CrawlProfile(final String name, final DigestURI startURL,
+public CrawlProfile(
+final String name,
+final DigestURI startURL,
final String mustmatch,
final String mustnotmatch,
final int depth,
final long recrawlIfOlder /*date*/,
final int domMaxPages,
final boolean crawlingQ,
-final boolean indexText, final boolean indexMedia,
-final boolean storeHTCache, final boolean storeTXCache,
+final boolean indexText,
+final boolean indexMedia,
+final boolean storeHTCache,
final boolean remoteIndexing,
-final boolean xsstopw, final boolean xdstopw, final boolean xpstopw,
+final boolean xsstopw,
+final boolean xdstopw,
+final boolean xpstopw,
final CacheStrategy cacheStrategy) {
super(40);
if (name == null || name.length() == 0) throw new NullPointerException("name must not be null");
@@ -91,7 +95,6 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
put(INDEX_TEXT, indexText);
put(INDEX_MEDIA, indexMedia);
put(STORE_HTCACHE, storeHTCache);
-put(STORE_TXCACHE, storeTXCache);
put(REMOTE_INDEXING, remoteIndexing);
put(XSSTOPW, xsstopw); // exclude static stop-words
put(XDSTOPW, xdstopw); // exclude dynamic stop-word
@@ -218,11 +221,6 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
if (r == null) return false;
return (r.equals(Boolean.TRUE.toString()));
}
-public boolean storeTXCache() {
-final String r = get(STORE_TXCACHE);
-if (r == null) return false;
-return (r.equals(Boolean.TRUE.toString()));
-}
public boolean remoteIndexing() {
final String r = get(REMOTE_INDEXING);
if (r == null) return false;

@@ -170,7 +170,7 @@ public final class CrawlSwitchboard {
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE), -1, false,
true /*getConfigBool(PROXY_INDEXING_LOCAL_TEXT, true)*/,
true /*getConfigBool(PROXY_INDEXING_LOCAL_MEDIA, true)*/,
-true, true,
+true,
false /*getConfigBool(PROXY_INDEXING_REMOTE, false)*/, true, true, true,
CrawlProfile.CacheStrategy.IFFRESH);
this.profilesActiveCrawls.put(this.defaultProxyProfile.handle().getBytes(), this.defaultProxyProfile);
@@ -178,38 +178,38 @@ public final class CrawlSwitchboard {
if (this.defaultRemoteProfile == null) {
// generate new default entry for remote crawling
this.defaultRemoteProfile = new CrawlProfile(CRAWL_PROFILE_REMOTE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
--1, -1, true, true, true, false, true, false, true, true, false, CrawlProfile.CacheStrategy.IFFRESH);
+-1, -1, true, true, true, false, false, true, true, false, CrawlProfile.CacheStrategy.IFFRESH);
this.profilesActiveCrawls.put(this.defaultRemoteProfile.handle().getBytes(), this.defaultRemoteProfile);
}
if (this.defaultTextSnippetLocalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
this.defaultTextSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
-CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, true, false, false, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFFRESH);
+CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, true, false, false, true, false, true, true, false, CrawlProfile.CacheStrategy.IFFRESH);
this.profilesActiveCrawls.put(this.defaultTextSnippetLocalProfile.handle().getBytes(), this.defaultTextSnippetLocalProfile);
}
if (this.defaultTextSnippetGlobalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
this.defaultTextSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
-CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), -1, true, true, true, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST);
+CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), -1, true, true, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST);
this.profilesActiveCrawls.put(this.defaultTextSnippetGlobalProfile.handle().getBytes(), this.defaultTextSnippetGlobalProfile);
}
this.defaultTextSnippetGlobalProfile.setCacheStrategy(CrawlProfile.CacheStrategy.IFEXIST);
if (this.defaultMediaSnippetLocalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
this.defaultMediaSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
-CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE), -1, true, false, false, true, false, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST);
+CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE), -1, true, false, false, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST);
this.profilesActiveCrawls.put(this.defaultMediaSnippetLocalProfile.handle().getBytes(), this.defaultMediaSnippetLocalProfile);
}
if (this.defaultMediaSnippetGlobalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
this.defaultMediaSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
-CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), -1, true, false, true, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST);
+CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), -1, true, false, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST);
this.profilesActiveCrawls.put(this.defaultMediaSnippetGlobalProfile.handle().getBytes(), this.defaultMediaSnippetGlobalProfile);
}
if (this.defaultSurrogateProfile == null) {
// generate new default entry for surrogate parsing
this.defaultSurrogateProfile = new CrawlProfile(CRAWL_PROFILE_SURROGATE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
-CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE), -1, true, true, false, false, false, false, true, true, false, CrawlProfile.CacheStrategy.NOCACHE);
+CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE), -1, true, true, false, false, false, true, true, false, CrawlProfile.CacheStrategy.NOCACHE);
this.profilesActiveCrawls.put(this.defaultSurrogateProfile.handle().getBytes(), this.defaultSurrogateProfile);
}
}

@@ -159,6 +159,10 @@ public final class ResultURLs {
return resultDomains.get(stack);
}
+public void clearStacks() {
+for (EventOrigin origin: EventOrigin.values()) clearStack(origin);
+}
public synchronized void clearStack(final EventOrigin stack) {
final Map<String, InitExecEntry> resultStack = getStack(stack);
if (resultStack != null) resultStack.clear();

@@ -118,7 +118,11 @@ public final class HTTPLoader {
// FIXME: 30*-handling (bottom) is never reached
// we always get the final content because httpClient.followRedirects = true
-if (responseBody != null && (code == 200 || code == 203)) {
+if (responseBody == null) {
+// no response, reject file
+sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "no response body (you may increase the maxmimum file size)");
+throw new IOException("REJECTED EMPTY RESPONSE BODY '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString());
+} else if (code == 200 || code == 203) {
// the transfer is ok
// we write the new cache entry to file system directly
@@ -180,7 +184,7 @@ public final class HTTPLoader {
}
} else {
// if the response has not the right response type then reject file
-sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "wrong http status code " + code + ")");
+sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "wrong http status code " + code);
throw new IOException("REJECTED WRONG STATUS TYPE '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString());
}
return response;

@@ -858,6 +858,9 @@ public final class Switchboard extends serverSwitch {
this.queuesRoot = new File(new File(indexPrimaryPath, networkName), "QUEUES");
this.networkRoot.mkdirs();
this.queuesRoot.mkdirs();
+// clear statistic data
+this.crawlResults.clearStacks();
// relocate
this.crawlQueues.relocate(this.queuesRoot); // cannot be closed because the busy threads are working with that object

@@ -225,22 +225,9 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
// parse file url
String h = url.substring(p + 1);
if (h.startsWith("//")) {
-// host may be given, but may be also empty
-final int q = h.indexOf('/', 2);
-if (q <= 0) {
-// no host given
-host = null;
-path = h.substring(2);
-} else {
-host = h.substring(2, q);
-if (host.length() == 0 || host.equals("localhost")) host = null;
-h = h.substring(q);
-char c = h.charAt(2);
-if (c == ':' || c == '|')
-path = h.substring(1);
-else
-path = h;
-}
+// no host given
+host = null;
+path = h.substring(2);
} else {
host = null;
if (h.length() > 0 && h.charAt(0) == '/') {

@@ -23,11 +23,20 @@ package net.yacy.cora.protocol;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.util.ArrayList;
+import java.util.Collection;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
+import java.util.concurrent.Callable;
+import java.util.concurrent.CancellationException;
import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
import java.util.regex.Pattern;
import net.yacy.cora.storage.ARC;
@@ -454,6 +463,80 @@ public class Domains {
return false;
}
+public static String getHostName(final InetAddress i) {
+Collection<String> hosts = nameCacheHit.getKeys(i);
+if (hosts.size() > 0) return hosts.iterator().next();
+// call i.getHostName() using concurrency to interrupt execution in case of a time-out
+final Callable<String> callable = new Callable<String>() {
+public String call() { return i.getHostName(); }
+};
+ExecutorService service = Executors.newSingleThreadExecutor();
+final Future<String> taskFuture = service.submit(callable);
+Runnable t = new Runnable() {
+public void run() { taskFuture.cancel(true); }
+};
+service.execute(t);
+service.shutdown();
+try {
+return taskFuture.get(500, TimeUnit.MILLISECONDS);
+} catch (CancellationException e) {
+// callable was interrupted
+return i.getHostAddress();
+} catch (InterruptedException e) {
+// service was shutdown
+return i.getHostAddress();
+} catch(ExecutionException e) {
+// callable failed unexpectedly
+return i.getHostAddress();
+} catch (TimeoutException e) {
+// time-out
+return i.getHostAddress();
+}
+}
+public static InetAddress dnsResolve(final String hostx) {
+if ((hostx == null) || (hostx.length() == 0)) return null;
+final String host = hostx.toLowerCase().trim();
+// try to simply parse the address
+InetAddress ip = parseInetAddress(host);
+if (ip != null) return ip;
+// try to resolve host by doing a name cache lookup
+ip = nameCacheHit.get(host);
+if (ip != null) return ip;
+if (nameCacheMiss.containsKey(host)) return null;
+// call dnsResolveNetBased(host) using concurrency to interrupt execution in case of a time-out
+final Callable<InetAddress> callable = new Callable<InetAddress>() {
+public InetAddress call() { return dnsResolveNetBased(host); }
+};
+ExecutorService service = Executors.newSingleThreadExecutor();
+final Future<InetAddress> taskFuture = service.submit(callable);
+Runnable t = new Runnable() {
+public void run() { taskFuture.cancel(true); }
+};
+service.execute(t);
+service.shutdown();
+try {
+return taskFuture.get(500, TimeUnit.MILLISECONDS);
+} catch (CancellationException e) {
+// callable was interrupted
+return null;
+} catch (InterruptedException e) {
+// service was shutdown
+return null;
+} catch(ExecutionException e) {
+// callable failed unexpectedly
+return null;
+} catch (TimeoutException e) {
+// time-out
+return null;
+}
+}
private static final InetAddress parseInetAddress(final String ip) {
if (ip == null) return null;
if (ip.length() < 8) return null;
@@ -474,33 +557,21 @@
return null;
}
}
-public static InetAddress dnsResolve(String host) {
-if ((host == null) || (host.length() == 0)) return null;
-host = host.toLowerCase().trim();
-// try to simply parse the address
-InetAddress ip = parseInetAddress(host);
-if (ip != null) return ip;
-// try to resolve host by doing a name cache lookup
-ip = nameCacheHit.get(host);
-if (ip != null) return ip;
-if (nameCacheMiss.containsKey(host)) return null;
-//System.out.println("***DEBUG dnsResolve(" + host + ")");
+private static InetAddress dnsResolveNetBased(String host) {
try {
boolean doCaching = true;
-ip = InetAddress.getByName(host); // this makes the DNS request to backbone
+InetAddress ip = InetAddress.getByName(host); // this makes the DNS request to backbone
if ((ip == null) ||
(ip.isLoopbackAddress()) ||
(nameCacheNoCachingList.containsKey(host))
) {
doCaching = false;
} else {
if (matchesList(host, nameCacheNoCachingPatterns)) {
nameCacheNoCachingList.put(host, PRESENT);
doCaching = false;
}
}
if (doCaching && ip != null) {
@@ -519,6 +590,7 @@ public class Domains {
return null;
}
/**
* Returns the number of entries in the nameCacheHit map
*
@@ -565,7 +637,7 @@ public class Domains {
public void run() {
String lhn = localHostName;
try {
-lhn = InetAddress.getLocalHost().getHostName();
+lhn = getHostName(InetAddress.getLocalHost());
} catch (UnknownHostException e) {}
try {
localHostAddresses = InetAddress.getAllByName(lhn);
@@ -656,7 +728,8 @@ public class Domains {
// finally check if there are other local IP addresses that are not in
// the standard IP range
for (int i = 0; i < localHostAddresses.length; i++) {
-if (localHostAddresses[i].getHostName().equals(host)) return true;
+String hostname = getHostName(localHostAddresses[i]);
+if (hostname != null && hostname.equals(host)) return true;
if (localHostAddresses[i].getHostAddress().equals(host)) return true;
}

@@ -121,7 +121,8 @@ public class Scanner extends Thread {
private void addProtocol(String protocol, boolean bigrange) {
for (InetAddress i: genlist(bigrange)) {
try {
-this.scanqueue.put(new MultiProtocolURI(protocol + "://" + i.getHostName() + "/"));
+this.scanqueue.put(new MultiProtocolURI(protocol + "://" + Domains.getHostName(i) + "/"));
} catch (MalformedURLException e) {
Log.logException(e);
} catch (InterruptedException e) {

@@ -21,6 +21,7 @@
package net.yacy.cora.storage;
+import java.util.Collection;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
@@ -62,14 +63,21 @@ public interface ARC<K, V> extends Iterable<Map.Entry<K, V>> {
* @return the value
*/
public V get(K s);
+/**
+* check if the map contains the value
+* @param value
+* @return the keys that have the given value
+*/
+public Collection<K> getKeys(V value);
/**
* check if the map contains the key
-* @param s
-* @return
+* @param key
+* @return true if the map contains the key
*/
-public boolean containsKey(K s);
+public boolean containsKey(K key);
/**
* remove an entry from the cache
* @param s

@@ -21,6 +21,8 @@
package net.yacy.cora.storage;
import java.util.AbstractMap;
+import java.util.ArrayList;
+import java.util.Collection;
import java.util.Comparator;
import java.util.HashSet;
import java.util.Iterator;
@@ -105,6 +107,17 @@ public final class ConcurrentARC<K, V> extends AbstractMap<K, V> implements Map<
return this.arc[getPartition(s)].get((K) s);
}
+/**
+* check if the map contains the value
+* @param value
+* @return the keys that have the given value
+*/
+public Collection<K> getKeys(V value) {
+ArrayList<K> keys = new ArrayList<K>();
+for (int i = 0; i < this.arc.length; i++) keys.addAll(this.arc[i].getKeys(value));
+return keys;
+}
/**
* check if the map contains the key
* @param s

@@ -22,6 +22,8 @@
package net.yacy.cora.storage;
import java.util.AbstractMap;
+import java.util.ArrayList;
+import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
@@ -98,6 +100,26 @@ abstract class SimpleARC<K, V> extends AbstractMap<K, V> implements Map<K, V>, I
assert (this.levelB.size() <= cacheSize); // the cache should shrink automatically
return v;
}
+/**
+* check if the map contains the value
+* @param value
+* @return the keys that have the given value
+*/
+public Collection<K> getKeys(V value) {
+ArrayList<K> keys = new ArrayList<K>();
+synchronized (this.levelB) {
+for (Map.Entry<K, V> entry: this.levelB.entrySet()) {
+if (value.equals(entry.getValue())) keys.add(entry.getKey());
+}
+}
+synchronized (this) {
+for (Map.Entry<K, V> entry: this.levelA.entrySet()) {
+if (value.equals(entry.getValue())) keys.add(entry.getKey());
+}
+}
+return keys;
+}
/**
* check if the map contains the key

@@ -305,6 +305,7 @@ public class Table implements Index, Iterable<Row.Entry> {
assert table == null || table.size() == index.size() : "table.size() = " + table.size() + ", index.size() = " + index.size();
}
final HashMap<String, String> map = new HashMap<String, String>(8);
+if (index == null) return map; // possibly closed or beeing closed
map.put("tableSize", Integer.toString(index.size()));
map.put("tableKeyChunkSize", Integer.toString(index.row().objectsize));
map.put("tableKeyMem", Integer.toString(index.row().objectsize * index.size()));
