Merge branch 'master' of git://gitorious.org/yacy/rc1 into blacklist_structure

pull/1/head
Felix Ableitner 12 years ago
commit 376f9cd9d0

Binary file not shown.

[Image diff: 4.5 KiB before, 22 KiB after]

@ -226,7 +226,9 @@ Section "Uninstall"
RMDir /r "$INSTDIR\classes"
RMDir /r "$INSTDIR\defaults"
RMDir /r "$INSTDIR\htroot"
RMDir /r "$INSTDIR\langstats"
RMDir /r "$INSTDIR\lib"
RMDir /r "$INSTDIR\libbuild"
RMDir /r "$INSTDIR\libx"
RMDir /r "$INSTDIR\locales"
RMDir /r "$INSTDIR\ranking"

@ -3,7 +3,7 @@ javacSource=1.6
javacTarget=1.6
# Release Configuration
releaseVersion=1.51
releaseVersion=1.52
stdReleaseFile=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
sourceReleaseFile=yacy_src_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
releaseFileParentDir=yacy

@ -90,9 +90,10 @@
and back compatibility is not guaranteed. Names with both leading and
trailing underscores (e.g. _version_) are reserved.
-->
<field name="id" type="string" indexed="true" stored="true" required="true" />
<field name="sku" type="text_en_splitting_tight" indexed="true" stored="true" omitNorms="true"/>
<field name="sku" type="string" indexed="true" stored="true" omitNorms="true"/>
<!--<field name="sku" type="text_en_splitting_tight" indexed="true" stored="true" omitNorms="true"/>-->
<field name="name" type="text_general" indexed="true" stored="true"/>
<field name="manu" type="text_general" indexed="true" stored="true" omitNorms="true"/>
<field name="cat" type="string" indexed="true" stored="true" multiValued="true"/>

@ -44,6 +44,9 @@ java.util.logging.FileHandler.level = ALL
java.util.logging.FileHandler.formatter=net.yacy.kelondro.logging.SimpleLogFormatter
java.util.logging.FileHandler.encoding=UTF-8
# Properties for crawler
net.yacy.crawler.robots.RobotsTxt.level = SEVERE
# Properties for the GuiHandler
net.yacy.kelondro.logging.GuiHandler.level = ALL
net.yacy.kelondro.logging.GuiHandler.formatter=net.yacy.kelondro.logging.SimpleLogFormatter
@ -79,18 +82,42 @@ httpclient.wire.level = OFF
org.apache.pdfbox.level = INFO
# Properties for solr
org.apache.solr.core.Config.level = INFO
org.apache.solr.client.solrj.impl.HttpClientUtil.level = SEVERE
org.apache.solr.core.Config.level = SEVERE
org.apache.solr.core.CoreContainer.level = SEVERE
org.apache.solr.core.JmxMonitoredMap.level = SEVERE
org.apache.solr.core.RequestHandlers.level = SEVERE
org.apache.solr.core.SolrConfig.level = SEVERE
org.apache.solr.core.SolrCore.level = SEVERE
org.apache.solr.core.SolrResourceLoader.level = INFO
org.apache.solr.core.SolrResourceLoader.level = SEVERE
org.apache.solr.core.CachingDirectoryFactory.level = OFF
org.apache.solr.util.plugin.AbstractPluginLoader.level = INFO
org.apache.solr.schema.IndexSchema.level = INFO
org.apache.solr.schema.FieldTypePluginLoader.level = INFO
org.apache.solr.handler.component.HttpShardHandlerFactory.level = SEVERE
org.apache.solr.handler.component.QueryElevationComponent.level = SEVERE
org.apache.solr.handler.component.SearchHandler.level = SEVERE
org.apache.solr.handler.component.SpellCheckComponent.level = SEVERE
org.apache.solr.handler.loader.XMLLoader.level = SEVERE
org.apache.solr.handler.ReplicationHandler.level = SEVERE
org.apache.solr.handler.UpdateRequestHandler.level = INFO
org.apache.solr.handler.loader.XMLLoader.level = INFO
org.apache.solr.search.SolrIndexSearcher.level = INFO
org.apache.solr.response.XSLTResponseWriter.level = SEVERE
org.apache.solr.schema.FileExchangeRateProvider.level = SEVERE
org.apache.solr.schema.IndexSchema.level = SEVERE
org.apache.solr.search.SolrIndexSearcher.level = SEVERE
org.apache.solr.spelling.DirectSolrSpellChecker.level = SEVERE
org.apache.solr.update.processor.LogUpdateProcessor.level = OFF
org.apache.solr.update.DefaultSolrCoreState.level = SEVERE
org.apache.solr.update.SolrCoreState.level = SEVERE
org.apache.solr.update.SolrIndexWriter.level = INFO
org.apache.solr.update.UpdateHandler.level = SEVERE
# Properties for jena
com.hp.hpl.jena.util.FileManager.level = SEVERE
com.hp.hpl.jena.util.LocationMapper.level = SEVERE
com.hp.hpl.jena.util.LocatorClassLoader.level = SEVERE
com.hp.hpl.jena.util.LocatorFile.level = SEVERE
com.hp.hpl.jena.util.SystemUtils.level = SEVERE
# Properties for the YaCy solr interface
net.yacy.cora.federate.solr.connector.SolrServerConnector.level = INFO
@ -98,3 +125,6 @@ net.yacy.cora.federate.solr.connector.SolrServerConnector.level = INFO
# java properties
javax.management.misc.level = INFO
javax.management.mbeanserver.level = INFO
# Properties for Collection
CollectionConfiguration.CRHost.level = SEVERE

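The hunk above raises most Solr and Jena loggers to SEVERE (or OFF) to quiet the crawler's log. Each "name.level = LEVEL" line configures the java.util.logging logger of that name. A minimal standalone demo of the same mechanism, reusing one logger name from the diff (the demo class itself is illustrative, not YaCy code):

import java.util.logging.Level;
import java.util.logging.Logger;

public class LogLevelDemo {
    public static void main(String[] args) {
        // same effect as "org.apache.solr.core.SolrCore.level = SEVERE" above:
        // messages below SEVERE are dropped by the logger
        Logger solrCore = Logger.getLogger("org.apache.solr.core.SolrCore");
        solrCore.setLevel(Level.SEVERE);
        solrCore.info("suppressed");     // below the threshold, not logged
        solrCore.severe("still logged"); // passes the threshold
    }
}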
@ -118,6 +118,8 @@
<p>manual update:<br/>apt-get update &amp;&amp; apt-get install yacy</p>
<p>automatic update: add the following line to /etc/crontab<br/>0 6 * * * root apt-get update &amp;&amp; apt-get -y --force-yes install yacy</p>
::
<p>YaCy has been installed to the Program Files directory. Automatic update is not possible.<br/>Download and install the latest version from the web page <a href="http://www.yacy.net/">http://www.yacy.net/</a></p>
::
#(/candeploy)#
#%env/templates/footer.template%#

@ -55,6 +55,9 @@ public class ConfigUpdate_p {
if (yacyBuildProperties.isPkgManager()) {
prop.put("candeploy", "2");
return prop;
} else if (OS.isWindows && sb.appPath.toString().indexOf("Program Files") > -1) {
prop.put("candeploy", "3");
return prop;
} else if (OS.canExecUnix || OS.isWindows) {
// we can deploy a new system with (i.e.)
// cd DATA/RELEASE;tar xfz $1;cp -Rf yacy/* ../../;rm -Rf yacy

@ -28,7 +28,6 @@ import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.data.CrawlQueues;
import net.yacy.crawler.retrieval.Request;
import net.yacy.crawler.retrieval.Response;
import net.yacy.crawler.robots.RobotsTxtEntry;
@ -88,19 +87,19 @@ public class CrawlCheck_p {
robotsEntry = sb.robots.getEntry(u, sb.peers.myBotIDs());
if (robotsEntry == null) {
prop.put("table_list_" + row + "_robots", "no robots");
prop.put("table_list_" + row + "_crawldelay", CrawlQueues.queuedMinLoadDelay + " ms");
prop.put("table_list_" + row + "_crawldelay", ClientIdentification.minLoadDelay() + " ms");
prop.put("table_list_" + row + "_sitemap", "");
} else {
robotsAllowed = !robotsEntry.isDisallowed(u);
prop.put("table_list_" + row + "_robots", "robots exist: " + (robotsAllowed ? "crawl allowed" : "url disallowed"));
prop.put("table_list_" + row + "_crawldelay", Math.max(CrawlQueues.queuedMinLoadDelay, robotsEntry.getCrawlDelayMillis()) + " ms");
prop.put("table_list_" + row + "_crawldelay", Math.max(ClientIdentification.minLoadDelay(), robotsEntry.getCrawlDelayMillis()) + " ms");
prop.put("table_list_" + row + "_sitemap", robotsEntry.getSitemap() == null ? "-" : robotsEntry.getSitemap().toNormalform(true));
}
// try to load the url
if (robotsAllowed) try {
Request request = sb.loader.request(u, true, false);
final Response response = sb.loader.load(request, CacheStrategy.NOCACHE, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
final Response response = sb.loader.load(request, CacheStrategy.NOCACHE, BlacklistType.CRAWLER, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
if (response == null) {
prop.put("table_list_" + row + "_access", "no response");
} else {

@ -43,7 +43,6 @@ import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.crawler.CrawlSwitchboard;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.CrawlQueues;
import net.yacy.crawler.data.ZURL.FailCategory;
import net.yacy.crawler.retrieval.Request;
import net.yacy.crawler.retrieval.SitemapImporter;
@ -288,7 +287,7 @@ public class Crawler_p {
// download document
Document scraper;
try {
scraper = sb.loader.loadDocument(sitelistURL, CacheStrategy.IFFRESH, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
scraper = sb.loader.loadDocument(sitelistURL, CacheStrategy.IFFRESH, BlacklistType.CRAWLER, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
// get links and generate filter
for (DigestURI u: scraper.getAnchors().keySet()) {
newRootURLs.add(u);
@ -445,6 +444,7 @@ public class Crawler_p {
0,
0,
0),
null,
sb.peers.mySeed().hash.getBytes(),
new Date(),
1,

@ -27,7 +27,6 @@ import net.yacy.cora.geo.OpenGeoDBLocation;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.data.CrawlQueues;
import net.yacy.crawler.retrieval.Response;
import net.yacy.document.LibraryProvider;
import net.yacy.kelondro.data.meta.DigestURI;
@ -67,7 +66,7 @@ public class DictionaryLoader_p {
if (post.containsKey("geon0Load")) {
// load from the net
try {
final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON0.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON0.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
final byte[] b = response.getContent();
FileUtils.copy(b, LibraryProvider.Dictionary.GEON0.file());
LibraryProvider.geoLoc.activateLocation(LibraryProvider.Dictionary.GEON0.nickname, new GeonamesLocation(LibraryProvider.Dictionary.GEON0.file(), null, -1));
@ -109,7 +108,7 @@ public class DictionaryLoader_p {
if (post.containsKey("geon1Load")) {
// load from the net
try {
final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON1.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON1.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
final byte[] b = response.getContent();
FileUtils.copy(b, LibraryProvider.Dictionary.GEON1.file());
LibraryProvider.geoLoc.activateLocation(LibraryProvider.Dictionary.GEON1.nickname, new GeonamesLocation(LibraryProvider.Dictionary.GEON1.file(), null, -1));
@ -151,7 +150,7 @@ public class DictionaryLoader_p {
if (post.containsKey("geon2Load")) {
// load from the net
try {
final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON2.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON2.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
final byte[] b = response.getContent();
FileUtils.copy(b, LibraryProvider.Dictionary.GEON2.file());
LibraryProvider.geoLoc.activateLocation(LibraryProvider.Dictionary.GEON2.nickname, new GeonamesLocation(LibraryProvider.Dictionary.GEON2.file(), null, 100000));
@ -193,7 +192,7 @@ public class DictionaryLoader_p {
if (post.containsKey("geo1Load")) {
// load from the net
try {
final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEODB1.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEODB1.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
final byte[] b = response.getContent();
FileUtils.copy(b, LibraryProvider.Dictionary.GEODB1.file());
LibraryProvider.geoLoc.deactivateLocalization(LibraryProvider.Dictionary.GEODB1.nickname);
@ -236,7 +235,7 @@ public class DictionaryLoader_p {
if (post.containsKey("drw0Load")) {
// load from the net
try {
final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.DRW0.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.DRW0.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
final byte[] b = response.getContent();
FileUtils.copy(b, LibraryProvider.Dictionary.DRW0.file());
LibraryProvider.activateDeReWo();
@ -280,7 +279,7 @@ public class DictionaryLoader_p {
if (post.containsKey("pnd0Load")) {
// load from the net
try {
final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.PND0.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.PND0.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
final byte[] b = response.getContent();
FileUtils.copy(b, LibraryProvider.Dictionary.PND0.file());
LibraryProvider.activatePND();

@ -53,7 +53,7 @@ public class IndexReIndexMonitor_p {
prop.put("showstartbutton", 0);
}
} else {
if (post != null && post.containsKey("reindexnow")) {
if (post != null && post.containsKey("reindexnow") && sb.index.fulltext().connectedLocalSolr()) {
migration.reindexToschema(sb);
prop.put("showstartbutton", 0);
prop.put("querysize", "0");

@ -42,7 +42,6 @@ import net.yacy.cora.util.CommonPattern;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.crawler.HarvestProcess;
import net.yacy.crawler.data.CrawlQueues;
import net.yacy.crawler.retrieval.RSSLoader;
import net.yacy.crawler.retrieval.Response;
import net.yacy.data.WorkTables;
@ -267,7 +266,7 @@ public class Load_RSS_p {
RSSReader rss = null;
if (url != null) try {
prop.put("url", url.toNormalform(true));
final Response response = sb.loader.load(sb.loader.request(url, true, false), CacheStrategy.NOCACHE, Integer.MAX_VALUE, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
final Response response = sb.loader.load(sb.loader.request(url, true, false), CacheStrategy.NOCACHE, Integer.MAX_VALUE, BlacklistType.CRAWLER, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
final byte[] resource = response == null ? null : response.getContent();
rss = resource == null ? null : RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, resource);
} catch (final IOException e) {

@ -32,6 +32,7 @@ import java.util.Date;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.Memory;
import net.yacy.kelondro.io.ByteCount;
import net.yacy.kelondro.util.Formatter;
import net.yacy.kelondro.util.MemoryControl;
@ -317,6 +318,7 @@ public class Status
prop.put("totalMemory", Formatter.bytesToString(MemoryControl.total()));
prop.put("maxMemory", Formatter.bytesToString(MemoryControl.maxMemory()));
prop.put("processors", WorkflowProcessor.availableCPU);
prop.put("load", Memory.load());
// proxy traffic
//prop.put("trafficIn",bytesToString(httpdByteCountInputStream.getGlobalCount()));

@ -3,18 +3,17 @@
<legend>System Status</legend>
<dl>
<dt>Process</dt>
<dd>#[versionpp]#
<dt>System</dt>
<dd>YaCy version #[versionpp]#
#(peerStatistics)#
Unknown
::
<div>Uptime: #[uptime]#</div>
#(/peerStatistics)#
</dd>
<dt>System Resources</dt>
<dd>Processors: #[processors]#</dd>
#(/peerStatistics)#
<div>Processors: #[processors]#</div>
<div>Load: #[load]#</div>
</dd>
<dt>Protection</dt>
<dd>#(protection)#
<strong>Password is missing.</strong>

@ -45,7 +45,6 @@ import net.yacy.cora.lod.vocabulary.YaCyMetadata;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.crawler.data.Cache;
import net.yacy.crawler.data.CrawlQueues;
import net.yacy.crawler.retrieval.Response;
import net.yacy.document.Condenser;
import net.yacy.document.Document;
@ -169,7 +168,7 @@ public class ViewFile {
Response response = null;
try {
response = sb.loader.load(sb.loader.request(url, true, false), authorized ? CacheStrategy.IFEXIST : CacheStrategy.CACHEONLY, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
response = sb.loader.load(sb.loader.request(url, true, false), authorized ? CacheStrategy.IFEXIST : CacheStrategy.CACHEONLY, Integer.MAX_VALUE, null, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
} catch (final IOException e) {
prop.put("error", "4");
prop.put("error_errorText", "error loading resource: " + e.getMessage());

@ -39,7 +39,6 @@ import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.storage.ConcurrentARC;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.data.CrawlQueues;
import net.yacy.data.URLLicense;
import net.yacy.document.ImageParser;
import net.yacy.kelondro.data.meta.DigestURI;
@ -105,7 +104,7 @@ public class ViewImage {
if (image == null) {
byte[] resourceb = null;
if (url != null) try {
resourceb = sb.loader.loadContent(sb.loader.request(url, false, true), CacheStrategy.IFEXIST, BlacklistType.SEARCH, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
resourceb = sb.loader.loadContent(sb.loader.request(url, false, true), CacheStrategy.IFEXIST, BlacklistType.SEARCH, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
} catch (final IOException e) {
ConcurrentLog.fine("ViewImage", "cannot load: " + e.getMessage());
}

@ -37,7 +37,6 @@ import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.data.CrawlQueues;
import net.yacy.crawler.robots.RobotsTxtEntry;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.repository.Blacklist.BlacklistType;
@ -97,7 +96,7 @@ public class getpageinfo {
}
net.yacy.document.Document scraper = null;
if (u != null) try {
scraper = sb.loader.loadDocument(u, CacheStrategy.IFEXIST, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
scraper = sb.loader.loadDocument(u, CacheStrategy.IFEXIST, BlacklistType.CRAWLER, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
} catch (final IOException e) {
ConcurrentLog.logException(e);
// bad things are possible, i.e. that the Server responds with "403 Bad Behavior"

@ -37,7 +37,6 @@ import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.data.CrawlQueues;
import net.yacy.crawler.robots.RobotsTxtEntry;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.repository.Blacklist.BlacklistType;
@ -97,7 +96,7 @@ public class getpageinfo_p {
}
net.yacy.document.Document scraper = null;
if (u != null) try {
scraper = sb.loader.loadDocument(u, CacheStrategy.IFEXIST, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
scraper = sb.loader.loadDocument(u, CacheStrategy.IFEXIST, BlacklistType.CRAWLER, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
} catch (final IOException e) {
ConcurrentLog.logException(e);
// bad things are possible, i.e. that the Server responds with "403 Bad Behavior"

@ -26,9 +26,9 @@ import java.util.Iterator;
import java.util.Map;
import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.crawler.data.Latency;
import net.yacy.crawler.data.NoticedURL;
import net.yacy.crawler.data.Latency.Host;
import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch;
@ -52,7 +52,7 @@ public class latency_p {
prop.put("domains_" + c + "_count", host.count());
prop.put("domains_" + c + "_average", host.average());
prop.put("domains_" + c + "_robots", host.robotsDelay());
prop.put("domains_" + c + "_flux", host.flux(NoticedURL.minimumGlobalDeltaInit));
prop.put("domains_" + c + "_flux", host.flux(ClientIdentification.minimumGlobalDeltaInit));
c++;
}
prop.put("domains", c);

@ -26,6 +26,9 @@
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.crawler.CrawlSwitchboard;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.kelondro.index.RowHandleSet;
import net.yacy.kelondro.io.ByteCount;
import net.yacy.kelondro.util.MemoryControl;
import net.yacy.kelondro.workflow.WorkflowProcessor;
@ -100,6 +103,29 @@ public class status_p {
prop.putNum("noloadCrawlSize", sb.crawlQueues.noloadCrawlJobSize());
prop.put("noloadCrawlState", STATE_RUNNING);
// generate crawl profile table
int count = 0;
final int domlistlength = (post == null) ? 160 : post.getInt("domlistlength", 160);
CrawlProfile profile;
// put active crawls into list
String hosts = "";
for (final byte[] h: sb.crawler.getActive()) {
profile = sb.crawler.getActive(h);
if (CrawlSwitchboard.DEFAULT_PROFILES.contains(profile.name())) continue;
profile.putProfileEntry("crawlProfiles_list_", prop, true, false, count, domlistlength);
RowHandleSet urlhashes = sb.crawler.getURLHashes(h);
prop.put("crawlProfiles_list_" + count + "_count", urlhashes == null ? "unknown" : Integer.toString(urlhashes.size()));
if (profile.urlMustMatchPattern() == CrawlProfile.MATCH_ALL_PATTERN) {
hosts = hosts + "," + profile.name();
}
count++;
}
prop.put("crawlProfiles_list", count);
prop.put("crawlProfiles_count", count);
prop.put("crawlProfiles", count == 0 ? 0 : 1);
prop.put("postprocessingRunning", Switchboard.postprocessingRunning ? 1 : 0);
// return rewrite properties
return prop;
}

@ -49,5 +49,24 @@
<size>#[noloadCrawlSize]#</size>
<state>#[noloadCrawlState]#</state>
</noloadcrawlerqueue>
<!-- crawl profile list -->
#(crawlProfiles)#::
<crawls count="#[count]#">
#{list}#
<crawl>
<name>#[name]#</name>
<count>#[count]#</count>
<handle>#[handle]#</handle>
<depth>#[depth]#</depth>
<status>#(terminateButton)#terminated::alive#(/terminateButton)#</status>
</crawl>
#{/list}#
</crawls>
#(/crawlProfiles)#
<postprocessing>
<status>#(postprocessingRunning)#idle::busy#(/postprocessingRunning)#</status>
</postprocessing>
</status>

@ -35,7 +35,6 @@ import net.yacy.cora.order.Base64Order;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.data.CrawlQueues;
import net.yacy.kelondro.data.citation.CitationReference;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.rwi.IndexCell;
@ -98,7 +97,7 @@ public class webstructure {
prop.put("references", 1);
net.yacy.document.Document scraper = null;
if (url != null) try {
scraper = sb.loader.loadDocument(url, CacheStrategy.IFEXIST, null, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
scraper = sb.loader.loadDocument(url, CacheStrategy.IFEXIST, null, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
} catch (final IOException e) {
ConcurrentLog.logException(e);
}

[Binary image files changed; sizes unchanged]

@ -163,6 +163,7 @@ public final class crawlReceipt {
sb.crawlQueues.delegatedURL.remove(entry.hash()); // the delegated work is transformed into an error case
sb.crawlQueues.errorURL.push(
entry.toBalancerEntry(iam),
null,
youare.getBytes(),
null,
0,

[More binary image files changed; sizes unchanged]

@ -82,6 +82,7 @@ public class urls {
// place url to notice-url db
sb.crawlQueues.delegatedURL.push(
entry,
null,
sb.peers.mySeed().hash.getBytes(),
new Date(),
0,

@ -100,8 +100,8 @@ public class yacysearch {
final Switchboard sb = (Switchboard) env;
sb.localSearchLastAccess = System.currentTimeMillis();
final boolean searchAllowed =
sb.getConfigBool("publicSearchpage", true) || sb.verifyAuthentication(header);
final boolean authorized = sb.verifyAuthentication(header);
final boolean searchAllowed = sb.getConfigBool("publicSearchpage", true) || authorized;
boolean authenticated = sb.adminAuthenticated(header) >= 2;
if ( !authenticated ) {
@ -161,7 +161,7 @@ public class yacysearch {
boolean p2pmode = sb.peers != null && sb.peers.sizeConnected() > 0 && indexReceiveGranted;
boolean global = post == null || (post.get("resource", "local").equals("global") && p2pmode);
boolean stealthmode = p2pmode && !global;
prop.put("topmenu_resource-select", stealthmode ? 2 : global ? 1 : 0);
prop.put("topmenu_resource-select", !authorized ? 0 : stealthmode ? 2 : global ? 1 : 0);
if ( post == null || indexSegment == null || env == null || !searchAllowed ) {
if (indexSegment == null) ConcurrentLog.info("yacysearch", "indexSegment == null");

@ -125,7 +125,9 @@ public class InstanceMirror {
public EmbeddedSolrConnector getDefaultEmbeddedConnector() {
if (this.defaultEmbeddedConnector != null) return this.defaultEmbeddedConnector;
this.defaultEmbeddedConnector = this.solr0 == null ? null : new EmbeddedSolrConnector(this.solr0);
this.embeddedCache.put(this.getDefaultCoreName(), this.defaultEmbeddedConnector);
String coreName = this.getDefaultCoreName();
if (coreName == null) return null;
this.embeddedCache.put(coreName, this.defaultEmbeddedConnector);
return this.defaultEmbeddedConnector;
}

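The added null check prevents caching the connector under a null core name. Assuming embeddedCache is a concurrent map (the diff does not show its type), a null key would throw a NullPointerException instead of simply returning null to the caller; a small standalone demo of that behavior:

import java.util.concurrent.ConcurrentHashMap;

public class NullKeyDemo {
    public static void main(String[] args) {
        ConcurrentHashMap<String, Object> cache = new ConcurrentHashMap<>();
        try {
            cache.put(null, new Object()); // concurrent maps reject null keys
        } catch (NullPointerException e) {
            System.out.println("null key rejected: " + e);
        }
    }
}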
@ -27,7 +27,10 @@ package net.yacy.cora.protocol;
public class ClientIdentification {
public static final long MIN_LOAD_DELAY = 500;
public static final int DEFAULT_TIMEOUT = 10000;
public static final int minimumLocalDeltaInit = 10; // the minimum time difference between access of the same local domain
public static final int minimumGlobalDeltaInit = 500; // the minimum time difference between access of the same global domain
/**
* provide system information (this is part of YaCy protocol)
@ -116,4 +119,8 @@ public class ClientIdentification {
return location;
}
public static long minLoadDelay() {
return MIN_LOAD_DELAY;
}
}

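Together with the NoticedURL and CrawlQueues hunks elsewhere in this commit, this change consolidates the crawler's timing constants in ClientIdentification. A compilable sketch of the resulting class, with the values copied from the diff and everything else omitted:

public class ClientIdentification {
    public static final long MIN_LOAD_DELAY = 500;   // ms between two loads
    public static final int DEFAULT_TIMEOUT = 10000; // ms
    // minimum time difference between accesses of the same local/global domain
    public static final int minimumLocalDeltaInit = 10;
    public static final int minimumGlobalDeltaInit = 500;

    public static long minLoadDelay() {
        return MIN_LOAD_DELAY;
    }
}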
@ -20,6 +20,9 @@
package net.yacy.cora.util;
import java.lang.management.ManagementFactory;
import java.lang.management.ThreadInfo;
public class Memory {
private static final Runtime runtime = Runtime.getRuntime();
@ -45,7 +48,7 @@ public class Memory {
* @return bytes
*/
public static final long maxMemory() {
return runtime.maxMemory();
return runtime.maxMemory(); // can be Long.MAX_VALUE if unlimited
}
/**
@ -63,5 +66,34 @@ public class Memory {
public static final long used() {
return total() - free();
}
/**
* get the system load within the last minute
* @return the system load or a negative number if the load is not available
*/
public static double load() {
return ManagementFactory.getOperatingSystemMXBean().getSystemLoadAverage();
}
/**
* find out the number of thread deadlocks. WARNING: this is a time-consuming task
* @return the number of deadlocked threads
*/
public static long deadlocks() {
long[] deadlockIDs = ManagementFactory.getThreadMXBean().findDeadlockedThreads();
if (deadlockIDs == null) return 0;
return deadlockIDs.length;
}
/**
* write deadlocked threads to the log as warning
*/
public static void logDeadlocks() {
long[] deadlockIDs = ManagementFactory.getThreadMXBean().findDeadlockedThreads();
if (deadlockIDs == null) return;
ThreadInfo[] infos = ManagementFactory.getThreadMXBean().getThreadInfo(deadlockIDs, true, true);
for (ThreadInfo ti : infos) {
ConcurrentLog.warn("DEADLOCKREPORT", ti.toString());
}
}
}

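The new load() and deadlock helpers in Memory wrap standard JMX management beans, so they can be tried outside YaCy. A self-contained demo of the same calls (plain JDK, no YaCy types):

import java.lang.management.ManagementFactory;

public class MemoryDemo {
    public static void main(String[] args) {
        // one-minute system load average; negative if the platform cannot provide it
        double load = ManagementFactory.getOperatingSystemMXBean().getSystemLoadAverage();
        System.out.println("load: " + (load < 0 ? "not available" : load));

        // findDeadlockedThreads() returns null when no threads are deadlocked
        long[] ids = ManagementFactory.getThreadMXBean().findDeadlockedThreads();
        System.out.println("deadlocked threads: " + (ids == null ? 0 : ids.length));
    }
}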
@ -267,7 +267,7 @@ public class Balancer {
* @throws IOException
* @throws SpaceExceededException
*/
public String push(final Request entry, final RobotsTxt robots) throws IOException, SpaceExceededException {
public String push(final Request entry, CrawlProfile profile, final RobotsTxt robots) throws IOException, SpaceExceededException {
assert entry != null;
final byte[] hash = entry.url().hash();
synchronized (this) {
@ -278,6 +278,11 @@ public class Balancer {
if (this.double_push_check.size() > MAX_DOUBLE_PUSH_CHECK || MemoryControl.shortStatus()) this.double_push_check.clear();
this.double_push_check.put(hash);
// increase dom counter
if (profile != null && profile.domMaxPages() != Integer.MAX_VALUE && profile.domMaxPages() > 0) {
profile.domInc(entry.url().getHost());
}
// add to index
final int s = this.urlFileIndex.size();
this.urlFileIndex.put(entry.toRow());

@ -149,7 +149,8 @@ public final class CrawlStacker {
// if the url was rejected we store it into the error URL db
if (rejectReason != null) {
this.nextQueue.errorURL.push(entry, ASCII.getBytes(this.peers.mySeed().hash), new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, rejectReason, -1);
final CrawlProfile profile = this.crawler.getActive(UTF8.getBytes(entry.profileHandle()));
this.nextQueue.errorURL.push(entry, profile, ASCII.getBytes(this.peers.mySeed().hash), new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, rejectReason, -1);
}
} catch (final Exception e) {
CrawlStacker.this.log.warn("Error while processing stackCrawl entry.\n" + "Entry: " + entry.toString() + "Error: " + e.toString(), e);
@ -341,30 +342,25 @@ public final class CrawlStacker {
entry.url().getContentDomain() == ContentDomain.AUDIO ||
entry.url().getContentDomain() == ContentDomain.VIDEO ||
entry.url().getContentDomain() == ContentDomain.CTRL) {
warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.NOLOAD, entry, this.robots);
warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.NOLOAD, entry, profile, this.robots);
//if (warning != null && this.log.isFine()) this.log.logFine("CrawlStacker.stackCrawl of URL " + entry.url().toNormalform(true, false) + " - not pushed: " + warning);
return null;
}
// add domain to profile domain list
if (profile.domMaxPages() != Integer.MAX_VALUE && profile.domMaxPages() > 0) {
profile.domInc(entry.url().getHost());
}
if (global) {
// it may be possible that global == true and local == true, so do not check an error case against it
if (proxy) this.log.warn("URL '" + entry.url().toString() + "' has conflicting initiator properties: global = true, proxy = true, initiator = proxy" + ", profile.handle = " + profile.handle());
if (remote) this.log.warn("URL '" + entry.url().toString() + "' has conflicting initiator properties: global = true, remote = true, initiator = " + ASCII.String(entry.initiator()) + ", profile.handle = " + profile.handle());
warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.GLOBAL, entry, this.robots);
warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.GLOBAL, entry, profile, this.robots);
} else if (local) {
if (proxy) this.log.warn("URL '" + entry.url().toString() + "' has conflicting initiator properties: local = true, proxy = true, initiator = proxy" + ", profile.handle = " + profile.handle());
if (remote) this.log.warn("URL '" + entry.url().toString() + "' has conflicting initiator properties: local = true, remote = true, initiator = " + ASCII.String(entry.initiator()) + ", profile.handle = " + profile.handle());
warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.LOCAL, entry, this.robots);
warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.LOCAL, entry, profile, this.robots);
} else if (proxy) {
if (remote) this.log.warn("URL '" + entry.url().toString() + "' has conflicting initiator properties: proxy = true, remote = true, initiator = " + ASCII.String(entry.initiator()) + ", profile.handle = " + profile.handle());
warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.LOCAL, entry, this.robots);
warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.LOCAL, entry, profile, this.robots);
} else if (remote) {
warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.REMOTE, entry, this.robots);
warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.REMOTE, entry, profile, this.robots);
}
if (warning != null && this.log.isFine()) this.log.fine("CrawlStacker.stackCrawl of URL " + entry.url().toNormalform(true) + " - not pushed: " + warning);

@ -63,7 +63,6 @@ import net.yacy.search.SwitchboardConstants;
public class CrawlQueues {
public static final long queuedMinLoadDelay = 500;
private static final String ERROR_DB_FILENAME = "urlError4.db";
private static final String DELEGATED_DB_FILENAME = "urlDelegated4.db";
@ -588,6 +587,7 @@ public class CrawlQueues {
+ this.noticeURL.stackSize(NoticedURL.StackType.REMOTE) + "]";
try {
final Request urlEntry = this.noticeURL.pop(NoticedURL.StackType.REMOTE, true, this.sb.crawler, this.sb.robots);
if (urlEntry == null) return false;
final String profileHandle = urlEntry.profileHandle();
// System.out.println("DEBUG plasmaSwitchboard.processCrawling:
// profileHandle = " + profileHandle + ", urlEntry.url = " +
@ -612,6 +612,7 @@ public class CrawlQueues {
private Request request;
private final Integer code;
private final long start;
private final CrawlProfile profile;
private Loader(final Request entry) {
this.start = System.currentTimeMillis();
@ -619,6 +620,7 @@ public class CrawlQueues {
this.request.setStatus("worker-initialized", WorkflowJob.STATUS_INITIATED);
this.code = Integer.valueOf(entry.hashCode());
this.setPriority(Thread.MIN_PRIORITY); // http requests from the crawler should not cause that other functions work worse
this.profile = CrawlQueues.this.sb.crawler.getActive(UTF8.getBytes(this.request.profileHandle()));
}
private long age() {
@ -637,6 +639,7 @@ public class CrawlQueues {
//if (log.isFine()) log.logFine("Crawling of URL '" + request.url().toString() + "' disallowed by robots.txt.");
CrawlQueues.this.errorURL.push(
this.request,
profile,
ASCII.getBytes(CrawlQueues.this.sb.peers.mySeed().hash),
new Date(),
1,
@ -652,8 +655,7 @@ public class CrawlQueues {
// returns null if everything went fine, a fail reason string if a problem occurred
try {
this.request.setStatus("loading", WorkflowJob.STATUS_RUNNING);
final CrawlProfile e = CrawlQueues.this.sb.crawler.getActive(UTF8.getBytes(this.request.profileHandle()));
final Response response = CrawlQueues.this.sb.loader.load(this.request, e == null ? CacheStrategy.IFEXIST : e.cacheStrategy(), BlacklistType.CRAWLER, queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
final Response response = CrawlQueues.this.sb.loader.load(this.request, profile == null ? CacheStrategy.IFEXIST : profile.cacheStrategy(), BlacklistType.CRAWLER, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
if (response == null) {
this.request.setStatus("error", WorkflowJob.STATUS_FINISHED);
if (CrawlQueues.this.log.isFine()) {
@ -677,6 +679,7 @@ public class CrawlQueues {
if (result != null) {
CrawlQueues.this.errorURL.push(
this.request,
profile,
ASCII.getBytes(CrawlQueues.this.sb.peers.mySeed().hash),
new Date(),
1,
@ -690,6 +693,7 @@ public class CrawlQueues {
} catch (final Exception e) {
CrawlQueues.this.errorURL.push(
this.request,
profile,
ASCII.getBytes(CrawlQueues.this.sb.peers.mySeed().hash),
new Date(),
1,

@ -39,6 +39,7 @@ import net.yacy.cora.order.Base64Order;
import net.yacy.cora.storage.HandleSet;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.crawler.Balancer;
import net.yacy.crawler.CrawlSwitchboard;
import net.yacy.crawler.retrieval.Request;
@ -51,9 +52,6 @@ public class NoticedURL {
LOCAL, GLOBAL, REMOTE, NOLOAD;
}
private static final int minimumLocalDeltaInit = 10; // the minimum time difference between access of the same local domain
public static final int minimumGlobalDeltaInit = 500; // the minimum time difference between access of the same global domain
private Balancer coreStack; // links found by crawling to depth-1
private Balancer limitStack; // links found by crawling at target depth
private Balancer remoteStack; // links from remote crawl orders
@ -65,11 +63,11 @@ public class NoticedURL {
final boolean useTailCache,
final boolean exceed134217727) {
ConcurrentLog.info("NoticedURL", "CREATING STACKS at " + cachePath.toString());
this.coreStack = new Balancer(cachePath, "urlNoticeCoreStack", minimumLocalDeltaInit, minimumGlobalDeltaInit, myAgentIDs, useTailCache, exceed134217727);
this.limitStack = new Balancer(cachePath, "urlNoticeLimitStack", minimumLocalDeltaInit, minimumGlobalDeltaInit, myAgentIDs, useTailCache, exceed134217727);
this.coreStack = new Balancer(cachePath, "urlNoticeCoreStack", ClientIdentification.minimumLocalDeltaInit, ClientIdentification.minimumGlobalDeltaInit, myAgentIDs, useTailCache, exceed134217727);
this.limitStack = new Balancer(cachePath, "urlNoticeLimitStack", ClientIdentification.minimumLocalDeltaInit, ClientIdentification.minimumGlobalDeltaInit, myAgentIDs, useTailCache, exceed134217727);
//overhangStack = new plasmaCrawlBalancer(overhangStackFile);
this.remoteStack = new Balancer(cachePath, "urlNoticeRemoteStack", minimumLocalDeltaInit, minimumGlobalDeltaInit, myAgentIDs, useTailCache, exceed134217727);
this.noloadStack = new Balancer(cachePath, "urlNoticeNoLoadStack", minimumLocalDeltaInit, minimumGlobalDeltaInit, myAgentIDs, useTailCache, exceed134217727);
this.remoteStack = new Balancer(cachePath, "urlNoticeRemoteStack", ClientIdentification.minimumLocalDeltaInit, ClientIdentification.minimumGlobalDeltaInit, myAgentIDs, useTailCache, exceed134217727);
this.noloadStack = new Balancer(cachePath, "urlNoticeNoLoadStack", ClientIdentification.minimumLocalDeltaInit, ClientIdentification.minimumGlobalDeltaInit, myAgentIDs, useTailCache, exceed134217727);
}
public int getMinimumLocalDelta() {
@ -172,13 +170,13 @@ public class NoticedURL {
* @param entry
* @return null if this was successful or a String explaining what went wrong in case of an error
*/
public String push(final StackType stackType, final Request entry, final RobotsTxt robots) {
public String push(final StackType stackType, final Request entry, CrawlProfile profile, final RobotsTxt robots) {
try {
switch (stackType) {
case LOCAL: return this.coreStack.push(entry, robots);
case GLOBAL: return this.limitStack.push(entry, robots);
case REMOTE: return this.remoteStack.push(entry, robots);
case NOLOAD: return this.noloadStack.push(entry, robots);
case LOCAL: return this.coreStack.push(entry, profile, robots);
case GLOBAL: return this.limitStack.push(entry, profile, robots);
case REMOTE: return this.remoteStack.push(entry, profile, robots);
case NOLOAD: return this.noloadStack.push(entry, profile, robots);
default: return "stack type unknown";
}
} catch (final Exception er) {
@ -271,7 +269,7 @@ public class NoticedURL {
try {
final Request entry = pop(fromStack, false, cs, robots);
if (entry != null) {
final String warning = push(toStack, entry, robots);
final String warning = push(toStack, entry, null, robots);
if (warning != null) {
ConcurrentLog.warn("NoticedURL", "shift from " + fromStack + " to " + toStack + ": " + warning);
}

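The CrawlProfile added to push() here is handed straight to the per-stack Balancer, which uses it to increase the per-domain page counter (see the Balancer hunk above); the stack-shift path passes null because no profile is known at that point. A simplified, compilable sketch of the dispatch pattern, with stand-in types instead of YaCy's:

enum StackType { LOCAL, GLOBAL, REMOTE, NOLOAD }

class Balancer {
    // returns null on success, an explanation string otherwise
    String push(Object entry, Object profile) { return null; }
}

public class PushDispatchSketch {
    private final Balancer coreStack = new Balancer();
    private final Balancer limitStack = new Balancer();
    private final Balancer remoteStack = new Balancer();
    private final Balancer noloadStack = new Balancer();

    public String push(StackType type, Object entry, Object profile) {
        switch (type) {
            case LOCAL:  return this.coreStack.push(entry, profile);
            case GLOBAL: return this.limitStack.push(entry, profile);
            case REMOTE: return this.remoteStack.push(entry, profile);
            case NOLOAD: return this.noloadStack.push(entry, profile);
            default:     return "stack type unknown";
        }
    }
}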
@ -172,6 +172,7 @@ public class ZURL implements Iterable<ZURL.Entry> {
public void push(
final Request bentry,
final CrawlProfile profile,
final byte[] executor,
final Date workdate,
final int workcount,
@ -190,7 +191,7 @@ public class ZURL implements Iterable<ZURL.Entry> {
if (this.fulltext.getDefaultConnector() != null && failCategory.store) {
// send the error to solr
try {
SolrInputDocument errorDoc = this.fulltext.getDefaultConfiguration().err(bentry.url(), failCategory.name() + " " + reason, failCategory.failType, httpcode);
SolrInputDocument errorDoc = this.fulltext.getDefaultConfiguration().err(bentry.url(), profile == null ? null : profile.collections(), failCategory.name() + " " + reason, failCategory.failType, httpcode);
this.fulltext.getDefaultConnector().add(errorDoc);
} catch (final IOException e) {
ConcurrentLog.warn("SOLR", "failed to send error " + bentry.url().toNormalform(true) + " to solr: " + e.getMessage());

@ -101,6 +101,7 @@ public class FTPLoader {
// create new ftp client
final FTPClient ftpClient = new FTPClient();
final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(request.profileHandle()));
// get a connection
if (openConnection(ftpClient, entryUrl)) {
// test if the specified file is a directory
@ -130,7 +131,6 @@ public class FTPLoader {
final ResponseHeader responseHeader = new ResponseHeader(200);
responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date()));
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html");
final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(request.profileHandle()));
response = new Response(
request,
requestHeader,
@ -156,7 +156,7 @@ public class FTPLoader {
if (berr.size() > 0 || response == null) {
// some error logging
final String detail = (berr.size() > 0) ? "Errorlog: " + berr.toString() : "";
this.sb.crawlQueues.errorURL.push(request, ASCII.getBytes(this.sb.peers.mySeed().hash), new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, " ftp server download, " + detail, -1);
this.sb.crawlQueues.errorURL.push(request, profile, ASCII.getBytes(this.sb.peers.mySeed().hash), new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, " ftp server download, " + detail, -1);
throw new IOException("FTPLoader: Unable to download URL '" + request.url().toString() + "': " + detail);
}

@ -70,20 +70,20 @@ public final class HTTPLoader {
this.socketTimeout = (int) sb.getConfigLong("crawler.clientTimeout", 30000);
}
public Response load(final Request entry, final int maxFileSize, final BlacklistType blacklistType, int timeout) throws IOException {
public Response load(final Request entry, CrawlProfile profile, final int maxFileSize, final BlacklistType blacklistType, int timeout) throws IOException {
Latency.updateBeforeLoad(entry.url());
final long start = System.currentTimeMillis();
final Response doc = load(entry, DEFAULT_CRAWLING_RETRY_COUNT, maxFileSize, blacklistType, timeout);
final Response doc = load(entry, profile, DEFAULT_CRAWLING_RETRY_COUNT, maxFileSize, blacklistType, timeout);
Latency.updateAfterLoad(entry.url(), System.currentTimeMillis() - start);
return doc;
}
private Response load(final Request request, final int retryCount, final int maxFileSize, final BlacklistType blacklistType, int timeout) throws IOException {
private Response load(final Request request, CrawlProfile profile, final int retryCount, final int maxFileSize, final BlacklistType blacklistType, int timeout) throws IOException {
byte[] myHash = ASCII.getBytes(this.sb.peers.mySeed().hash);
if (retryCount < 0) {
this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "retry counter exceeded", -1);
this.sb.crawlQueues.errorURL.push(request, profile, myHash, new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "retry counter exceeded", -1);
throw new IOException("retry counter exceeded for URL " + request.url().toString() + ". Processing aborted.");
}
@ -99,7 +99,7 @@ public final class HTTPLoader {
// check if url is in blacklist
final String hostlow = host.toLowerCase();
if (blacklistType != null && Switchboard.urlBlacklist.isListed(blacklistType, hostlow, path)) {
this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
this.sb.crawlQueues.errorURL.push(request, profile, myHash, new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
throw new IOException("CRAWLER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.");
}
@ -146,7 +146,7 @@ public final class HTTPLoader {
redirectionUrlString = redirectionUrlString == null ? "" : redirectionUrlString.trim();
if (redirectionUrlString.isEmpty()) {
this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "no redirection url provided, field '" + HeaderFramework.LOCATION + "' is empty", statusCode);
this.sb.crawlQueues.errorURL.push(request, profile, myHash, new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "no redirection url provided, field '" + HeaderFramework.LOCATION + "' is empty", statusCode);
throw new IOException("REJECTED EMTPY REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL " + requestURLString);
}
@ -160,32 +160,32 @@ public final class HTTPLoader {
this.sb.webStructure.generateCitationReference(url, redirectionUrl);
if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_RECORD_REDIRECTS, true)) {
this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.FINAL_REDIRECT_RULE, "redirect to " + redirectionUrlString, statusCode);
this.sb.crawlQueues.errorURL.push(request, profile, myHash, new Date(), 1, FailCategory.FINAL_REDIRECT_RULE, "redirect to " + redirectionUrlString, statusCode);
}
if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_FOLLOW_REDIRECTS, true)) {
// if we are already doing a shutdown we don't need to retry crawling
if (Thread.currentThread().isInterrupted()) {
this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, "server shutdown", statusCode);
this.sb.crawlQueues.errorURL.push(request, profile, myHash, new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, "server shutdown", statusCode);
throw new IOException("CRAWLER Retry of URL=" + requestURLString + " aborted because of server shutdown.");
}
// check if the url was already loaded
if (Cache.has(redirectionUrl.hash())) { // customer request
this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "redirection to double content", statusCode);
this.sb.crawlQueues.errorURL.push(request, profile, myHash, new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "redirection to double content", statusCode);
throw new IOException("CRAWLER Redirection of URL=" + requestURLString + " ignored. The url appears already in htcache");
}
// retry crawling with new url
request.redirectURL(redirectionUrl);
return load(request, retryCount - 1, maxFileSize, blacklistType, timeout);
return load(request, profile, retryCount - 1, maxFileSize, blacklistType, timeout);
}
// we don't want to follow redirects
this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.FINAL_PROCESS_CONTEXT, "redirection not wanted", statusCode);
this.sb.crawlQueues.errorURL.push(request, profile, myHash, new Date(), 1, FailCategory.FINAL_PROCESS_CONTEXT, "redirection not wanted", statusCode);
throw new IOException("REJECTED UNWANTED REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL " + requestURLString);
} else if (responseBody == null) {
// no response, reject file
this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "no response body", statusCode);
this.sb.crawlQueues.errorURL.push(request, profile, myHash, new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "no response body", statusCode);
throw new IOException("REJECTED EMPTY RESPONSE BODY '" + client.getHttpResponse().getStatusLine() + "' for URL " + requestURLString);
} else if (statusCode == 200 || statusCode == 203) {
// the transfer is ok
@ -196,12 +196,11 @@ public final class HTTPLoader {
// check length again in case it was not possible to get the length before loading
if (maxFileSize >= 0 && contentLength > maxFileSize) {
this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.FINAL_PROCESS_CONTEXT, "file size limit exceeded", statusCode);
this.sb.crawlQueues.errorURL.push(request, profile, myHash, new Date(), 1, FailCategory.FINAL_PROCESS_CONTEXT, "file size limit exceeded", statusCode);
throw new IOException("REJECTED URL " + request.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes. (GET)");
}
// create a new cache entry
final CrawlProfile profile = request.profileHandle() == null ? null : this.sb.crawler.getActive(ASCII.getBytes(request.profileHandle()));
response = new Response(
request,
requestHeader,
@ -214,7 +213,7 @@ public final class HTTPLoader {
return response;
} else {
// if the response has not the right response type then reject file
this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "wrong http status code", statusCode);
this.sb.crawlQueues.errorURL.push(request, profile, myHash, new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "wrong http status code", statusCode);
throw new IOException("REJECTED WRONG STATUS TYPE '" + client.getHttpResponse().getStatusLine() + "' for URL " + requestURLString);
}
}

Some files were not shown because too many files have changed in this diff.
