Improved crawl start response time

for very large crawl start lists
pull/436/head
Michael Peter Christen 3 years ago
parent 1bab4ffe20
commit ef5a71a592

@ -56,12 +56,10 @@ import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.SpaceExceededException; import net.yacy.cora.util.SpaceExceededException;
import net.yacy.crawler.CrawlSwitchboard; import net.yacy.crawler.CrawlSwitchboard;
import net.yacy.crawler.FileCrawlStarterTask; import net.yacy.crawler.FileCrawlStarterTask;
import net.yacy.crawler.data.Cache;
import net.yacy.crawler.data.CrawlProfile; import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.CrawlProfile.CrawlAttribute; import net.yacy.crawler.data.CrawlProfile.CrawlAttribute;
import net.yacy.crawler.data.NoticedURL.StackType; import net.yacy.crawler.data.NoticedURL.StackType;
import net.yacy.crawler.retrieval.SitemapImporter; import net.yacy.crawler.retrieval.SitemapImporter;
import net.yacy.crawler.robots.RobotsTxt;
import net.yacy.data.WorkTables; import net.yacy.data.WorkTables;
import net.yacy.document.Document; import net.yacy.document.Document;
import net.yacy.document.VocabularyScraper; import net.yacy.document.VocabularyScraper;
@ -265,7 +263,7 @@ public class Crawler_p {
final String sitemapURLStr = post.get("sitemapURL",""); final String sitemapURLStr = post.get("sitemapURL","");
final String crawlingStart0 = post.get("crawlingURL","").trim(); // the crawljob start url final String crawlingStart0 = post.get("crawlingURL","").trim(); // the crawljob start url
final String[] rootURLs0 = crawlingStart0.indexOf('\n') > 0 || crawlingStart0.indexOf('\r') > 0 ? crawlingStart0.split("[\\r\\n]+") : crawlingStart0.split(Pattern.quote("|")); final String[] rootURLs0 = crawlingStart0.indexOf('\n') > 0 || crawlingStart0.indexOf('\r') > 0 ? crawlingStart0.split("[\\r\\n]+") : crawlingStart0.split(Pattern.quote("|"));
Set<DigestURL> rootURLs = new HashSet<>(); final List<DigestURL> rootURLs = new ArrayList<>();
String crawlName = ""; String crawlName = "";
if (crawlingFile == null) { if (crawlingFile == null) {
final StringBuilder crawlNameBuilder = new StringBuilder(); // for large crawl queues this can be pretty large final StringBuilder crawlNameBuilder = new StringBuilder(); // for large crawl queues this can be pretty large
@ -301,17 +299,6 @@ public class Crawler_p {
for (final DigestURL u: rootURLs) if (u.isFile()) {fullDomain = false; subPath = true; break;} for (final DigestURL u: rootURLs) if (u.isFile()) {fullDomain = false; subPath = true; break;}
} }
// delete old robots entries
for (final DigestURL ru : rootURLs) {
sb.robots.delete(ru);
try {
if (ru.getHost() != null) { // might be null for file://
Cache.delete(RobotsTxt.robotsURL(RobotsTxt.getHostPort(ru)).hash());
}
} catch (final IOException e) {}
}
try {sb.robots.clear();} catch (final IOException e) {} // to be safe: clear all.
// set the crawl filter // set the crawl filter
String ipMustMatch = post.get("ipMustmatch", CrawlProfile.MATCH_ALL_STRING); String ipMustMatch = post.get("ipMustmatch", CrawlProfile.MATCH_ALL_STRING);
final String ipMustNotMatch = post.get("ipMustnotmatch", CrawlProfile.MATCH_NEVER_STRING); final String ipMustNotMatch = post.get("ipMustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
@ -398,7 +385,7 @@ public class Crawler_p {
if ("sitelist".equals(crawlingMode)) { if ("sitelist".equals(crawlingMode)) {
newcrawlingMustNotMatch = CrawlProfile.MATCH_NEVER_STRING; newcrawlingMustNotMatch = CrawlProfile.MATCH_NEVER_STRING;
final Set<DigestURL> newRootURLs = new HashSet<>(); final List<DigestURL> newRootURLs = new ArrayList<>();
for (final DigestURL sitelistURL: rootURLs) { for (final DigestURL sitelistURL: rootURLs) {
// download document // download document
Document scraper; Document scraper;
@ -412,7 +399,8 @@ public class Crawler_p {
ConcurrentLog.logException(e); ConcurrentLog.logException(e);
} }
} }
rootURLs = newRootURLs; rootURLs.clear();
rootURLs.addAll(newRootURLs);
crawlingMode = "url"; crawlingMode = "url";
if ((fullDomain || subPath) && newcrawlingdepth > 0) newcrawlingMustMatch = CrawlProfile.MATCH_ALL_STRING; // to prevent that there is a restriction on the original urls if ((fullDomain || subPath) && newcrawlingdepth > 0) newcrawlingMustMatch = CrawlProfile.MATCH_ALL_STRING; // to prevent that there is a restriction on the original urls
} }
@ -440,7 +428,7 @@ public class Crawler_p {
if (fullDomain) { if (fullDomain) {
siteFilter = CrawlProfile.siteFilter(rootURLs); siteFilter = CrawlProfile.siteFilter(rootURLs);
if (deleteold) { if (deleteold) {
sb.index.fulltext().deleteStaleDomainHashes(hosthashes, deleteageDate); sb.index.fulltext().deleteStaleDomainHashes(hosthashes, deleteageDate); // takes long time for long lists
} }
} else if (subPath) { } else if (subPath) {
siteFilter = CrawlProfile.subpathFilter(rootURLs); siteFilter = CrawlProfile.subpathFilter(rootURLs);

@ -448,7 +448,7 @@ public final class Switchboard extends serverSwitch {
if (t != null) { if (t != null) {
t.setFacet(false); t.setFacet(false);
} else { } else {
log.config("search.result.show.vocabulary.omit configuration value contains an unknown vocabulary name : " + o); Switchboard.this.log.config("search.result.show.vocabulary.omit configuration value contains an unknown vocabulary name : " + o);
} }
} }
@ -459,7 +459,7 @@ public final class Switchboard extends serverSwitch {
if (t != null) { if (t != null) {
t.setMatchFromLinkedData(true); t.setMatchFromLinkedData(true);
} else { } else {
log.config(SwitchboardConstants.VOCABULARIES_MATCH_LINKED_DATA_NAMES Switchboard.this.log.config(SwitchboardConstants.VOCABULARIES_MATCH_LINKED_DATA_NAMES
+ " configuration value contains an unknown vocabulary name : " + vocName); + " configuration value contains an unknown vocabulary name : " + vocName);
} }
} }
@ -470,7 +470,7 @@ public final class Switchboard extends serverSwitch {
}.start(); }.start();
// define the "non-password password" // define the "non-password password"
emptyPasswordAdminAccount = encodeDigestAuth(getConfig(SwitchboardConstants.ADMIN_ACCOUNT_USER_NAME,"admin"), ""); this.emptyPasswordAdminAccount = encodeDigestAuth(getConfig(SwitchboardConstants.ADMIN_ACCOUNT_USER_NAME,"admin"), "");
// init the language detector // init the language detector
this.log.config("Loading language profiles"); this.log.config("Loading language profiles");
@ -663,9 +663,9 @@ public final class Switchboard extends serverSwitch {
join.getMulticastConfig().setEnabled(true); join.getMulticastConfig().setEnabled(true);
Config config = new Config().setClusterName("YaCyP2P").setInstanceName("Peer").setNetworkConfig(networkConfig); Config config = new Config().setClusterName("YaCyP2P").setInstanceName("Peer").setNetworkConfig(networkConfig);
config.getCPSubsystemConfig().setCPMemberCount(3); config.getCPSubsystemConfig().setCPMemberCount(3);
localcluster_hazelcast = Hazelcast.newHazelcastInstance(config); this.localcluster_hazelcast = Hazelcast.newHazelcastInstance(config);
String uuid = localcluster_hazelcast.getCluster().getLocalMember().getUuid().toString(); String uuid = this.localcluster_hazelcast.getCluster().getLocalMember().getUuid().toString();
localcluster_hazelcast.getMap("status").put(uuid, Memory.status()); this.localcluster_hazelcast.getMap("status").put(uuid, Memory.status());
// load domainList // load domainList
try { try {
@ -923,7 +923,7 @@ public final class Switchboard extends serverSwitch {
this.log.config("Parser: Initializing Mime Type deny list"); this.log.config("Parser: Initializing Mime Type deny list");
final boolean enableAudioTags = getConfigBool("parser.enableAudioTags", false); final boolean enableAudioTags = getConfigBool("parser.enableAudioTags", false);
log.config("Parser: parser.enableAudioTags= "+enableAudioTags); this.log.config("Parser: parser.enableAudioTags= "+enableAudioTags);
final Set<String> denyExt = getConfigSet(SwitchboardConstants.PARSER_EXTENSIONS_DENY); final Set<String> denyExt = getConfigSet(SwitchboardConstants.PARSER_EXTENSIONS_DENY);
final Set<String> denyMime = getConfigSet(SwitchboardConstants.PARSER_MIME_DENY); final Set<String> denyMime = getConfigSet(SwitchboardConstants.PARSER_MIME_DENY);
@ -1287,7 +1287,7 @@ public final class Switchboard extends serverSwitch {
"720_ccimport", "720_ccimport",
"Content Control Import", "Content Control Import",
"this is the content control import thread", "this is the content control import thread",
null, null,
InstantBusyThread.createFromRunnable( InstantBusyThread.createFromRunnable(
new SMWListSyncThread(this, sb.getConfig("contentcontrol.bookmarklist", "contentcontrol"), new SMWListSyncThread(this, sb.getConfig("contentcontrol.bookmarklist", "contentcontrol"),
"Category:Content Source", "/?Url/?Filter/?Category/?Modification date", "Category:Content Source", "/?Url/?Filter/?Category/?Modification date",
@ -2197,7 +2197,7 @@ public final class Switchboard extends serverSwitch {
} finally { } finally {
moved = infile.renameTo(outfile); moved = infile.renameTo(outfile);
if (zis != null) try {zis.close();} catch (final IOException e) { if (zis != null) try {zis.close();} catch (final IOException e) {
log.warn("Could not close zip input stream on file " + infile); this.log.warn("Could not close zip input stream on file " + infile);
} }
} }
return moved; return moved;
@ -2212,7 +2212,7 @@ public final class Switchboard extends serverSwitch {
} }
moved = infile.renameTo(outfile); moved = infile.renameTo(outfile);
} catch (IOException ex) { } catch (IOException ex) {
log.warn("IO Error processing warc file " + infile); this.log.warn("IO Error processing warc file " + infile);
} }
return moved; return moved;
} else if (s.endsWith(".jsonlist") || s.endsWith(".flatjson")) { } else if (s.endsWith(".jsonlist") || s.endsWith(".flatjson")) {
@ -2236,7 +2236,7 @@ public final class Switchboard extends serverSwitch {
try ( try (
/* Resources automatically closed by this try-with-resources statement */ /* Resources automatically closed by this try-with-resources statement */
final FileOutputStream fileOutStream = new FileOutputStream(gzfile); final FileOutputStream fileOutStream = new FileOutputStream(gzfile);
final OutputStream os = new BufferedOutputStream(new GZIPOutputStream(fileOutStream, 65536){{def.setLevel(Deflater.BEST_COMPRESSION);}}); final OutputStream os = new BufferedOutputStream(new GZIPOutputStream(fileOutStream, 65536){{this.def.setLevel(Deflater.BEST_COMPRESSION);}});
final FileInputStream fileInStream = new FileInputStream(outfile); final FileInputStream fileInStream = new FileInputStream(outfile);
final BufferedInputStream bis = new BufferedInputStream(fileInStream); final BufferedInputStream bis = new BufferedInputStream(fileInStream);
) { ) {
@ -2251,11 +2251,11 @@ public final class Switchboard extends serverSwitch {
ConcurrentLog.logException(e); ConcurrentLog.logException(e);
} }
} }
log.info("processed surrogate " + infile); this.log.info("processed surrogate " + infile);
} }
} }
if (is != null) try {is.close();} catch (IOException e) { if (is != null) try {is.close();} catch (IOException e) {
log.warn("Could not close input stream on file " + infile); this.log.warn("Could not close input stream on file " + infile);
} }
} }
return moved; return moved;
@ -2264,7 +2264,7 @@ public final class Switchboard extends serverSwitch {
private boolean processSurrogateJson(File infile, File outfile) { private boolean processSurrogateJson(File infile, File outfile) {
// parse a file that can be generated with yacy_grid_parser // parse a file that can be generated with yacy_grid_parser
// see https://github.com/yacy/yacy_grid_parser/blob/master/README.md // see https://github.com/yacy/yacy_grid_parser/blob/master/README.md
log.info("processing json surrogate " + infile); this.log.info("processing json surrogate " + infile);
long starttime = System.currentTimeMillis(); long starttime = System.currentTimeMillis();
boolean moved = false; boolean moved = false;
@ -2409,7 +2409,7 @@ public final class Switchboard extends serverSwitch {
moved = infile.renameTo(outfile); moved = infile.renameTo(outfile);
} catch (IOException | JSONException ex) { } catch (IOException | JSONException ex) {
log.warn("IO Error processing flatjson file " + infile); this.log.warn("IO Error processing flatjson file " + infile);
} finally { } finally {
/* Properly release file system resources even in failure cases */ /* Properly release file system resources even in failure cases */
if(br != null) { if(br != null) {
@ -2417,19 +2417,19 @@ public final class Switchboard extends serverSwitch {
try { try {
br.close(); br.close();
} catch (IOException e) { } catch (IOException e) {
log.warn("Could not close reader on file " + infile); this.log.warn("Could not close reader on file " + infile);
} }
} else if(fis != null) { } else if(fis != null) {
/* no buffered reader : maybe a case of exhausted memory. Anyway file input stream has to be closed. */ /* no buffered reader : maybe a case of exhausted memory. Anyway file input stream has to be closed. */
try { try {
fis.close(); fis.close();
} catch (IOException e) { } catch (IOException e) {
log.warn("Could not close input stream on file " + infile); this.log.warn("Could not close input stream on file " + infile);
} }
} }
} }
log.info("finished processing json surrogate: " + ((System.currentTimeMillis() - starttime) / 1000) + " seconds"); this.log.info("finished processing json surrogate: " + ((System.currentTimeMillis() - starttime) / 1000) + " seconds");
return moved; return moved;
} }
@ -2471,7 +2471,7 @@ public final class Switchboard extends serverSwitch {
} }
/* Update the ResultURLS stack for monitoring */ /* Update the ResultURLS stack for monitoring */
final byte[] myPeerHash = ASCII.getBytes(peers.mySeed().hash); final byte[] myPeerHash = ASCII.getBytes(Switchboard.this.peers.mySeed().hash);
ResultURLs.stack( ResultURLs.stack(
ASCII.String(rootURL.hash()), ASCII.String(rootURL.hash()),
rootURL.getHost(), rootURL.getHost(),
@ -2490,19 +2490,19 @@ public final class Switchboard extends serverSwitch {
final Document document = entry.document(); final Document document = entry.document();
final Request request = final Request request =
new Request( new Request(
ASCII.getBytes(peers.mySeed().hash), ASCII.getBytes(Switchboard.this.peers.mySeed().hash),
entry.getIdentifier(true), entry.getIdentifier(true),
null, null,
"", "",
entry.getDate(), entry.getDate(),
crawler.defaultSurrogateProfile.handle(), Switchboard.this.crawler.defaultSurrogateProfile.handle(),
0, 0,
crawler.defaultSurrogateProfile.timezoneOffset()); Switchboard.this.crawler.defaultSurrogateProfile.timezoneOffset());
final Response response = new Response(request, null, null, crawler.defaultSurrogateProfile, false, null); final Response response = new Response(request, null, null, Switchboard.this.crawler.defaultSurrogateProfile, false, null);
final IndexingQueueEntry queueEntry = final IndexingQueueEntry queueEntry =
new IndexingQueueEntry(response, new Document[] {document}, null); new IndexingQueueEntry(response, new Document[] {document}, null);
indexingCondensementProcessor.enQueue(queueEntry); Switchboard.this.indexingCondensementProcessor.enQueue(queueEntry);
} }
if (shallTerminate()) break; if (shallTerminate()) break;
} }
@ -2652,7 +2652,7 @@ public final class Switchboard extends serverSwitch {
if (!"off".equals(kind)) { if (!"off".equals(kind)) {
String action = row.get(WorkTables.TABLE_API_COL_APICALL_EVENT_ACTION, "startup"); String action = row.get(WorkTables.TABLE_API_COL_APICALL_EVENT_ACTION, "startup");
if ("startup".equals(action)) { if ("startup".equals(action)) {
if (startupAction) { if (this.startupAction) {
pks.add(UTF8.String(row.getPK())); pks.add(UTF8.String(row.getPK()));
if ("once".equals(kind)) { if ("once".equals(kind)) {
row.put(WorkTables.TABLE_API_COL_APICALL_EVENT_KIND, "off"); row.put(WorkTables.TABLE_API_COL_APICALL_EVENT_KIND, "off");
@ -2677,7 +2677,7 @@ public final class Switchboard extends serverSwitch {
} catch (final IOException e) { } catch (final IOException e) {
ConcurrentLog.logException(e); ConcurrentLog.logException(e);
} }
startupAction = false; this.startupAction = false;
// execute api calls // execute api calls
final Map<String, Integer> callResult = this.tables.execAPICalls("localhost", getLocalPort(), pks, getConfig(SwitchboardConstants.ADMIN_ACCOUNT_USER_NAME, "admin"), getConfig(SwitchboardConstants.ADMIN_ACCOUNT_B64MD5, "")); final Map<String, Integer> callResult = this.tables.execAPICalls("localhost", getLocalPort(), pks, getConfig(SwitchboardConstants.ADMIN_ACCOUNT_USER_NAME, "admin"), getConfig(SwitchboardConstants.ADMIN_ACCOUNT_B64MD5, ""));
@ -2711,13 +2711,13 @@ public final class Switchboard extends serverSwitch {
// write a thread dump to log path // write a thread dump to log path
try { try {
File tdlog = new File(dataPath, "DATA/LOG/threaddump.txt"); File tdlog = new File(this.dataPath, "DATA/LOG/threaddump.txt");
PrintWriter out = new PrintWriter(tdlog); PrintWriter out = new PrintWriter(tdlog);
String threaddump = ThreadDump.threaddump(this, true, 0, false, 0); String threaddump = ThreadDump.threaddump(this, true, 0, false, 0);
out.println(threaddump); out.println(threaddump);
out.close(); out.close();
} catch (IOException e) { } catch (IOException e) {
log.info("cannot write threaddump", e); this.log.info("cannot write threaddump", e);
} }
// clear caches if necessary // clear caches if necessary
@ -2733,7 +2733,7 @@ public final class Switchboard extends serverSwitch {
long cs = this.index.fulltext().collectionSize(); long cs = this.index.fulltext().collectionSize();
if (cs > getConfigInt(SwitchboardConstants.GREEDYLEARNING_LIMIT_DOCCOUNT, 0)) { if (cs > getConfigInt(SwitchboardConstants.GREEDYLEARNING_LIMIT_DOCCOUNT, 0)) {
setConfig(SwitchboardConstants.GREEDYLEARNING_ACTIVE, false); setConfig(SwitchboardConstants.GREEDYLEARNING_ACTIVE, false);
log.info("finishing greedy learning phase, size=" +cs); this.log.info("finishing greedy learning phase, size=" +cs);
} }
} }
@ -2926,7 +2926,7 @@ public final class Switchboard extends serverSwitch {
try { try {
fileIn.close(); fileIn.close();
} catch (final Exception e ) { } catch (final Exception e ) {
log.warn("Could not close input stream on file " + profileFile); this.log.warn("Could not close input stream on file " + profileFile);
} }
} }
} }
@ -2960,19 +2960,19 @@ public final class Switchboard extends serverSwitch {
int proccount = 0; int proccount = 0;
if (!this.crawlJobIsPaused(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL)) { if (!this.crawlJobIsPaused(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL)) {
Fulltext fulltext = index.fulltext(); Fulltext fulltext = this.index.fulltext();
CollectionConfiguration collection1Configuration = fulltext.getDefaultConfiguration(); CollectionConfiguration collection1Configuration = fulltext.getDefaultConfiguration();
boolean process_key_exist = collection1Configuration.contains(CollectionSchema.process_sxt); boolean process_key_exist = collection1Configuration.contains(CollectionSchema.process_sxt);
if (!process_key_exist) log.info("postprocessing deactivated: field process_sxt is not enabled"); if (!process_key_exist) this.log.info("postprocessing deactivated: field process_sxt is not enabled");
boolean reference_index_exist = (index.connectedCitation() || fulltext.useWebgraph()); boolean reference_index_exist = (this.index.connectedCitation() || fulltext.useWebgraph());
if (!reference_index_exist) log.info("postprocessing deactivated: no reference index avilable; activate citation index or webgraph"); if (!reference_index_exist) this.log.info("postprocessing deactivated: no reference index avilable; activate citation index or webgraph");
boolean minimum_ram_fullfilled = MemoryControl.available() > getConfigLong("postprocessing.minimum_ram", 0); boolean minimum_ram_fullfilled = MemoryControl.available() > getConfigLong("postprocessing.minimum_ram", 0);
if (!minimum_ram_fullfilled) log.info("postprocessing deactivated: no enough ram (" + MemoryControl.available() + "), needed " + getConfigLong("postprocessing.minimum_ram", 0) + ", to force change field postprocessing.minimum_ram"); if (!minimum_ram_fullfilled) this.log.info("postprocessing deactivated: no enough ram (" + MemoryControl.available() + "), needed " + getConfigLong("postprocessing.minimum_ram", 0) + ", to force change field postprocessing.minimum_ram");
boolean minimum_load_fullfilled = Memory.getSystemLoadAverage() < getConfigFloat("postprocessing.maximum_load", 0); boolean minimum_load_fullfilled = Memory.getSystemLoadAverage() < getConfigFloat("postprocessing.maximum_load", 0);
if (!minimum_load_fullfilled) log.info("postprocessing deactivated: too high load (" + Memory.getSystemLoadAverage() + ") > " + getConfigFloat("postprocessing.maximum_load", 0) + ", to force change field postprocessing.maximum_load"); if (!minimum_load_fullfilled) this.log.info("postprocessing deactivated: too high load (" + Memory.getSystemLoadAverage() + ") > " + getConfigFloat("postprocessing.maximum_load", 0) + ", to force change field postprocessing.maximum_load");
boolean postprocessing = process_key_exist && reference_index_exist && minimum_ram_fullfilled && minimum_load_fullfilled; boolean postprocessing = process_key_exist && reference_index_exist && minimum_ram_fullfilled && minimum_load_fullfilled;
if (!postprocessing) log.info("postprocessing deactivated: constraints violated"); if (!postprocessing) this.log.info("postprocessing deactivated: constraints violated");
if (allCrawlsFinished) { if (allCrawlsFinished) {
// refresh the search cache // refresh the search cache
@ -2981,12 +2981,12 @@ public final class Switchboard extends serverSwitch {
if (postprocessing) { if (postprocessing) {
// run postprocessing on all profiles // run postprocessing on all profiles
ReferenceReportCache rrCache = index.getReferenceReportCache(); ReferenceReportCache rrCache = this.index.getReferenceReportCache();
proccount += collection1Configuration.postprocessing(index, rrCache, null, getConfigBool("postprocessing.partialUpdate", true)); proccount += collection1Configuration.postprocessing(this.index, rrCache, null, getConfigBool("postprocessing.partialUpdate", true));
this.index.fulltext().commit(true); // without a commit the success is not visible in the monitoring this.index.fulltext().commit(true); // without a commit the success is not visible in the monitoring
} }
this.crawler.cleanProfiles(this.crawler.getActiveProfiles()); this.crawler.cleanProfiles(this.crawler.getActiveProfiles());
log.info("cleanup post-processed " + proccount + " documents"); this.log.info("cleanup post-processed " + proccount + " documents");
} else { } else {
Set<String> deletionCandidates = collection1Configuration.contains(CollectionSchema.harvestkey_s.getSolrFieldName()) ? Set<String> deletionCandidates = collection1Configuration.contains(CollectionSchema.harvestkey_s.getSolrFieldName()) ?
this.crawler.getFinishedProfiles(this.crawlQueues) : new HashSet<String>(); this.crawler.getFinishedProfiles(this.crawlQueues) : new HashSet<String>();
@ -2994,13 +2994,13 @@ public final class Switchboard extends serverSwitch {
if (cleanupByHarvestkey > 0) { if (cleanupByHarvestkey > 0) {
if (postprocessing) { if (postprocessing) {
// run postprocessing on these profiles // run postprocessing on these profiles
ReferenceReportCache rrCache = index.getReferenceReportCache(); ReferenceReportCache rrCache = this.index.getReferenceReportCache();
for (String profileHash: deletionCandidates) proccount += collection1Configuration.postprocessing(index, rrCache, profileHash, getConfigBool("postprocessing.partialUpdate", true)); for (String profileHash: deletionCandidates) proccount += collection1Configuration.postprocessing(this.index, rrCache, profileHash, getConfigBool("postprocessing.partialUpdate", true));
this.index.fulltext().commit(true); // without a commit the success is not visible in the monitoring this.index.fulltext().commit(true); // without a commit the success is not visible in the monitoring
} }
this.crawler.cleanProfiles(deletionCandidates); this.crawler.cleanProfiles(deletionCandidates);
log.info("cleanup removed " + cleanupByHarvestkey + " crawl profiles, post-processed " + proccount + " documents"); this.log.info("cleanup removed " + cleanupByHarvestkey + " crawl profiles, post-processed " + proccount + " documents");
} }
} }
} }
@ -3075,7 +3075,7 @@ public final class Switchboard extends serverSwitch {
} }
setConfig(jobType + "_isPaused", "true"); setConfig(jobType + "_isPaused", "true");
setConfig(jobType + "_isPaused_cause", cause); setConfig(jobType + "_isPaused_cause", cause);
log.warn("Crawl job '" + jobType + "' is paused: " + cause); this.log.warn("Crawl job '" + jobType + "' is paused: " + cause);
} }
/** /**
@ -3120,7 +3120,7 @@ public final class Switchboard extends serverSwitch {
} }
if ( documents == null ) { if ( documents == null ) {
return null; return null;
} }
return new IndexingQueueEntry(in.queueEntry, documents, null); return new IndexingQueueEntry(in.queueEntry, documents, null);
} }
@ -3302,7 +3302,7 @@ public final class Switchboard extends serverSwitch {
// rewrite the url // rewrite the url
String u0 = LibraryProvider.urlRewriter.apply(u); String u0 = LibraryProvider.urlRewriter.apply(u);
if (!u.equals(u0)) { if (!u.equals(u0)) {
log.info("REWRITE of url = \"" + u + "\" to \"" + u0 + "\""); this.log.info("REWRITE of url = \"" + u + "\" to \"" + u0 + "\"");
u = u0; u = u0;
} }
//Matcher m = rewritePattern.matcher(u); //Matcher m = rewritePattern.matcher(u);
@ -3483,7 +3483,7 @@ public final class Switchboard extends serverSwitch {
} }
/** /**
* *
* @param queueEntry * @param queueEntry
* @param collections * @param collections
* @param document * @param document
@ -3507,7 +3507,7 @@ public final class Switchboard extends serverSwitch {
final DigestURL referrerURL = queueEntry.referrerURL(); final DigestURL referrerURL = queueEntry.referrerURL();
EventOrigin processCase = queueEntry.processCase(this.peers.mySeed().hash); EventOrigin processCase = queueEntry.processCase(this.peers.mySeed().hash);
/* This entry may have been locally created by the MediaWiki dump reader : /* This entry may have been locally created by the MediaWiki dump reader :
* we can distinguish the case here from a regular local crawl with the crawl profile used */ * we can distinguish the case here from a regular local crawl with the crawl profile used */
if(this.crawler != null && queueEntry.profile() == this.crawler.defaultSurrogateProfile) { if(this.crawler != null && queueEntry.profile() == this.crawler.defaultSurrogateProfile) {
processCase = EventOrigin.SURROGATES; processCase = EventOrigin.SURROGATES;
@ -3630,7 +3630,7 @@ public final class Switchboard extends serverSwitch {
/** /**
* Check that the given Solr document matches the eventual crawl profil Solr * Check that the given Solr document matches the eventual crawl profil Solr
* query filters. * query filters.
* *
* @param profile * @param profile
* the eventual crawl profile. * the eventual crawl profile.
* @param document * @param document
@ -3748,38 +3748,63 @@ public final class Switchboard extends serverSwitch {
try {Cache.delete(urlhash);} catch (IOException e) {} try {Cache.delete(urlhash);} catch (IOException e) {}
} }
public void stackURLs(Set<DigestURL> rootURLs, final CrawlProfile profile, final Set<DigestURL> successurls, final Map<DigestURL,String> failurls) { public void stackURLs(final Collection<DigestURL> rootURLs, final CrawlProfile profile, final Set<DigestURL> successurls, final Map<DigestURL,String> failurls) {
if (rootURLs == null || rootURLs.size() == 0) return; if (rootURLs == null || rootURLs.size() == 0) return;
if (rootURLs.size() == 1) { if (rootURLs.size() == 1) {
// for single stack requests, do not use the multithreading overhead; // for single stack requests, do not use the multithreading overhead;
final DigestURL turl = rootURLs.iterator().next(); final DigestURL url = rootURLs.iterator().next();
// delete robots entry
sb.robots.delete(url);
try {
if (url.getHost() != null) { // might be null for file://
Cache.delete(RobotsTxt.robotsURL(RobotsTxt.getHostPort(url)).hash());
}
} catch (final IOException e) {}
// stack
String failreason; String failreason;
if ((failreason = Switchboard.this.stackUrl(profile, turl)) == null) successurls.add(turl); else failurls.put(turl, failreason); if ((failreason = Switchboard.this.stackUrl(profile, url)) == null) successurls.add(url); else failurls.put(url, failreason);
return; return;
} }
final ArrayList<Thread> stackthreads = new ArrayList<Thread>(); // do this concurrently
int maxthreads = 5 * Runtime.getRuntime().availableProcessors(); // do this concurrently
for (DigestURL url: rootURLs) { int threads = Math.min(rootURLs.size(), Math.min(50, Runtime.getRuntime().availableProcessors() * 2 + 1)); // it makes sense to have more threads than cores because those threads do a lot of waiting during IO
final DigestURL turl = url; this.log.info("stackURLs: starting " + threads + " threads for " + rootURLs.size() + " root urls.");
Thread t = new Thread("Switchboard.stackURLs") { final BlockingQueue<DigestURL> rootURLsQueue = new ArrayBlockingQueue<>(rootURLs.size());
for (DigestURL u: rootURLs) try {rootURLsQueue.put(u);} catch (InterruptedException e) {}
for (int i = 0; i < threads; i++) {
final String name = "Switchboard.stackURLs-" + i + "-" + profile.handle();
Thread t = new Thread(name) {
@Override @Override
public void run() { public void run() {
String failreason; DigestURL url;
if ((failreason = Switchboard.this.stackUrl(profile, turl)) == null) successurls.add(turl); else failurls.put(turl, failreason); int successc = 0, failc = 0;
while ((url = rootURLsQueue.poll()) != null) {
// delete robots entry
sb.robots.delete(url);
try {
if (url.getHost() != null) { // might be null for file://
Cache.delete(RobotsTxt.robotsURL(RobotsTxt.getHostPort(url)).hash());
}
} catch (final IOException e) {}
// stack
String failreason;
if ((failreason = Switchboard.this.stackUrl(profile, url)) == null) {
successurls.add(url);
successc++;
} else {
failurls.put(url, failreason);
failc++;
}
this.setName(name); // the name is constantly overwritten by the http client
}
Switchboard.this.log.info("stackURLs: terminated stack thread " + name + " with " + successc + " success and " + failc + " fail stackings.");
} }
}; };
t.start(); t.start(); // we let the thread dangling around here. It's better than a timeout in the http request.
stackthreads.add(t);
if (stackthreads.size() > maxthreads) {
Thread w = stackthreads.get(0);
while (w.isAlive()) {
try {Thread.sleep(100);} catch (final InterruptedException e) {}
}
stackthreads.remove(0);
}
} }
final long waitingtime = 10 + (30000 / rootURLs.size()); // at most wait only halve an minute to prevent that the crawl start runs into a time-out
for (Thread t: stackthreads) try {t.join(waitingtime);} catch (final InterruptedException e) {}
} }
/** /**
@ -3811,7 +3836,7 @@ public final class Switchboard extends serverSwitch {
if (url.isFTP()) { if (url.isFTP()) {
try { try {
this.crawler.putActive(handle, profile); this.crawler.putActive(handle, profile);
/* put ftp site entries on the crawl stack, /* put ftp site entries on the crawl stack,
* using the crawl profile depth to control how many children folders of the url are stacked */ * using the crawl profile depth to control how many children folders of the url are stacked */
this.crawlStacker.enqueueEntriesFTP( this.crawlStacker.enqueueEntriesFTP(
this.peers.mySeed().hash.getBytes(), this.peers.mySeed().hash.getBytes(),
@ -3957,13 +3982,13 @@ public final class Switchboard extends serverSwitch {
final Document[] documents = response.parse(); final Document[] documents = response.parse();
if (documents != null) { if (documents != null) {
for (final Document document: documents) { for (final Document document: documents) {
final CrawlProfile profile = crawler.get(ASCII.getBytes(request.profileHandle())); final CrawlProfile profile = Switchboard.this.crawler.get(ASCII.getBytes(request.profileHandle()));
if (document.indexingDenied() && (profile == null || profile.obeyHtmlRobotsNoindex())) { if (document.indexingDenied() && (profile == null || profile.obeyHtmlRobotsNoindex())) {
throw new Parser.Failure("indexing is denied", url); throw new Parser.Failure("indexing is denied", url);
} }
final Condenser condenser = new Condenser( final Condenser condenser = new Condenser(
document, null, true, true, LibraryProvider.dymLib, true, document, null, true, true, LibraryProvider.dymLib, true,
Switchboard.this.index.fulltext().getDefaultConfiguration().contains(CollectionSchema.dates_in_content_dts), Switchboard.this.index.fulltext().getDefaultConfiguration().contains(CollectionSchema.dates_in_content_dts),
searchEvent == null ? 0 : searchEvent.query.timezoneOffset); searchEvent == null ? 0 : searchEvent.query.timezoneOffset);
ResultImages.registerImages(url, document, true); ResultImages.registerImages(url, document, true);
Switchboard.this.webStructure.generateCitationReference(url, document); Switchboard.this.webStructure.generateCitationReference(url, document);
@ -4083,7 +4108,7 @@ public final class Switchboard extends serverSwitch {
// as this stays true as long as authenticated browser is open (even after restart of YaCy) add a timeout check to look at credentials again // as this stays true as long as authenticated browser is open (even after restart of YaCy) add a timeout check to look at credentials again
// TODO: same is true for credential checks below (at least with BASIC auth -> login should expire at least on restart // TODO: same is true for credential checks below (at least with BASIC auth -> login should expire at least on restart
if (requestHeader.isUserInRole(UserDB.AccessRight.ADMIN_RIGHT.toString())) { if (requestHeader.isUserInRole(UserDB.AccessRight.ADMIN_RIGHT.toString())) {
if (adminAuthenticationLastAccess + 60000 > System.currentTimeMillis()) // 1 minute if (this.adminAuthenticationLastAccess + 60000 > System.currentTimeMillis()) // 1 minute
return 4; // hard-authenticated, quick return return 4; // hard-authenticated, quick return
} }
@ -4091,19 +4116,19 @@ public final class Switchboard extends serverSwitch {
final String adminAccountUserName = getConfig(SwitchboardConstants.ADMIN_ACCOUNT_USER_NAME, "admin"); final String adminAccountUserName = getConfig(SwitchboardConstants.ADMIN_ACCOUNT_USER_NAME, "admin");
final String adminAccountBase64MD5 = getConfig(SwitchboardConstants.ADMIN_ACCOUNT_B64MD5, ""); final String adminAccountBase64MD5 = getConfig(SwitchboardConstants.ADMIN_ACCOUNT_B64MD5, "");
if ( adminAccountBase64MD5.isEmpty() ) { if ( adminAccountBase64MD5.isEmpty() ) {
adminAuthenticationLastAccess = System.currentTimeMillis(); this.adminAuthenticationLastAccess = System.currentTimeMillis();
return 2; // no password stored; this should not happen for older peers return 2; // no password stored; this should not happen for older peers
} }
// authorization in case that administrators have stored an empty password; this authorizes all users as admin regardless of the given auth // authorization in case that administrators have stored an empty password; this authorizes all users as admin regardless of the given auth
if (adminAccountBase64MD5.equals(emptyPasswordAdminAccount)) { if (adminAccountBase64MD5.equals(this.emptyPasswordAdminAccount)) {
return 3; // everyone is admin from everywhere return 3; // everyone is admin from everywhere
} }
// authorization for localhost, only if flag is set to grant localhost access as admin // authorization for localhost, only if flag is set to grant localhost access as admin
final boolean accessFromLocalhost = requestHeader.accessFromLocalhost(); final boolean accessFromLocalhost = requestHeader.accessFromLocalhost();
if (accessFromLocalhost && getConfigBool(SwitchboardConstants.ADMIN_ACCOUNT_FOR_LOCALHOST, false)) { if (accessFromLocalhost && getConfigBool(SwitchboardConstants.ADMIN_ACCOUNT_FOR_LOCALHOST, false)) {
adminAuthenticationLastAccess = System.currentTimeMillis(); this.adminAuthenticationLastAccess = System.currentTimeMillis();
return 3; // soft-authenticated for localhost return 3; // soft-authenticated for localhost
} }
@ -4129,7 +4154,7 @@ public final class Switchboard extends serverSwitch {
// String username = requestHeader.getUserPrincipal().getName(); // String username = requestHeader.getUserPrincipal().getName();
// if ((username.equalsIgnoreCase(sb.getConfig(SwitchboardConstants.ADMIN_ACCOUNT_USER_NAME, "admin"))) // if ((username.equalsIgnoreCase(sb.getConfig(SwitchboardConstants.ADMIN_ACCOUNT_USER_NAME, "admin")))
// || (sb.userDB.getEntry(username).hasRight(AccessRight.ADMIN_RIGHT))) // || (sb.userDB.getEntry(username).hasRight(AccessRight.ADMIN_RIGHT)))
adminAuthenticationLastAccess = System.currentTimeMillis(); this.adminAuthenticationLastAccess = System.currentTimeMillis();
return 4; // has admin right return 4; // has admin right
} }
} }
@ -4138,13 +4163,13 @@ public final class Switchboard extends serverSwitch {
// authorization by encoded password, only for localhost access // authorization by encoded password, only for localhost access
String pass = Base64Order.standardCoder.encodeString(adminAccountUserName + ":" + adminAccountBase64MD5); String pass = Base64Order.standardCoder.encodeString(adminAccountUserName + ":" + adminAccountBase64MD5);
if ( accessFromLocalhost && (pass.equals(realmValue)) ) { // assume realmValue as is in cfg if ( accessFromLocalhost && (pass.equals(realmValue)) ) { // assume realmValue as is in cfg
adminAuthenticationLastAccess = System.currentTimeMillis(); this.adminAuthenticationLastAccess = System.currentTimeMillis();
return 3; // soft-authenticated for localhost return 3; // soft-authenticated for localhost
} }
// authorization by hit in userDB (authtype username:encodedpassword - handed over by DefaultServlet) // authorization by hit in userDB (authtype username:encodedpassword - handed over by DefaultServlet)
if ( this.userDB.hasAdminRight(requestHeader, requestHeader.getCookies()) ) { if ( this.userDB.hasAdminRight(requestHeader, requestHeader.getCookies()) ) {
adminAuthenticationLastAccess = System.currentTimeMillis(); this.adminAuthenticationLastAccess = System.currentTimeMillis();
return 4; //return, because 4=max return 4; //return, because 4=max
} }
@ -4157,20 +4182,20 @@ public final class Switchboard extends serverSwitch {
realmtmp = realmtmp.substring(0, i + 1) + sb.getConfig(SwitchboardConstants.ADMIN_REALM,"YaCy") + ":" + realmtmp.substring(i + 1); realmtmp = realmtmp.substring(0, i + 1) + sb.getConfig(SwitchboardConstants.ADMIN_REALM,"YaCy") + ":" + realmtmp.substring(i + 1);
if (adminAccountBase64MD5.substring(4).equals(Digest.encodeMD5Hex(realmtmp))) { if (adminAccountBase64MD5.substring(4).equals(Digest.encodeMD5Hex(realmtmp))) {
adminAuthenticationLastAccess = System.currentTimeMillis(); this.adminAuthenticationLastAccess = System.currentTimeMillis();
return 4; // hard-authenticated, all ok return 4; // hard-authenticated, all ok
} }
} else { } else {
// handle DIGEST auth (realmValue = adminAccountBase (set for legacyHeader in DefaultServlet for authenticated requests)) // handle DIGEST auth (realmValue = adminAccountBase (set for legacyHeader in DefaultServlet for authenticated requests))
if (adminAccountBase64MD5.equals(realmValue)) { if (adminAccountBase64MD5.equals(realmValue)) {
adminAuthenticationLastAccess = System.currentTimeMillis(); this.adminAuthenticationLastAccess = System.currentTimeMillis();
return 4; // hard-authenticated, all ok return 4; // hard-authenticated, all ok
} }
} }
} else { } else {
// handle old option adminAccountBase64MD5="xxxxxxx" = encodeMD5Hex(encodeB64("adminname:password")) // handle old option adminAccountBase64MD5="xxxxxxx" = encodeMD5Hex(encodeB64("adminname:password"))
if (adminAccountBase64MD5.equals(Digest.encodeMD5Hex(realmValue))) { if (adminAccountBase64MD5.equals(Digest.encodeMD5Hex(realmValue))) {
adminAuthenticationLastAccess = System.currentTimeMillis(); this.adminAuthenticationLastAccess = System.currentTimeMillis();
return 4; // hard-authenticated, all ok return 4; // hard-authenticated, all ok
} }
} }

Loading…
Cancel
Save