fixed and enhanced postprocessing

pull/1/head
Michael Peter Christen 11 years ago
parent 219d5934a4
commit 6842783761

@ -186,6 +186,7 @@ import net.yacy.repository.Blacklist;
import net.yacy.repository.Blacklist.BlacklistType; import net.yacy.repository.Blacklist.BlacklistType;
import net.yacy.repository.FilterEngine; import net.yacy.repository.FilterEngine;
import net.yacy.repository.LoaderDispatcher; import net.yacy.repository.LoaderDispatcher;
import net.yacy.search.index.Fulltext;
import net.yacy.search.index.Segment; import net.yacy.search.index.Segment;
import net.yacy.search.index.Segment.ClickdepthCache; import net.yacy.search.index.Segment.ClickdepthCache;
import net.yacy.search.index.Segment.ReferenceReportCache; import net.yacy.search.index.Segment.ReferenceReportCache;
@ -196,6 +197,7 @@ import net.yacy.search.ranking.RankingProfile;
import net.yacy.search.schema.CollectionConfiguration; import net.yacy.search.schema.CollectionConfiguration;
import net.yacy.search.schema.CollectionSchema; import net.yacy.search.schema.CollectionSchema;
import net.yacy.search.schema.WebgraphConfiguration; import net.yacy.search.schema.WebgraphConfiguration;
import net.yacy.search.schema.WebgraphSchema;
import net.yacy.server.serverCore; import net.yacy.server.serverCore;
import net.yacy.server.serverSwitch; import net.yacy.server.serverSwitch;
import net.yacy.server.http.RobotsTxtConfig; import net.yacy.server.http.RobotsTxtConfig;
@ -2011,7 +2013,10 @@ public final class Switchboard extends serverSwitch {
return c; return c;
} }
public static boolean postprocessingRunning = false; public static boolean postprocessingRunning = false;
// if started, the following values are assigned for [collection1, webgraph]:
public static long[] postprocessingStartTime = new long[]{0,0}; // the start time for the processing; not started = 0
public static int[] postprocessingCount = new int[]{0,0}; // number of documents to be processed
public boolean cleanupJob() { public boolean cleanupJob() {
@ -2272,43 +2277,21 @@ public final class Switchboard extends serverSwitch {
if (getConfigBool("triplestore.persistent", false)) { if (getConfigBool("triplestore.persistent", false)) {
JenaTripleStore.saveAll(); JenaTripleStore.saveAll();
} }
// clean up profiles // clean up profiles
checkInterruption(); checkInterruption();
// if no crawl is running and processing is activated: // if no crawl is running and processing is activated:
// execute the (post-) processing steps for all entries that have a process tag assigned // execute the (post-) processing steps for all entries that have a process tag assigned
Fulltext fulltext = index.fulltext();
CollectionConfiguration collection1Configuration = fulltext.getDefaultConfiguration();
WebgraphConfiguration webgraphConfiguration = fulltext.getWebgraphConfiguration();
if (!this.crawlJobIsPaused(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL)) { if (!this.crawlJobIsPaused(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL)) {
// we optimize first because that is useful for postprocessing
int proccount = 0; int proccount = 0;
ReferenceReportCache rrCache = index.getReferenceReportCache(); boolean allCrawlsFinished = this.crawler.allCrawlsFinished(this.crawlQueues);
ClickdepthCache clickdepthCache = index.getClickdepthCache(rrCache); if (allCrawlsFinished) {
if (index.fulltext().getDefaultConfiguration().contains(CollectionSchema.harvestkey_s.getSolrFieldName())) {
Set<String> deletionCandidates = this.crawler.getFinishesProfiles(this.crawlQueues);
int cleanup = deletionCandidates.size();
if (cleanup > 0) {
// run postprocessing on these profiles
postprocessingRunning = true;
for (String profileHash: deletionCandidates) {
proccount += index.fulltext().getDefaultConfiguration().postprocessing(index, rrCache, clickdepthCache, profileHash);
proccount += index.fulltext().getWebgraphConfiguration().postprocessing(index, clickdepthCache, profileHash);
}
this.crawler.cleanProfiles(deletionCandidates);
log.info("cleanup removed " + cleanup + " crawl profiles, post-processed " + proccount + " documents");
}
} else {
if (this.crawler.allCrawlsFinished(this.crawlQueues)) {
// run postprocessing on all profiles
postprocessingRunning = true;
proccount += index.fulltext().getDefaultConfiguration().postprocessing(index, rrCache, clickdepthCache, null);
proccount += index.fulltext().getWebgraphConfiguration().postprocessing(index, clickdepthCache, null);
this.crawler.cleanProfiles(this.crawler.getActiveProfiles());
log.info("cleanup post-processed " + proccount + " documents");
}
}
if (this.crawler.allCrawlsFinished(this.crawlQueues)) {
postprocessingRunning = true; postprocessingRunning = true;
// flush caches // flush caches
Domains.clear(); Domains.clear();
@ -2319,7 +2302,7 @@ public final class Switchboard extends serverSwitch {
long idleAdmin = System.currentTimeMillis() - this.adminAuthenticationLastAccess; long idleAdmin = System.currentTimeMillis() - this.adminAuthenticationLastAccess;
long deltaOptimize = System.currentTimeMillis() - this.optimizeLastRun; long deltaOptimize = System.currentTimeMillis() - this.optimizeLastRun;
boolean optimizeRequired = deltaOptimize > 60000 * 60 * 3; // 3 hours boolean optimizeRequired = deltaOptimize > 60000 * 60 * 3; // 3 hours
int opts = Math.max(1, (int) (index.fulltext().collectionSize() / 5000000)); int opts = Math.max(1, (int) (fulltext.collectionSize() / 5000000));
log.info("Solr auto-optimization: idleSearch=" + idleSearch + ", idleAdmin=" + idleAdmin + ", deltaOptimize=" + deltaOptimize + ", proccount=" + proccount); log.info("Solr auto-optimization: idleSearch=" + idleSearch + ", idleAdmin=" + idleAdmin + ", deltaOptimize=" + deltaOptimize + ", proccount=" + proccount);
if (idleAdmin > 600000) { if (idleAdmin > 600000) {
@ -2331,12 +2314,62 @@ public final class Switchboard extends serverSwitch {
if (optimizeRequired) { if (optimizeRequired) {
if (idleSearch < 600000) opts++; // < 10 minutes idle time will cause an optimization with one more Segment which is small and quick if (idleSearch < 600000) opts++; // < 10 minutes idle time will cause an optimization with one more Segment which is small and quick
log.info("Solr auto-optimization: running solr.optimize(" + opts + ")"); log.info("Solr auto-optimization: running solr.optimize(" + opts + ")");
index.fulltext().optimize(opts); fulltext.optimize(opts);
this.optimizeLastRun = System.currentTimeMillis(); this.optimizeLastRun = System.currentTimeMillis();
} }
} }
} }
ReferenceReportCache rrCache = index.getReferenceReportCache();
ClickdepthCache clickdepthCache = index.getClickdepthCache(rrCache);
Set<String> deletionCandidates = collection1Configuration.contains(CollectionSchema.harvestkey_s.getSolrFieldName()) ?
this.crawler.getFinishesProfiles(this.crawlQueues) : new HashSet<String>();
int cleanupByHarvestkey = deletionCandidates.size();
boolean processCollection = collection1Configuration.contains(CollectionSchema.process_sxt) && (index.connectedCitation() || fulltext.writeToWebgraph());
boolean processWebgraph = webgraphConfiguration.contains(WebgraphSchema.process_sxt) && fulltext.writeToWebgraph();
if ((processCollection || processWebgraph) && (cleanupByHarvestkey > 0 || allCrawlsFinished)) {
//full optimization of webgraph, if exists
if (fulltext.writeToWebgraph()) fulltext.getWebgraphConnector().optimize(1);
if (cleanupByHarvestkey > 0) {
// run postprocessing on these profiles
postprocessingRunning = true;
postprocessingStartTime[0] = System.currentTimeMillis();
try {postprocessingCount[0] = (int) fulltext.getDefaultConnector().getCountByQuery(CollectionSchema.process_sxt.getSolrFieldName() + ":[* TO *]");} catch (IOException e) {}
for (String profileHash: deletionCandidates) proccount += collection1Configuration.postprocessing(index, rrCache, clickdepthCache, profileHash);
postprocessingStartTime[0] = 0;
try {postprocessingCount[0] = (int) fulltext.getDefaultConnector().getCountByQuery(CollectionSchema.process_sxt.getSolrFieldName() + ":[* TO *]");} catch (IOException e) {} // should be zero but you never know
if (processWebgraph) {
postprocessingStartTime[1] = System.currentTimeMillis();
try {postprocessingCount[1] = (int) fulltext.getWebgraphConnector().getCountByQuery(WebgraphSchema.process_sxt.getSolrFieldName() + ":[* TO *]");} catch (IOException e) {}
for (String profileHash: deletionCandidates) proccount += webgraphConfiguration.postprocessing(index, clickdepthCache, profileHash);
postprocessingStartTime[1] = 0;
try {postprocessingCount[1] = (int) fulltext.getWebgraphConnector().getCountByQuery(WebgraphSchema.process_sxt.getSolrFieldName() + ":[* TO *]");} catch (IOException e) {}
}
this.crawler.cleanProfiles(deletionCandidates);
log.info("cleanup removed " + cleanupByHarvestkey + " crawl profiles, post-processed " + proccount + " documents");
} else if (allCrawlsFinished) {
// run postprocessing on all profiles
postprocessingRunning = true;
postprocessingStartTime[0] = System.currentTimeMillis();
try {postprocessingCount[0] = (int) fulltext.getDefaultConnector().getCountByQuery(CollectionSchema.process_sxt.getSolrFieldName() + ":[* TO *]");} catch (IOException e) {}
proccount += collection1Configuration.postprocessing(index, rrCache, clickdepthCache, null);
postprocessingStartTime[0] = 0;
try {postprocessingCount[0] = (int) fulltext.getDefaultConnector().getCountByQuery(CollectionSchema.process_sxt.getSolrFieldName() + ":[* TO *]");} catch (IOException e) {} // should be zero but you never know
if (processWebgraph) {
postprocessingStartTime[1] = System.currentTimeMillis();
try {postprocessingCount[1] = (int) fulltext.getWebgraphConnector().getCountByQuery(WebgraphSchema.process_sxt.getSolrFieldName() + ":[* TO *]");} catch (IOException e) {}
proccount += webgraphConfiguration.postprocessing(index, clickdepthCache, null);
postprocessingStartTime[1] = 0;
try {postprocessingCount[1] = (int) fulltext.getWebgraphConnector().getCountByQuery(WebgraphSchema.process_sxt.getSolrFieldName() + ":[* TO *]");} catch (IOException e) {}
}
this.crawler.cleanProfiles(this.crawler.getActiveProfiles());
log.info("cleanup post-processed " + proccount + " documents");
}
}
postprocessingStartTime = new long[]{0,0}; // the start time for the processing; not started = 0
postprocessingRunning = false; postprocessingRunning = false;
} }

@ -890,8 +890,8 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
if (!segment.connectedCitation() && !segment.fulltext().writeToWebgraph()) return 0; if (!segment.connectedCitation() && !segment.fulltext().writeToWebgraph()) return 0;
SolrConnector collectionConnector = segment.fulltext().getDefaultConnector(); SolrConnector collectionConnector = segment.fulltext().getDefaultConnector();
SolrConnector webgraphConnector = segment.fulltext().getWebgraphConnector(); SolrConnector webgraphConnector = segment.fulltext().getWebgraphConnector();
collectionConnector.commit(true); // make sure that we have latest information that can be found collectionConnector.commit(false); // make sure that we have latest information that can be found
if (webgraphConnector != null) webgraphConnector.commit(true); if (webgraphConnector != null) webgraphConnector.commit(false);
Map<byte[], CRV> ranking = new TreeMap<byte[], CRV>(Base64Order.enhancedCoder); Map<byte[], CRV> ranking = new TreeMap<byte[], CRV>(Base64Order.enhancedCoder);
ReversibleScoreMap<String> hostscore = null; ReversibleScoreMap<String> hostscore = null;
try { try {
@ -1241,6 +1241,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
} else { } else {
// Output a warning that d[] is empty // Output a warning that d[] is empty
ConcurrentLog.warn("COLLECTION", "d[] is empty, iid=" + iid); ConcurrentLog.warn("COLLECTION", "d[] is empty, iid=" + iid);
break;
} }
} }
} }

Loading…
Cancel
Save