From 6f0baaa309bc7814b1a037ce8b7f357949789c52 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Fri, 4 Jan 2013 16:37:39 +0100 Subject: [PATCH] added the clickdepth post-processing: some links may have 'shortcuts' to already calculated click depths. These are then calculated if the crawl buffer is empty and therefore no new 'shortcuts' can be discovered. The status of the clickdepth stack (to-be-processed) can be seen using a solr search command like this: http://localhost:8090/solr/select?q=process_sxt:[*%20TO%20*]&start=0&rows=30&fl=sku,clickdepth_i,process_sxt --- htroot/HostBrowser.java | 2 +- source/net/yacy/search/Switchboard.java | 46 +++++++++++++++++++ .../yacy/search/index/SolrConfiguration.java | 2 +- 3 files changed, 48 insertions(+), 2 deletions(-) diff --git a/htroot/HostBrowser.java b/htroot/HostBrowser.java index edba7f5e0..550494fa8 100644 --- a/htroot/HostBrowser.java +++ b/htroot/HostBrowser.java @@ -270,7 +270,7 @@ public class HostBrowser { String errortype = (String) doc.getFieldValue(YaCySchema.failtype_s.getSolrFieldName()); FailType error = errortype == null ? 
null : FailType.valueOf(errortype); Integer cd = (Integer) doc.getFieldValue(YaCySchema.clickdepth_i.getSolrFieldName()); - if (cd != null) clickdepth.add(ASCII.getBytes((String) doc.getFieldValue(YaCySchema.id.getSolrFieldName())), cd.intValue()); + if (cd != null && cd.intValue() >= 0) clickdepth.add(ASCII.getBytes((String) doc.getFieldValue(YaCySchema.id.getSolrFieldName())), cd.intValue()); if (u.startsWith(path)) { if (delete) { deleteIDs.add(ASCII.getBytes((String) doc.getFieldValue(YaCySchema.id.getSolrFieldName()))); diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 7fad29e8a..ce2b5ca16 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -69,6 +69,7 @@ import java.util.SortedMap; import java.util.SortedSet; import java.util.TreeMap; import java.util.TreeSet; +import java.util.concurrent.BlockingQueue; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ExecutionException; import java.util.concurrent.LinkedBlockingQueue; @@ -81,6 +82,8 @@ import java.util.zip.GZIPOutputStream; import java.util.zip.ZipEntry; import java.util.zip.ZipInputStream; +import org.apache.solr.client.solrj.util.ClientUtils; +import org.apache.solr.common.SolrDocument; import org.apache.solr.common.SolrInputDocument; import net.yacy.contentcontrol.ContentControlFilterUpdateThread; @@ -95,7 +98,9 @@ import net.yacy.cora.document.UTF8; import net.yacy.cora.document.WordCache; import net.yacy.cora.document.analysis.Classification; import net.yacy.cora.federate.solr.Boost; +import net.yacy.cora.federate.solr.ProcessType; import net.yacy.cora.federate.solr.YaCySchema; +import net.yacy.cora.federate.solr.connector.AbstractSolrConnector; import net.yacy.cora.federate.solr.connector.ShardSelection; import net.yacy.cora.federate.solr.connector.ShardSolrConnector; import net.yacy.cora.federate.solr.connector.SolrConnector; @@ -2223,6 +2228,47 @@ public final class 
Switchboard extends serverSwitch { JenaTripleStore.saveAll(); } + // if no crawl is running and processing is activated: + // execute the (post-) processing steps for all entries that have a process tag assigned + if (this.crawlQueues.coreCrawlJobSize() == 0 && index.fulltext().getSolrScheme().contains(YaCySchema.process_sxt)) { + // that means we must search for those entries. + index.fulltext().getSolr().commit(); // make sure that we have latest information that can be found + BlockingQueue docs = index.fulltext().getSolr().concurrentQuery(YaCySchema.process_sxt.getSolrFieldName() + ":[* TO *]", 0, 1000, 60000, 10); + SolrDocument doc; + int proccount_clickdepth = 0; + int proccount_clickdepthchange = 0; + while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) { + // for each to-be-processed entry work on the process tag + Collection proctags = doc.getFieldValues(YaCySchema.process_sxt.getSolrFieldName()); + for (Object tag: proctags) { + String tagname = (String) tag; + ProcessType tagtype = ProcessType.valueOf(tagname); + + // switch over tag types + if (tagtype == ProcessType.CLICKDEPTH) { + //proctags.remove(tag); + if (index.fulltext().getSolrScheme().contains(YaCySchema.clickdepth_i)) { + DigestURI url; + try { + Integer oldclickdepth = (Integer) doc.getFieldValue(YaCySchema.clickdepth_i.getSolrFieldName()); + url = new DigestURI((String) doc.getFieldValue(YaCySchema.sku.getSolrFieldName()), ASCII.getBytes((String) doc.getFieldValue(YaCySchema.id.getSolrFieldName()))); + int clickdepth = SolrConfiguration.getClickDepth(index.urlCitation(), url); + if (oldclickdepth == null || oldclickdepth.intValue() != clickdepth) proccount_clickdepthchange++; + SolrInputDocument sid = ClientUtils.toSolrInputDocument(doc); + sid.setField(YaCySchema.clickdepth_i.getSolrFieldName(), clickdepth); + sid.removeField(YaCySchema.process_sxt.getSolrFieldName()); + index.fulltext().getSolr().add(sid); + proccount_clickdepth++; + } catch (Throwable e) { + 
Log.logException(e); + } + } + } + } + } + log.logInfo("cleanup_processing: re-calculated " + proccount_clickdepth + " new clickdepth values, " + proccount_clickdepthchange + " values changed."); + } + return true; } catch ( final InterruptedException e ) { this.log.logInfo("cleanupJob: Shutdown detected"); diff --git a/source/net/yacy/search/index/SolrConfiguration.java b/source/net/yacy/search/index/SolrConfiguration.java index 249d2ae74..e6de803e9 100644 --- a/source/net/yacy/search/index/SolrConfiguration.java +++ b/source/net/yacy/search/index/SolrConfiguration.java @@ -841,7 +841,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable * @return the clickdepth level or -1 if the root url cannot be found or a recursion limit is reached * @throws IOException */ - private static int getClickDepth(final IndexCell citations, final DigestURI url) throws IOException { + public static int getClickDepth(final IndexCell citations, final DigestURI url) throws IOException { final byte[] searchhash = url.hash(); RowHandleSet rootCandidates = url.getPossibleRootHashes();