added the clickdepth post-processing: some links may have 'shortcuts' to
already calculated click depths. These are then calculated when the crawl
buffer is empty and therefore no new 'shortcuts' can be discovered.
The status of the clickdepth stack (to-be-processed) can be seen with a
Solr search command like this:
http://localhost:8090/solr/select?q=process_sxt:[*%20TO%20*]&start=0&rows=30&fl=sku,clickdepth_i,process_sxt
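
For reference, roughly the same status check can be issued programmatically through SolrJ; a minimal sketch, assuming the default YaCy Solr endpoint from the URL above (class and variable names here are illustrative, not part of this commit):

import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.impl.HttpSolrServer;
import org.apache.solr.common.SolrDocument;

public class ClickdepthStackStatus {
    public static void main(String[] args) throws Exception {
        // endpoint taken from the command above; adjust host/port to the local installation
        HttpSolrServer solr = new HttpSolrServer("http://localhost:8090/solr");
        SolrQuery query = new SolrQuery("process_sxt:[* TO *]"); // all documents still to be processed
        query.setStart(0);
        query.setRows(30);
        query.setFields("sku", "clickdepth_i", "process_sxt");
        for (SolrDocument doc : solr.query(query).getResults()) {
            System.out.println(doc.getFieldValue("sku") + " clickdepth=" + doc.getFieldValue("clickdepth_i"));
        }
        solr.shutdown();
    }
}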
pull/1/head
Michael Peter Christen 12 years ago
parent 0f5b6f38c1
commit 6f0baaa309

@@ -270,7 +270,7 @@ public class HostBrowser {
             String errortype = (String) doc.getFieldValue(YaCySchema.failtype_s.getSolrFieldName());
             FailType error = errortype == null ? null : FailType.valueOf(errortype);
             Integer cd = (Integer) doc.getFieldValue(YaCySchema.clickdepth_i.getSolrFieldName());
-            if (cd != null) clickdepth.add(ASCII.getBytes((String) doc.getFieldValue(YaCySchema.id.getSolrFieldName())), cd.intValue());
+            if (cd != null && cd.intValue() >= 0) clickdepth.add(ASCII.getBytes((String) doc.getFieldValue(YaCySchema.id.getSolrFieldName())), cd.intValue());
             if (u.startsWith(path)) {
                 if (delete) {
                     deleteIDs.add(ASCII.getBytes((String) doc.getFieldValue(YaCySchema.id.getSolrFieldName())));
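
Note on the change above: the added cd.intValue() >= 0 guard keeps sentinel values out of the clickdepth collection, since getClickDepth() (see the SolrConfiguration hunk below) returns -1 when the root URL cannot be found or a recursion limit is reached.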

@@ -69,6 +69,7 @@ import java.util.SortedMap;
 import java.util.SortedSet;
 import java.util.TreeMap;
 import java.util.TreeSet;
+import java.util.concurrent.BlockingQueue;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.ExecutionException;
 import java.util.concurrent.LinkedBlockingQueue;
@@ -81,6 +82,8 @@ import java.util.zip.GZIPOutputStream;
 import java.util.zip.ZipEntry;
 import java.util.zip.ZipInputStream;
+import org.apache.solr.client.solrj.util.ClientUtils;
+import org.apache.solr.common.SolrDocument;
 import org.apache.solr.common.SolrInputDocument;
 import net.yacy.contentcontrol.ContentControlFilterUpdateThread;
@@ -95,7 +98,9 @@ import net.yacy.cora.document.UTF8;
 import net.yacy.cora.document.WordCache;
 import net.yacy.cora.document.analysis.Classification;
 import net.yacy.cora.federate.solr.Boost;
+import net.yacy.cora.federate.solr.ProcessType;
 import net.yacy.cora.federate.solr.YaCySchema;
+import net.yacy.cora.federate.solr.connector.AbstractSolrConnector;
 import net.yacy.cora.federate.solr.connector.ShardSelection;
 import net.yacy.cora.federate.solr.connector.ShardSolrConnector;
 import net.yacy.cora.federate.solr.connector.SolrConnector;
@@ -2223,6 +2228,47 @@ public final class Switchboard extends serverSwitch {
                 JenaTripleStore.saveAll();
             }
+            // if no crawl is running and processing is activated:
+            // execute the (post-) processing steps for all entries that have a process tag assigned
+            if (this.crawlQueues.coreCrawlJobSize() == 0 && index.fulltext().getSolrScheme().contains(YaCySchema.process_sxt)) {
+                // that means we must search for those entries.
+                index.fulltext().getSolr().commit(); // make sure that we have latest information that can be found
+                BlockingQueue<SolrDocument> docs = index.fulltext().getSolr().concurrentQuery(YaCySchema.process_sxt.getSolrFieldName() + ":[* TO *]", 0, 1000, 60000, 10);
+                SolrDocument doc;
+                int proccount_clickdepth = 0;
+                int proccount_clickdepthchange = 0;
+                while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
+                    // for each to-be-processed entry work on the process tag
+                    Collection<Object> proctags = doc.getFieldValues(YaCySchema.process_sxt.getSolrFieldName());
+                    for (Object tag: proctags) {
+                        String tagname = (String) tag;
+                        ProcessType tagtype = ProcessType.valueOf(tagname);
+                        // switch over tag types
+                        if (tagtype == ProcessType.CLICKDEPTH) {
+                            //proctags.remove(tag);
+                            if (index.fulltext().getSolrScheme().contains(YaCySchema.clickdepth_i)) {
+                                DigestURI url;
+                                try {
+                                    Integer oldclickdepth = (Integer) doc.getFieldValue(YaCySchema.clickdepth_i.getSolrFieldName());
+                                    url = new DigestURI((String) doc.getFieldValue(YaCySchema.sku.getSolrFieldName()), ASCII.getBytes((String) doc.getFieldValue(YaCySchema.id.getSolrFieldName())));
+                                    int clickdepth = SolrConfiguration.getClickDepth(index.urlCitation(), url);
+                                    if (oldclickdepth == null || oldclickdepth.intValue() != clickdepth) proccount_clickdepthchange++;
+                                    SolrInputDocument sid = ClientUtils.toSolrInputDocument(doc);
+                                    sid.setField(YaCySchema.clickdepth_i.getSolrFieldName(), clickdepth);
+                                    sid.removeField(YaCySchema.process_sxt.getSolrFieldName());
+                                    index.fulltext().getSolr().add(sid);
+                                    proccount_clickdepth++;
+                                } catch (Throwable e) {
+                                    Log.logException(e);
+                                }
+                            }
+                        }
+                    }
+                }
+                log.logInfo("cleanup_processing: re-calculated " + proccount_clickdepth + " new clickdepth values, " + proccount_clickdepthchange + " values changed.");
+            }
             return true;
         } catch ( final InterruptedException e ) {
             this.log.logInfo("cleanupJob: Shutdown detected");
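
The consumer loop above follows a poison-pill pattern: concurrentQuery() fills a BlockingQueue from a background thread and terminates the stream with the AbstractSolrConnector.POISON_DOCUMENT sentinel, compared by identity. A minimal self-contained sketch of that pattern, with illustrative names rather than the YaCy API:

import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;

public class PoisonPillDemo {
    // sentinel marking end-of-stream; a fresh instance so the identity check (==) cannot match real data
    private static final String POISON = new String("POISON");

    public static void main(String[] args) throws InterruptedException {
        final BlockingQueue<String> queue = new LinkedBlockingQueue<String>();

        // producer: emits results, then the poison pill to signal completion
        new Thread(new Runnable() {
            public void run() {
                try {
                    for (int i = 0; i < 5; i++) queue.put("doc-" + i);
                    queue.put(POISON);
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                }
            }
        }).start();

        // consumer: take() blocks until an element is available; the identity check ends the loop
        String doc;
        while ((doc = queue.take()) != POISON) {
            System.out.println("processing " + doc);
        }
    }
}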

@@ -841,7 +841,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
      * @return the clickdepth level or -1 if the root url cannot be found or a recursion limit is reached
      * @throws IOException
      */
-    private static int getClickDepth(final IndexCell<CitationReference> citations, final DigestURI url) throws IOException {
+    public static int getClickDepth(final IndexCell<CitationReference> citations, final DigestURI url) throws IOException {
         final byte[] searchhash = url.hash();
         RowHandleSet rootCandidates = url.getPossibleRootHashes();
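
The body of getClickDepth() is not shown in this hunk, but from the signature and javadoc it walks the citation index from the document's hash toward the possible root hashes and returns -1 for an unreachable root or an exceeded recursion limit. As a rough illustration of the same depth computation, a self-contained breadth-first sketch over a plain adjacency map (hypothetical structures, not the IndexCell/CitationReference API):

import java.util.ArrayDeque;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.Set;

public class ClickDepthSketch {
    /**
     * Click depth as the minimum number of links to follow from the root page
     * to reach the target. Returns -1 if the target is unreachable from the
     * root or the depth limit is exceeded (mirroring the -1 sentinel above).
     */
    public static int getClickDepth(Map<String, List<String>> links, String root, String target, int maxDepth) {
        if (root.equals(target)) return 0;
        Set<String> seen = new HashSet<String>();
        seen.add(root);
        Queue<String> frontier = new ArrayDeque<String>();
        frontier.add(root);
        for (int depth = 1; depth <= maxDepth; depth++) {
            Queue<String> next = new ArrayDeque<String>();
            for (String page : frontier) {
                List<String> outlinks = links.get(page);
                if (outlinks == null) continue;
                for (String linked : outlinks) {
                    if (linked.equals(target)) return depth;
                    if (seen.add(linked)) next.add(linked);
                }
            }
            if (next.isEmpty()) break; // nothing new reachable: target is disconnected
            frontier = next;
        }
        return -1; // root cannot reach the target, or the depth limit was hit
    }
}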
