free configuration of postprocessing clickdepth maximum depth and time

pull/1/head
Michael Peter Christen 11 years ago
parent 39b641d6cd
commit 63c9fcf3e0

@@ -288,7 +288,7 @@ public class DigestURL extends MultiProtocolURL implements Serializable {
         return Base64Order.enhancedCoder.encode(Digest.encodeMD5Raw(sb.toString())).charAt(0);
     }
-    public final static Pattern rootPattern = Pattern.compile("/|/index.htm(l?)|/index.php|/home.htm(l?)|/home.php|/default.htm(l?)|/default.php");
+    public final static Pattern rootPattern = Pattern.compile("/|/\\?|/index.htm(l?)|/index.php|/home.htm(l?)|/home.php|/default.htm(l?)|/default.php");
     public final boolean probablyRootURL() {
         return this.path.length() <= 1 || rootPattern.matcher(this.path).matches();
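
Review note: the added "/\\?" alternative lets a bare "?" after the host count as a root path. A minimal sketch of the pattern's behaviour, assuming java.util.regex.Pattern and hypothetical test strings:

    // the extended pattern accepts "/?" in addition to the classic index/home/default paths
    Pattern rootPattern = Pattern.compile("/|/\\?|/index.htm(l?)|/index.php|/home.htm(l?)|/home.php|/default.htm(l?)|/default.php");
    System.out.println(rootPattern.matcher("/?").matches());          // true (new with this commit)
    System.out.println(rootPattern.matcher("/index.html").matches()); // true (as before)
    System.out.println(rootPattern.matcher("/about.html").matches()); // false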

@@ -178,12 +178,12 @@ public class SchemaConfiguration extends Configuration implements Serializable {
         return changed;
     }
-    public boolean postprocessing_clickdepth(final ClickdepthCache clickdepthCache, final SolrInputDocument sid, final DigestURL url, final SchemaDeclaration clickdepthfield, final int maxtime) {
+    public boolean postprocessing_clickdepth(final ClickdepthCache clickdepthCache, final SolrInputDocument sid, final DigestURL url, final SchemaDeclaration clickdepthfield) {
         // get new click depth and compare with old
         Integer oldclickdepth = (Integer) sid.getFieldValue(clickdepthfield.getSolrFieldName());
         if (oldclickdepth != null && oldclickdepth.intValue() != 999) return false; // we do not want to compute that again
         try {
-            int clickdepth = clickdepthCache.getClickdepth(url, maxtime);
+            int clickdepth = clickdepthCache.getClickdepth(url);
             if (oldclickdepth == null || oldclickdepth.intValue() != clickdepth) {
                 sid.setField(clickdepthfield.getSolrFieldName(), clickdepth);
                 return true;

@@ -542,11 +542,14 @@ public final class Switchboard extends serverSwitch {
         try {
             this.domainList = null;
             if (!getConfig("network.unit.domainlist", "").equals("")) {
-                final Reader r =
-                    getConfigFileFromWebOrLocally(getConfig("network.unit.domainlist", ""), getAppPath()
-                        .getAbsolutePath(), new File(this.networkRoot, "domainlist.txt"));
+                final Reader r = getConfigFileFromWebOrLocally(
+                        getConfig("network.unit.domainlist", ""),
+                        getAppPath().getAbsolutePath(),
+                        new File(this.networkRoot, "domainlist.txt"));
                 this.domainList = new FilterEngine();
-                this.domainList.loadList(new BufferedReader(r), null);
+                BufferedReader br = new BufferedReader(r);
+                this.domainList.loadList(br, null);
+                br.close();
             }
         } catch (final FileNotFoundException e ) {
             this.log.severe("CONFIG: domainlist not found: " + e.getMessage());
@@ -1382,11 +1385,14 @@ public final class Switchboard extends serverSwitch {
         try {
             this.domainList = null;
             if ( !getConfig("network.unit.domainlist", "").equals("") ) {
-                final Reader r =
-                    getConfigFileFromWebOrLocally(getConfig("network.unit.domainlist", ""), getAppPath()
-                        .getAbsolutePath(), new File(this.networkRoot, "domainlist.txt"));
+                final Reader r = getConfigFileFromWebOrLocally(
+                        getConfig("network.unit.domainlist", ""),
+                        getAppPath().getAbsolutePath(),
+                        new File(this.networkRoot, "domainlist.txt"));
                 this.domainList = new FilterEngine();
-                this.domainList.loadList(new BufferedReader(r), null);
+                BufferedReader br = new BufferedReader(r);
+                this.domainList.loadList(br, null);
+                br.close();
             }
         } catch (final FileNotFoundException e ) {
             this.log.severe("CONFIG: domainlist not found: " + e.getMessage());
@@ -1858,8 +1864,9 @@ public final class Switchboard extends serverSwitch {
                 }
                 return moved;
             }
+            InputStream is = null;
             try {
-                InputStream is = new BufferedInputStream(new FileInputStream(infile));
+                is = new BufferedInputStream(new FileInputStream(infile));
                 if ( s.endsWith(".gz") ) {
                     is = new GZIPInputStream(is);
                 }
@@ -1877,8 +1884,10 @@ public final class Switchboard extends serverSwitch {
                     try {
                         final OutputStream os =
                             new BufferedOutputStream(new GZIPOutputStream(new FileOutputStream(gzfile)));
-                        FileUtils.copy(new BufferedInputStream(new FileInputStream(outfile)), os);
+                        BufferedInputStream bis = new BufferedInputStream(new FileInputStream(outfile));
+                        FileUtils.copy(bis, os);
                         os.close();
+                        bis.close();
                         if ( gzfile.exists() ) {
                             FileUtils.deletedelete(outfile);
                         }
@@ -1890,6 +1899,7 @@ public final class Switchboard extends serverSwitch {
                     }
                 }
             }
+            if (is != null) try {is.close();} catch (IOException e) {}
         }
         return moved;
     }
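
Review note: declaring the InputStream before the try is what allows the new null-guarded close at the end to run on every exit path; whether that closing line sits in a finally block is not visible in this hunk. The general shape, as a sketch:

    InputStream is = null;
    try {
        is = new BufferedInputStream(new FileInputStream(infile));
        if (s.endsWith(".gz")) is = new GZIPInputStream(is);
        // ... import the file ...
    } finally {
        if (is != null) try { is.close(); } catch (final IOException e) {}
    }
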
@@ -2296,7 +2306,9 @@ public final class Switchboard extends serverSwitch {
                 // we optimize first because that is useful for postprocessing
                 int proccount = 0;
                 ReferenceReportCache rrCache = index.getReferenceReportCache();
-                ClickdepthCache clickdepthCache = index.getClickdepthCache(rrCache);
+                int clickdepth_maxtime = this.getConfigInt("postprocessing.clickdepth.maxtime", 100);
+                int clickdepth_maxdepth = this.getConfigInt("postprocessing.clickdepth.maxdepth", 6);
+                ClickdepthCache clickdepthCache = index.getClickdepthCache(rrCache, clickdepth_maxtime, clickdepth_maxdepth);
                 Set<String> deletionCandidates = collection1Configuration.contains(CollectionSchema.harvestkey_s.getSolrFieldName()) ?
                         this.crawler.getFinishesProfiles(this.crawlQueues) : new HashSet<String>();
                 int cleanupByHarvestkey = deletionCandidates.size();
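
Review note: the two new settings are read with fallback defaults that reproduce the previously hard-coded values (callers used to pass a fixed 100 ms budget and the loop in Segment was capped at 6 levels), so behaviour is unchanged unless the keys are set. postprocessing.clickdepth.maxtime is a wall-clock budget in milliseconds (it is added to System.currentTimeMillis() to form the timeout in Segment below), postprocessing.clickdepth.maxdepth caps the number of link levels searched. A hypothetical override, assuming the usual key=value syntax of the YaCy configuration file:

    postprocessing.clickdepth.maxtime=500
    postprocessing.clickdepth.maxdepth=10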

@@ -241,7 +241,9 @@ public final class Fulltext {
     public long collectionSize() {
         long t = System.currentTimeMillis();
         if (t - this.collectionSizeLastAccess < 1000) return this.collectionSizeLastValue;
-        long size = this.solrInstances.getDefaultMirrorConnector().getSize();
+        SolrConnector sc = this.solrInstances.getDefaultMirrorConnector();
+        if (sc == null) return 0;
+        long size = sc.getSize();
         this.collectionSizeLastAccess = t;
         this.collectionSizeLastValue = size;
         return size;

@@ -209,20 +209,20 @@ public class Segment {
      * @return the clickdepth level or 999 if the root url cannot be found or a recursion limit is reached
      * @throws IOException
      */
-    private int getClickDepth(ReferenceReportCache rrc, final DigestURL url, int maxtime) throws IOException {
+    private int getClickDepth(final ReferenceReportCache rrc, final DigestURL url, final int maxtime, final int maxdepth) throws IOException {
         final byte[] searchhash = url.hash();
         RowHandleSet rootCandidates = getPossibleRootHashes(url);
         if (rootCandidates.has(searchhash)) return 0; // the url is a root candidate itself
-        Set<String> ignore = new HashSet<String>(); // a set of urlhashes to be ignored. This is generated from all hashes that are seen during recursion to prevent enless loops
+        Set<String> ignore = new HashSet<String>(); // a set of urlhashes to be ignored. This is generated from all hashes that are seen during recursion to prevent endless loops
         Set<String> levelhashes = new HashSet<String>(); // all hashes of a clickdepth. The first call contains the target hash only and therefore just one entry
         levelhashes.add(ASCII.String(searchhash));
-        int leveldepth = 0; // the recursion depth and therefore the result depth-1. Shall be 0 for the first call
         final byte[] hosthash = new byte[6]; // the host of the url to be checked
         System.arraycopy(searchhash, 6, hosthash, 0, 6);
         long timeout = System.currentTimeMillis() + maxtime;
-        mainloop: for (int maxdepth = 0; maxdepth < 6 && System.currentTimeMillis() < timeout; maxdepth++) {
+        mainloop: for (int leveldepth = 0; leveldepth < maxdepth && System.currentTimeMillis() < timeout; leveldepth++) {
             Set<String> checknext = new HashSet<String>();
@@ -254,15 +254,14 @@ public class Segment {
                 }
                 if (System.currentTimeMillis() > timeout) break mainloop;
             }
-            leveldepth++;
             levelhashes = checknext;
         }
         return 999;
     }
-    private static RowHandleSet getPossibleRootHashes(DigestURL url) {
+    private static RowHandleSet getPossibleRootHashes(final DigestURL url) {
         RowHandleSet rootCandidates = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 10);
-        String rootStub = url.getProtocol() + "://" + url.getHost();
+        String rootStub = url.getProtocol() + "://" + url.getHost() + (url.getProtocol().equals("http") && url.getPort() != 80 ? (":" + url.getPort()) : "");
         try {
             rootCandidates.put(new DigestURL(rootStub).hash());
             rootCandidates.put(new DigestURL(rootStub + "/").hash());
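
Review note: only fragments of getClickDepth are visible here. Pieced together from the visible lines, the method is a breadth-first walk from the target URL backwards along incoming references until a root candidate is found; maxdepth now bounds the number of levels (previously hard-coded to 6) and maxtime the wall-clock budget. A condensed sketch of the control flow; the per-level expansion via the ReferenceReportCache is elided because it is unchanged by this commit and not shown above:

    private int getClickDepth(final ReferenceReportCache rrc, final DigestURL url,
                              final int maxtime, final int maxdepth) throws IOException {
        if (getPossibleRootHashes(url).has(url.hash())) return 0;   // the url is a root itself
        Set<String> levelhashes = new HashSet<String>();            // hashes of the current level
        levelhashes.add(ASCII.String(url.hash()));
        long timeout = System.currentTimeMillis() + maxtime;        // maxtime is a millisecond budget
        mainloop: for (int leveldepth = 0; leveldepth < maxdepth && System.currentTimeMillis() < timeout; leveldepth++) {
            Set<String> checknext = new HashSet<String>();
            // ... for every hash in levelhashes look up its referrers via rrc:
            //     a referrer that is a root candidate ends the search with the current depth,
            //     unseen referrers are collected into checknext ...
            if (System.currentTimeMillis() > timeout) break mainloop;
            levelhashes = checknext;
        }
        return 999; // no root reached within maxdepth levels or maxtime milliseconds
    }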
@@ -277,6 +276,7 @@ public class Segment {
             rootCandidates.put(new DigestURL(rootStub + "/default.php").hash());
             rootCandidates.optimize();
         } catch (final Throwable e) {}
+        rootCandidates.optimize();
         return rootCandidates;
     }
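
Review note: the extended rootStub fixes root detection for http hosts on non-standard ports; before this change the candidate root hashes were always built without a port and could never match such a URL. Hypothetical example values, not taken from the code:

    String protocol = "http", host = "example.net"; int port = 8090;  // hypothetical values
    String rootStub = protocol + "://" + host
            + (protocol.equals("http") && port != 80 ? (":" + port) : "");
    // old form:  "http://example.net"       (port dropped, root candidates never match)
    // new form:  "http://example.net:8090"  (root candidates carry the real port)

Note that only http is special-cased; an https URL on a non-standard port still yields a portless stub.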
@@ -304,25 +304,29 @@ public class Segment {
             }
         }
     }
-    public ClickdepthCache getClickdepthCache(ReferenceReportCache rrc) {
-        return new ClickdepthCache(rrc);
+    public ClickdepthCache getClickdepthCache(ReferenceReportCache rrc, final int maxtime, final int maxdepth) {
+        return new ClickdepthCache(rrc, maxtime, maxdepth);
     }
     public class ClickdepthCache {
-        final ReferenceReportCache rrc;
-        final Map<String, Integer> cache;
-        public ClickdepthCache(ReferenceReportCache rrc) {
+        private final ReferenceReportCache rrc;
+        private final Map<String, Integer> cache;
+        public final int maxdepth; // maximum clickdepth
+        public final int maxtime; // maximum time to compute clickdepth
+        public ClickdepthCache(final ReferenceReportCache rrc, final int maxtime, final int maxdepth) {
            this.rrc = rrc;
            this.cache = new ConcurrentHashMap<String, Integer>();
+            this.maxdepth = maxdepth;
+            this.maxtime = maxtime;
         }
-        public int getClickdepth(final DigestURL url, int maxtime) throws IOException {
+        public int getClickdepth(final DigestURL url) throws IOException {
             Integer clickdepth = cache.get(ASCII.String(url.hash()));
             if (MemoryControl.shortStatus()) cache.clear();
             if (clickdepth != null) {
                 //ConcurrentLog.info("Segment", "get clickdepth of url " + url.toNormalform(true) + ": " + clickdepth + " CACHE HIT");
                 return clickdepth.intValue();
             }
-            clickdepth = Segment.this.getClickDepth(this.rrc, url, maxtime);
+            clickdepth = Segment.this.getClickDepth(this.rrc, url, this.maxtime, this.maxdepth);
             //ConcurrentLog.info("Segment", "get clickdepth of url " + url.toNormalform(true) + ": " + clickdepth);
             this.cache.put(ASCII.String(url.hash()), clickdepth);
             return clickdepth.intValue();
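
Review note: with the limits stored in the cache, one ClickdepthCache created at the start of postprocessing carries the configured bounds to every lookup, and callers no longer pass a time budget per URL. A sketch of the intended wiring, using names from the hunks above (sb standing for the Switchboard instance; the IOException thrown by getClickdepth is not handled here):

    ReferenceReportCache rrCache = index.getReferenceReportCache();
    ClickdepthCache clickdepthCache = index.getClickdepthCache(
            rrCache,
            sb.getConfigInt("postprocessing.clickdepth.maxtime", 100),   // milliseconds
            sb.getConfigInt("postprocessing.clickdepth.maxdepth", 6));   // link levels
    int depth = clickdepthCache.getClickdepth(url);                      // 999 = not reachable within the limits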

@@ -1085,7 +1085,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
                     id = (String) doc.getFieldValue(WebgraphSchema.source_id_s.getSolrFieldName());
                     try {
                         url = new DigestURL(protocol + "://" + urlstub, ASCII.getBytes(id));
-                        postprocessing_clickdepth(clickdepthCache, sid, url, WebgraphSchema.source_clickdepth_i, 100);
+                        postprocessing_clickdepth(clickdepthCache, sid, url, WebgraphSchema.source_clickdepth_i);
                     } catch (MalformedURLException e) {
                     }
                 }
@@ -1095,7 +1095,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
                     id = (String) doc.getFieldValue(WebgraphSchema.target_id_s.getSolrFieldName());
                     try {
                         url = new DigestURL(protocol + "://" + urlstub, ASCII.getBytes(id));
-                        postprocessing_clickdepth(clickdepthCache, sid, url, WebgraphSchema.target_clickdepth_i, 100);
+                        postprocessing_clickdepth(clickdepthCache, sid, url, WebgraphSchema.target_clickdepth_i);
                     } catch (MalformedURLException e) {
                     }
                 }
@@ -1167,7 +1167,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
                     // switch over tag types
                     ProcessType tagtype = ProcessType.valueOf((String) tag);
                     if (tagtype == ProcessType.CLICKDEPTH && collection.contains(CollectionSchema.clickdepth_i)) {
-                        if (postprocessing_clickdepth(clickdepthCache, sid, url, CollectionSchema.clickdepth_i, 100)) proccount_clickdepthchange++;
+                        if (postprocessing_clickdepth(clickdepthCache, sid, url, CollectionSchema.clickdepth_i)) proccount_clickdepthchange++;
                     }
                     if (tagtype == ProcessType.CITATION &&
