@@ -160,6 +160,7 @@ import de.anomic.crawler.CrawlQueues;
 import de.anomic.crawler.CrawlStacker;
 import de.anomic.crawler.CrawlSwitchboard;
 import de.anomic.crawler.NoticedURL;
+import de.anomic.crawler.NoticedURL.StackType;
 import de.anomic.crawler.ResourceObserver;
 import de.anomic.crawler.ResultImages;
 import de.anomic.crawler.ResultURLs;
@@ -2791,6 +2792,44 @@ public final class Switchboard extends serverSwitch
         }.start();
     }
 
+    /**
+     * add url to Crawler - which itself loads the URL, parses the content and adds it to the index
+     * transparent alternative to "addToIndex", including a double-in-crawler check and display in the crawl monitor,
+     * but doesn't return results for an ongoing search
+     *
+     * @param url the url that shall be indexed
+     * @param asglobal true adds the url to the global crawl queue (for remote crawling), false to the local crawler
+     */
+    public void addToCrawler(final DigestURI url, final boolean asglobal) {
+
+        if ( this.index.exists(url.hash()) ) {
+            return; // don't do double-work
+        }
+        final Request request = this.loader.request(url, true, true);
+        final CrawlProfile profile = sb.crawler.getActive(ASCII.getBytes(request.profileHandle()));
+        final String acceptedError = this.crawlStacker.checkAcceptance(url, profile, 0);
+        if (acceptedError != null) {
+            this.log.logInfo("addToCrawler: cannot load "
+                + url.toNormalform(false, false)
+                + ": "
+                + acceptedError);
+            return;
+        }
+        final String s;
+        if (asglobal) {
+            s = sb.crawlQueues.noticeURL.push(StackType.GLOBAL, request);
+        } else {
+            s = sb.crawlQueues.noticeURL.push(StackType.LOCAL, request);
+        }
+
+        if (s != null) {
+            Switchboard.this.log.logInfo("addToCrawler: failed to add "
+                + url.toNormalform(false, false)
+                + ": "
+                + s);
+        }
+    }
+
     public class receiptSending implements Runnable
     {
         private final Seed initiatorPeer;
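
For context, a minimal caller sketch of the new method (not part of the patch). The package locations of Switchboard and DigestURI and the static getSwitchboard() accessor are assumptions based on the YaCy codebase of this era; only addToCrawler() itself is defined by the hunk above.

    // Hypothetical usage sketch: queue one URL on the local crawler via the
    // new addToCrawler(). Assumes the YaCy singleton accessor
    // Switchboard.getSwitchboard() and the DigestURI(String) constructor,
    // which throws MalformedURLException on bad input (see the catch in
    // heuristicSearchResults below).
    import java.net.MalformedURLException;

    import net.yacy.kelondro.data.meta.DigestURI; // package location assumed

    import de.anomic.search.Switchboard;          // package location assumed

    public class AddToCrawlerSketch {
        public static void main(final String[] args) throws MalformedURLException {
            final Switchboard sb = Switchboard.getSwitchboard();
            final DigestURI url = new DigestURI("http://example.org/page.html");
            // asglobal = false: push to the local crawl queue;
            // true would target the global (remote) crawl queue instead.
            sb.addToCrawler(url, false);
        }
    }
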
@@ -3155,6 +3194,46 @@ public final class Switchboard extends serverSwitch
         }.start();
     }
 
+    public final void heuristicSearchResults(final String host) {
+        new Thread() {
+
+            @Override
+            public void run() {
+
+                // get the links for a specific site
+                final DigestURI startUrl;
+                try {
+                    startUrl = new DigestURI(host);
+                } catch (final MalformedURLException e) {
+                    Log.logException(e);
+                    return;
+                }
+
+                final Map<MultiProtocolURI, String> links;
+                DigestURI url;
+                try {
+                    links = Switchboard.this.loader.loadLinks(startUrl, CacheStrategy.IFFRESH);
+                    if (links != null) {
+                        if (links.size() < 1000) { // limit to 1000 to skip large index pages
+                            final Iterator<MultiProtocolURI> i = links.keySet().iterator();
+                            final boolean globalcrawljob = sb.getConfigBool("heuristic.searchresults.crawlglobal", false);
+                            while (i.hasNext()) {
+                                url = new DigestURI(i.next());
+                                boolean islocal = url.getHost().contentEquals(startUrl.getHost());
+                                // add all external links, or links to a different page, to the crawler
+                                if ( !islocal ) {// || (!startUrl.getPath().endsWith(url.getPath()))) {
+                                    addToCrawler(url, globalcrawljob);
+                                }
+                            }
+                        }
+                    }
+                } catch (final Throwable e) {
+                    Log.logException(e);
+                }
+            }
+        }.start();
+    }
+
     // blekko pattern: http://blekko.com/ws/$+/rss
     public final void heuristicRSS(
         final String urlpattern,
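
The search-results heuristic above is configuration-driven: the crawlglobal switch it reads decides whether discovered external links go to the global (remote) or local crawl queue. A hedged sketch of how a caller might gate and trigger it follows; the "heuristic.searchresults" master key is an assumption, since only "heuristic.searchresults.crawlglobal" appears in this patch.

    // Hypothetical trigger sketch, not part of the patch. getConfigBool() is
    // the serverSwitch accessor already used in the hunk above.
    import de.anomic.search.Switchboard; // package location assumed, as above

    public class SearchResultsHeuristicSketch {
        public static void trigger(final String resultUrl) {
            final Switchboard sb = Switchboard.getSwitchboard();
            // "heuristic.searchresults" is an assumed companion switch gating
            // the feature; "heuristic.searchresults.crawlglobal" (read inside
            // heuristicSearchResults) picks the global vs. local crawl queue.
            if (sb.getConfigBool("heuristic.searchresults", false)) {
                // starts a background thread that loads the result page,
                // extracts its links and feeds external ones to addToCrawler()
                sb.heuristicSearchResults(resultUrl);
            }
        }
    }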