enhanced input options for crawl start

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1978 6c8d7289-2bf4-0310-a012-ef5d649a1542
commit 860a7b545b
parent d181d3fde7
Author: orbiter

@@ -46,8 +46,8 @@ You can define URLs as start points for Web page crawling and start crawling her
<tr valign="top" class="TableCellLight">
<td class=small>Re-Crawl Option:</td>
<td class=small>
<input type="checkbox" name="crawlingIfOlderCheck" align="top" #(crawlingIfOlderCheck)#::checked#(/crawlingIfOlderCheck)#>
<input name="crawlingIfOlderNumber" type="text" size="7" maxlength="7" value="#[crawlingIfOlderNumber]#"><br>
Use:<input type="checkbox" name="crawlingIfOlderCheck" align="top" #(crawlingIfOlderCheck)#::checked#(/crawlingIfOlderCheck)#>&nbsp;&nbsp;
Time:<input name="crawlingIfOlderNumber" type="text" size="7" maxlength="7" value="#[crawlingIfOlderNumber]#"><br>
<input type="radio" name="crawlingIfOlderUnit" value="year" #(crawlingIfOlderUnitYearCheck)#::checked#(/crawlingIfOlderUnitYearCheck)#>Year(s)&nbsp;&nbsp;
<input type="radio" name="crawlingIfOlderUnit" value="month" #(crawlingIfOlderUnitMonthCheck)#::checked#(/crawlingIfOlderUnitMonthCheck)#>Month(s)&nbsp;&nbsp;
<input type="radio" name="crawlingIfOlderUnit" value="day" #(crawlingIfOlderUnitDayCheck)#::checked#(/crawlingIfOlderUnitDayCheck)#>Day(s)&nbsp;&nbsp;
@@ -60,17 +60,26 @@ You can define URLs as start points for Web page crawling and start crawling her
</td>
</tr>
<tr valign="top" class="TableCellDark">
<td class=small>Auto-Dom-Filter Depth:</td>
<td class=small><input name="crawlingDomFilterDepth" type="text" size="2" maxlength="2" value="#[crawlingDomFilterDepth]#"></td>
<td class=small>Auto-Dom-Filter:</td>
<td class=small>
Use:<input type="checkbox" name="crawlingDomFilterCheck" align="top" #(crawlingDomFilterCheck)#::checked#(/crawlingDomFilterCheck)#>&nbsp;&nbsp;
Depth:<input name="crawlingDomFilterDepth" type="text" size="2" maxlength="2" value="#[crawlingDomFilterDepth]#"></td>
<td class=small>
This option causes the creation of a domain-list during indexing. The list is filled only with domains that
appear at the given depth during crawling. The domain-list is then used to filter out all domains that appear
at depths greater than the given depth but are not contained in the domain-list. You can use this option e.g.
to crawl pages with bookmarks while restricting the crawl to only those domains that appear on the bookmark-page.
</td>
</tr>
<tr valign="top" class="TableCellLight">
<td class=small>Maximum Pages per Domain:</td>
<td class=small><input name="crawlingDomMaxPages" type="text" size="6" maxlength="6" value="#[crawlingDomMaxPages]#"></td>
<td class=small>
Use:<input type="checkbox" name="crawlingDomMaxCheck" align="top" #(crawlingDomMaxCheck)#::checked#(/crawlingDomMaxCheck)#>&nbsp;&nbsp;
Page-Count:<input name="crawlingDomMaxPages" type="text" size="6" maxlength="6" value="#[crawlingDomMaxPages]#"></td>
<td class=small>
You can limit the maximum number of pages that are fetched and indexed from a single domain with this option.
You can combine this limitation with the 'Auto-Dom-Filter', so that the limit is applied to all the domains within
the given depth. Domains outside the given depth are sorted out anyway.
</td>
</tr>
<tr valign="top" class="TableCellDark">
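The two form descriptions above explain how the new Auto-Dom-Filter and the per-domain page limit are meant to interact during a crawl. Below is a minimal, hypothetical sketch of that filtering idea; the class and method names (DomainCrawlFilter, accept) are invented for illustration and are not part of the YaCy sources, and the real crawler logic differs in detail.

import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

// Hypothetical illustration of the filtering described in the form texts above.
public class DomainCrawlFilter {

    private final int filterDepth;        // depth up to which new domains are collected (-1 = option disabled)
    private final int maxPagesPerDomain;  // per-domain page limit (-1 = unlimited)
    private final Set<String> allowedDomains = new HashSet<String>();
    private final Map<String, Integer> pagesPerDomain = new HashMap<String, Integer>();

    public DomainCrawlFilter(int filterDepth, int maxPagesPerDomain) {
        this.filterDepth = filterDepth;
        this.maxPagesPerDomain = maxPagesPerDomain;
    }

    // Decide whether a URL on the given host, found at the given crawl depth, may be fetched.
    public boolean accept(String host, int depth) {
        if (filterDepth >= 0) {
            if (depth <= filterDepth) {
                // within the filter depth: remember the domain for later
                allowedDomains.add(host);
            } else if (!allowedDomains.contains(host)) {
                // deeper than the filter depth and the domain was never seen before: drop it
                return false;
            }
        }
        if (maxPagesPerDomain >= 0) {
            Integer seen = pagesPerDomain.get(host);
            int count = (seen == null) ? 0 : seen.intValue();
            if (count >= maxPagesPerDomain) return false;  // per-domain page limit reached
            pagesPerDomain.put(host, Integer.valueOf(count + 1));
        }
        return true;
    }
}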

@@ -93,30 +93,43 @@ public class IndexCreate_p {
// set new properties
String newcrawlingfilter = post.get("crawlingFilter", ".*");
env.setConfig("crawlingFilter", newcrawlingfilter);
int newcrawlingdepth = Integer.parseInt(post.get("crawlingDepth", "0"));
env.setConfig("crawlingDepth", Integer.toString(newcrawlingdepth));
boolean crawlingIfOlderCheck = post.get("crawlingIfOlderCheck", "").equals("on");
boolean crawlingIfOlderCheck = post.get("crawlingIfOlderCheck", "off").equals("on");
int crawlingIfOlderNumber = Integer.parseInt(post.get("crawlingIfOlderNumber", "-1"));
String crawlingIfOlderUnit = post.get("crawlingIfOlderUnit","year");
int crawlingIfOlder = recrawlIfOlderC(crawlingIfOlderCheck, crawlingIfOlderNumber, crawlingIfOlderUnit);
env.setConfig("crawlingIfOlder", crawlingIfOlder);
int domFilterDepth = Integer.parseInt(post.get("crawlingDomFilterDepth", "-1"));
env.setConfig("crawlingDomFilterDepth", Integer.toString(domFilterDepth));
int domMaxPages = Integer.parseInt(post.get("crawlingDomMaxPages", "-1"));
env.setConfig("crawlingDomMaxPages", Integer.toString(domMaxPages));
boolean crawlingQ = post.get("crawlingQ", "").equals("on");
boolean crawlingDomFilterCheck = post.get("crawlingDomFilterCheck", "off").equals("on");
int crawlingDomFilterDepth = (crawlingDomFilterCheck) ? Integer.parseInt(post.get("crawlingDomFilterDepth", "-1")) : -1;
env.setConfig("crawlingDomFilterDepth", Integer.toString(crawlingDomFilterDepth));
boolean crawlingDomMaxCheck = post.get("crawlingDomMaxCheck", "off").equals("on");
int crawlingDomMaxPages = (crawlingDomMaxCheck) ? Integer.parseInt(post.get("crawlingDomMaxPages", "-1")) : -1;
env.setConfig("crawlingDomMaxPages", Integer.toString(crawlingDomMaxPages));
boolean crawlingQ = post.get("crawlingQ", "off").equals("on");
env.setConfig("crawlingQ", (crawlingQ) ? "true" : "false");
boolean storeHTCache = post.get("storeHTCache", "").equals("on");
boolean storeHTCache = post.get("storeHTCache", "off").equals("on");
env.setConfig("storeHTCache", (storeHTCache) ? "true" : "false");
boolean localIndexing = post.get("localIndexing", "").equals("on");
boolean localIndexing = post.get("localIndexing", "off").equals("on");
env.setConfig("localIndexing", (localIndexing) ? "true" : "false");
boolean crawlOrder = post.get("crawlOrder", "").equals("on");
boolean crawlOrder = post.get("crawlOrder", "off").equals("on");
env.setConfig("crawlOrder", (crawlOrder) ? "true" : "false");
boolean xsstopw = post.get("xsstopw", "").equals("on");
boolean xsstopw = post.get("xsstopw", "off").equals("on");
env.setConfig("xsstopw", (xsstopw) ? "true" : "false");
boolean xdstopw = post.get("xdstopw", "").equals("on");
boolean xdstopw = post.get("xdstopw", "off").equals("on");
env.setConfig("xdstopw", (xdstopw) ? "true" : "false");
boolean xpstopw = post.get("xpstopw", "").equals("on");
boolean xpstopw = post.get("xpstopw", "off").equals("on");
env.setConfig("xpstopw", (xpstopw) ? "true" : "false");
String crawlingMode = post.get("crawlingMode","url");
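The re-crawl form above supplies a number plus a unit (year/month/day), while the code stores a single crawlingIfOlder value computed by recrawlIfOlderC, with -1 meaning the option is disabled and minutes apparently being the base unit (see the unit handling in the last hunk below). The actual recrawlIfOlderC method is not part of this diff; a rough sketch of such a conversion, under those assumptions, could look like this:

// Hypothetical sketch only: recrawlIfOlderC itself is not shown in this diff.
// Assumptions: the stored age is in minutes, and -1 means "re-crawl check disabled".
public class RecrawlAgeSketch {
    public static int toMinutes(boolean enabled, int number, String unit) {
        if (!enabled || number < 0) return -1;            // checkbox not set or no value given
        if ("year".equals(unit))  return number * 60 * 24 * 365;
        if ("month".equals(unit)) return number * 60 * 24 * 30;
        if ("day".equals(unit))   return number * 60 * 24;
        return number;                                     // treat anything else as minutes
    }
}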
@@ -154,7 +167,7 @@ public class IndexCreate_p {
switchboard.urlPool.errorURL.remove(urlhash);
// stack url
plasmaCrawlProfile.entry pe = switchboard.profiles.newEntry(crawlingStartURL.getHost(), crawlingStart, newcrawlingfilter, newcrawlingfilter, newcrawlingdepth, newcrawlingdepth, crawlingIfOlder, domFilterDepth, domMaxPages, crawlingQ, storeHTCache, true, localIndexing, crawlOrder, xsstopw, xdstopw, xpstopw);
plasmaCrawlProfile.entry pe = switchboard.profiles.newEntry(crawlingStartURL.getHost(), crawlingStart, newcrawlingfilter, newcrawlingfilter, newcrawlingdepth, newcrawlingdepth, crawlingIfOlder, crawlingDomFilterDepth, crawlingDomMaxPages, crawlingQ, storeHTCache, true, localIndexing, crawlOrder, xsstopw, xdstopw, xpstopw);
String reasonString = switchboard.sbStackCrawlThread.stackCrawl(crawlingStart, null, yacyCore.seedDB.mySeed.hash, "CRAWLING-ROOT", new Date(), 0, pe);
if (reasonString == null) {
@@ -215,7 +228,7 @@ public class IndexCreate_p {
HashMap hyperlinks = (HashMap) scraper.getAnchors();
// creating a crawler profile
plasmaCrawlProfile.entry profile = switchboard.profiles.newEntry(fileName, file.toURL().toString(), newcrawlingfilter, newcrawlingfilter, newcrawlingdepth, newcrawlingdepth, crawlingIfOlder, domFilterDepth, domMaxPages, crawlingQ, storeHTCache, true, localIndexing, crawlOrder, xsstopw, xdstopw, xpstopw);
plasmaCrawlProfile.entry profile = switchboard.profiles.newEntry(fileName, file.toURL().toString(), newcrawlingfilter, newcrawlingfilter, newcrawlingdepth, newcrawlingdepth, crawlingIfOlder, crawlingDomFilterDepth, crawlingDomMaxPages, crawlingQ, storeHTCache, true, localIndexing, crawlOrder, xsstopw, xdstopw, xpstopw);
// loop through the contained links
Iterator interator = hyperlinks.entrySet().iterator();
@@ -306,7 +319,7 @@ public class IndexCreate_p {
prop.put("crawlingFilter", env.getConfig("crawlingFilter", "0"));
int crawlingIfOlder = (int) env.getConfigLong("crawlingIfOlder", -1);
prop.put("crawlingIfOlderCheck", (crawlingIfOlder == Integer.MAX_VALUE) ? 0 : 1);
prop.put("crawlingIfOlderCheck", (crawlingIfOlder == -1) ? 0 : 1);
prop.put("crawlingIfOlderUnitYearCheck", 0);
prop.put("crawlingIfOlderUnitMonthCheck", 0);
prop.put("crawlingIfOlderUnitDayCheck", 0);
@@ -329,9 +342,12 @@ public class IndexCreate_p {
prop.put("crawlingIfOlderNumber", crawlingIfOlder);
prop.put("crawlingIfOlderUnitMinuteCheck", 1);
}
//prop.put("crawlingIfOlder", crawlingIfOlder);
prop.put("crawlingDomFilterDepth", env.getConfig("crawlingDomFilterDepth", "-1"));
prop.put("crawlingDomMaxPages", env.getConfig("crawlingDomMaxPages", "-1"));
int crawlingDomFilterDepth = (int) env.getConfigLong("crawlingDomFilterDepth", -1);
prop.put("crawlingDomFilterCheck", (crawlingDomFilterDepth == -1) ? 0 : 1);
prop.put("crawlingDomFilterDepth", (crawlingDomFilterDepth == -1) ? 1 : crawlingDomFilterDepth);
int crawlingDomMaxPages = (int) env.getConfigLong("crawlingDomMaxPages", -1);
prop.put("crawlingDomMaxCheck", (crawlingDomMaxPages == -1) ? 0 : 1);
prop.put("crawlingDomMaxPages", (crawlingDomMaxPages == -1) ? 10000 : crawlingDomMaxPages);
prop.put("crawlingQChecked", env.getConfig("crawlingQ", "").equals("true") ? 1 : 0);
prop.put("storeHTCacheChecked", env.getConfig("storeHTCache", "").equals("true") ? 1 : 0);
prop.put("localIndexingChecked", env.getConfig("localIndexing", "").equals("true") ? 1 : 0);
