From e1574fe02e57ad141054241af312a5678d63fe3d Mon Sep 17 00:00:00 2001
From: apfelmaennchen <apfelmaennchen@6c8d7289-2bf4-0310-a012-ef5d649a1542>
Date: Mon, 4 Aug 2008 20:43:36 +0000
Subject: [PATCH] - added autoReCrawl folders to bookmarks
 (DATA/SETTINGS/autoReCrawl.conf) - the serverBusyThread checks folders every
 60 min. (==> autoReCrawl_idlesleep in yacy.conf) - added option to create
 bookmarks from CrawlStart URL

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5033 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
 defaults/autoReCrawl.conf              |   8 ++
 htroot/Bookmarks.java                  |   2 +-
 htroot/CrawlStart_p.html               |  19 +++
 htroot/WatchCrawler_p.java             |  19 ++-
 source/de/anomic/data/bookmarksDB.java | 173 ++++++++++++++++++++++++-
 5 files changed, 217 insertions(+), 4 deletions(-)
 create mode 100644 defaults/autoReCrawl.conf
diff --git a/defaults/autoReCrawl.conf b/defaults/autoReCrawl.conf
new file mode 100644
index 000000000..cbe98f40d
--- /dev/null
+++ b/defaults/autoReCrawl.conf
@@ -0,0 +1,8 @@
+# YaCy autoReCrawl configuration for bookmark folders
+#
+# TAB-separated columns: schedule(ms)|folder|filter|crawlingdepth|crawlingIfOlder|DomFilterDepth|DomMaxPages|crawlingQ|indexText|indexMedia|crawlOrder|xsstopw|storeHTCache
+3600000	/autoReCrawl/hourly	.*	1	59	-1	-1	1	1	1	1	0	0
+86400000	/autoReCrawl/daily	.*	3	1439	-1	-1	1	1	1	1	0	0
+604800000	/autoReCrawl/weekly	.*	3	10079	-1	-1	1	1	1	1	0	0
+2678400000	/autoReCrawl/monthly	.*	4	44639	-1	-1	1	1	1	1	0	0
+# eof
diff --git a/htroot/Bookmarks.java b/htroot/Bookmarks.java
index 018447012..d088cb505 100644
--- a/htroot/Bookmarks.java
+++ b/htroot/Bookmarks.java
@@ -134,7 +134,7 @@ public class Bookmarks {
     			final String pathString = post.get("path");
     			tagsString=tagsString+","+pathString;
     			if(tagsString.equals("")){
-    				tagsString="unsorted"; //default tag
+    				tagsString="/unsorted"; //default tag
     			}
     			final Set<String> tags=listManager.string2set(bookmarksDB.cleanTagsString(tagsString)); 
     			final bookmarksDB.Bookmark bookmark = sb.bookmarksDB.createBookmark(url, username);
diff --git a/htroot/CrawlStart_p.html b/htroot/CrawlStart_p.html
index 59a126570..bafd8f784 100644
--- a/htroot/CrawlStart_p.html
+++ b/htroot/CrawlStart_p.html
@@ -56,6 +56,25 @@
             Existing start URLs are always re-crawled.
             Other already visited URLs are sorted out as "double", if they are not allowed using the re-crawl option.
           </td>
+        </tr>
+        <tr valign="top" class="TableCellDark">
+          <td>Create Bookmark</td>
+          <td>
+	          <label for="createBookmark">Use</label>:
+	          <input type="checkbox" name="createBookmark" id="createBookmark" />&nbsp;&nbsp;&nbsp;
+			  <label for="bookmarkFolder"> Bookmark Folder</label>:
+			  <input name="bookmarkFolder" id="bookmarkFolder" type="text" size="20" maxlength="100" value="/crawlStart" /><br />
+          	  <br/><br/>This option works with "Starting Point: From URL" only!
+          </td>
+          <td>
+            This option lets you create a bookmark from your crawl start URL. For automatic re-crawling you can use the following default folders:<br/>
+            <ul>
+	            <li>/autoReCrawl/hourly</li>
+	            <li>/autoReCrawl/daily</li>
+	            <li>/autoReCrawl/weekly</li>
+	            <li>/autoReCrawl/monthly</li>
+            </ul>            
+          </td>
         </tr>
         <tr valign="top" class="TableCellLight">
           <td><label for="crawlingDepth">Crawling Depth</label>:</td>
diff --git a/htroot/WatchCrawler_p.java b/htroot/WatchCrawler_p.java
index 091310d1c..7d361e5b1 100644
--- a/htroot/WatchCrawler_p.java
+++ b/htroot/WatchCrawler_p.java
@@ -32,6 +32,7 @@ import java.util.Date;
 import java.util.HashMap;
 import java.util.Iterator;
 import java.util.Map;
+import java.util.Set;
 import java.util.regex.Pattern;
 import java.util.regex.PatternSyntaxException;
 
@@ -40,6 +41,8 @@ import de.anomic.crawler.CrawlEntry;
 import de.anomic.crawler.CrawlProfile;
 import de.anomic.crawler.SitemapImporter;
 import de.anomic.crawler.ZURL;
+import de.anomic.data.bookmarksDB;
+import de.anomic.data.listManager;
 import de.anomic.htmlFilter.htmlFilterContentScraper;
 import de.anomic.htmlFilter.htmlFilterWriter;
 import de.anomic.http.httpHeader;
@@ -207,6 +210,19 @@ public class WatchCrawler_p {
                             final String reasonString = sb.crawlStacker.stackCrawl(url, null, sb.webIndex.seedDB.mySeed().hash, "CRAWLING-ROOT", new Date(), 0, pe);
                             
                             if (reasonString == null) {
+                            	// create a bookmark from crawl start url
+                            	Set<String> tags=listManager.string2set(bookmarksDB.cleanTagsString(post.get("bookmarkFolder","/crawlStart")));                                
+                                tags.add("crawlStart");
+                            	if (post.get("createBookmark","off").equals("on")) {
+                                	bookmarksDB.Bookmark bookmark = sb.bookmarksDB.createBookmark(crawlingStart, "admin");
+                        			if(bookmark != null){
+                        				bookmark.setProperty(bookmarksDB.Bookmark.BOOKMARK_TITLE, crawlingStart);                        				
+                        				bookmark.setOwner("admin");                        				
+                        				bookmark.setPublic(false);    
+                        				bookmark.setTags(tags, true);
+                        				sb.bookmarksDB.saveBookmark(bookmark);
+                        			}
+                                }
                                 // liftoff!
                                 prop.put("info", "8");//start msg
                                 prop.putHTML("info_crawlingURL", (post.get("crawlingURL")));
@@ -227,8 +243,7 @@ public class WatchCrawler_p {
                                     m.remove("specificFilter");
                                     m.put("intention", post.get("intention", "").replace(',', '/'));
                                     sb.webIndex.newsPool.publishMyNews(yacyNewsRecord.newRecord(sb.webIndex.seedDB.mySeed(), yacyNewsPool.CATEGORY_CRAWL_START, m));
-                                }
-                                
+                                }                                
                             } else {
                                 prop.put("info", "5"); //Crawling failed
                                 prop.putHTML("info_crawlingURL", (post.get("crawlingURL")));
diff --git a/source/de/anomic/data/bookmarksDB.java b/source/de/anomic/data/bookmarksDB.java
index cdcb17f55..d4d1e13b0 100644
--- a/source/de/anomic/data/bookmarksDB.java
+++ b/source/de/anomic/data/bookmarksDB.java
@@ -23,8 +23,13 @@
 
 package de.anomic.data;
 
+import java.io.BufferedReader;
 import java.io.ByteArrayInputStream;
 import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.FileReader;
+import java.io.FileWriter;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
@@ -54,6 +59,9 @@ import org.w3c.dom.Node;
 import org.w3c.dom.NodeList;
 import org.xml.sax.SAXException;
 
+import de.anomic.crawler.CrawlEntry;
+import de.anomic.crawler.CrawlProfile;
+import de.anomic.crawler.ZURL;
 import de.anomic.htmlFilter.htmlFilterContentScraper;
 import de.anomic.htmlFilter.htmlFilterWriter;
 import de.anomic.index.indexWord;
@@ -62,9 +70,14 @@ import de.anomic.kelondro.kelondroCloneableIterator;
 import de.anomic.kelondro.kelondroException;
 import de.anomic.kelondro.kelondroMap;
 import de.anomic.kelondro.kelondroNaturalOrder;
+import de.anomic.plasma.plasmaSwitchboard;
+import de.anomic.server.serverBusyThread;
 import de.anomic.server.serverDate;
 import de.anomic.server.serverFileUtils;
+import de.anomic.server.serverInstantBusyThread;
 import de.anomic.server.logging.serverLog;
+import de.anomic.yacy.yacyNewsPool;
+import de.anomic.yacy.yacyNewsRecord;
 import de.anomic.yacy.yacyURL;
 
 public class bookmarksDB {
@@ -75,6 +88,7 @@ public class bookmarksDB {
 	final static int SORT_ALPHA = 1;
 	final static int SORT_SIZE = 2;
 	final static int SHOW_ALL = -1;
+	final static String SLEEP_TIME = "3600000"; // default sleepTime: check for recrawls every hour
 	
 	// bookmarks
     kelondroMap bookmarksTable;		// kelondroMap bookmarksTable;
@@ -85,7 +99,9 @@ public class bookmarksDB {
     
     // dates
     kelondroMap datesTable;
-
+    
+    // autoReCrawl    
+    private serverBusyThread autoReCrawl;
     
 	// ------------------------------------
 	// bookmarksDB's class constructor
@@ -109,6 +125,14 @@ public class bookmarksDB {
         this.datesTable = new kelondroMap(new kelondroBLOBTree(datesFile, true, true, 20, 256, '_', kelondroNaturalOrder.naturalOrder, true, false, false), 500);
         if (!datesExisted) rebuildDates();
 
+        // autoReCrawl
+        plasmaSwitchboard sb = plasmaSwitchboard.getSwitchboard();
+        this.autoReCrawl = new serverInstantBusyThread(this, "autoReCrawl", null, null);
+        long sleepTime = Long.parseLong(sb.getConfig("autoReCrawl_idlesleep" , SLEEP_TIME));
+        sb.deployThread("autoReCrawl", "autoReCrawl Scheduler", "simple scheduler for automatic re-crawls of bookmarked urls", null, autoReCrawl, -1,
+                sleepTime, sleepTime, Long.parseLong(sb.getConfig("autoReCrawl_memprereq" , "-1"))
+        );
+        serverLog.logInfo("BOOKMARKS", "autoReCrawl - thread initialized checking every "+(sleepTime/1000/60)+" minutes for recrawls");
     }
 
     // -----------------------------------------------------
@@ -122,6 +146,153 @@ public class bookmarksDB {
         datesTable.close();
     }
     
+    // -----------------------------------------------------
+    // bookmarksDB's functions for autoReCrawl
+    // -----------------------------------------------------
+    /**
+     * Runs folderReCrawl() for every 13-column, tab-separated line of DATA/SETTINGS/autoReCrawl.conf ('#' and empty
+     * lines are skipped); a missing file is first copied from defaults/. Returns false if that or the reading fails.
+     */
+    public boolean autoReCrawl() {
+        final File f = new File(plasmaSwitchboard.getSwitchboard().getRootPath(), "DATA/SETTINGS/autoReCrawl.conf");
+        if (!f.exists() && !copyDefaultSchedules(f)) return false;
+        BufferedReader in = null;
+        try {
+            in = new BufferedReader(new InputStreamReader(new FileInputStream(f)));
+            serverLog.logInfo("BOOKMARKS", "autoReCrawl - reading schedules from " + f);
+            for (String s = in.readLine(); s != null; s = in.readLine()) {
+                final String[] p = s.split("\t");
+                if (s.length() == 0 || s.startsWith("#") || p.length != 13) continue;
+                folderReCrawl(Long.parseLong(p[0]), p[1], p[2], Integer.parseInt(p[3]), Integer.parseInt(p[4]),
+                        Integer.parseInt(p[5]), Integer.parseInt(p[6]), parseFlag(p[7]), parseFlag(p[8]),
+                        parseFlag(p[9]), parseFlag(p[10]), parseFlag(p[11]), parseFlag(p[12]));
+            }
+            return true;
+        } catch (Exception ex) {
+            serverLog.logSevere("BOOKMARKS", "autoReCrawl - error reading " + f, ex);
+            return false;
+        } finally { // release the file handle even if a line could not be parsed
+            if (in != null) try { in.close(); } catch (IOException ignored) { /* nothing left to do */ }
+        }
+    }
+
+    /** Reads a table flag: "1" and "true" (any case) are true. Boolean.parseBoolean() alone reads "1" as false. */
+    private boolean parseFlag(final String value) {
+        return "1".equals(value.trim()) || Boolean.parseBoolean(value.trim());
+    }
+
+    /** Copies defaults/autoReCrawl.conf to target and closes both streams; logs and returns false on failure. */
+    private boolean copyDefaultSchedules(final File target) {
+        serverLog.logInfo("BOOKMARKS", "autoReCrawl - creating new autoReCrawl.conf");
+        FileReader i = null; FileWriter o = null;
+        try {
+            i = new FileReader(new File(plasmaSwitchboard.getSwitchboard().getRootPath(), "defaults/autoReCrawl.conf"));
+            o = new FileWriter(target);
+            for (int c = i.read(); c != -1; c = i.read()) o.write(c);
+            o.flush(); // make sure a write error is reported here and not lost in the best-effort close below
+            return true;
+        } catch (IOException e) {
+            serverLog.logSevere("BOOKMARKS", "autoReCrawl - " + (e instanceof FileNotFoundException ? "file not found error" : "IOException") + ": defaults/autoReCrawl.conf", e);
+            return false;
+        } finally {
+            try { if (i != null) i.close(); } catch (IOException ignored) { /* best effort */ }
+            try { if (o != null) o.close(); } catch (IOException ignored) { /* best effort */ }
+        }
+    }
+
+    /**
+     * Stacks a new crawl for every bookmark of the given folder that is due for a re-crawl.
+     * A bookmark is due when (now - bookmark timestamp) modulo schedule is smaller than the configured
+     * autoReCrawl_idlesleep interval. The crawl parameters are passed on to the crawl profile that is
+     * created for the bookmark url; a rejected start url is recorded in the error-URL queue.
+     * @param schedule re-crawl period in milliseconds; a folder with a schedule &lt;= 0 is skipped
+     * @param folder bookmark folder whose bookmarks are checked
+     * @param newcrawlingfilter regular expression, used as general and as specific crawl filter
+     * @param newcrawlingdepth used as general and as specific crawl depth
+     * @param crawlOrder if true, a crawl-start news record is published for every stacked url
+     * @param xsstopw also used as value for the xdstopw and xpstopw profile flags
+     */
+    public void folderReCrawl (long schedule, String folder, String newcrawlingfilter, int newcrawlingdepth, int crawlingIfOlder,
+            int crawlingDomFilterDepth, int crawlingDomMaxPages, boolean crawlingQ, boolean indexText, boolean indexMedia,
+            boolean crawlOrder, boolean xsstopw, boolean storeHTCache) {
+        serverLog.logInfo("BOOKMARKS", "autoReCrawl - processing: "+folder);
+        if (schedule <= 0) {
+            // the schedule is used as divisor below; zero would raise an ArithmeticException
+            serverLog.logInfo("BOOKMARKS", "autoReCrawl - skipping " + folder + ": schedule must be positive, but is " + schedule);
+            return;
+        }
+        try {
+            // the filter is the same for every bookmark of the folder, so it is checked only once
+            Pattern.compile(newcrawlingfilter);
+        } catch (IllegalArgumentException e) { // PatternSyntaxException is an IllegalArgumentException
+            serverLog.logSevere("BOOKMARKS", "autoReCrawl - skipping " + folder + ": invalid crawl filter " + newcrawlingfilter, e);
+            return;
+        }
+
+        final plasmaSwitchboard sb = plasmaSwitchboard.getSwitchboard();
+        final long sleepTime = Long.parseLong(sb.getConfig("autoReCrawl_idlesleep" , SLEEP_TIME));
+        final Iterator<String> bit = getBookmarksIterator(folder, true);
+        while (bit.hasNext()) {
+            final Bookmark bm = getBookmark(bit.next());
+            if (bm == null) continue; // NOTE(review): assumes getBookmark() can return null for a stale entry - confirm
+            serverLog.logInfo("BOOKMARKS", "autoReCrawl - checking schedule for: "+"["+serverDate.formatISO8601(new Date(bm.getTimeStamp()))+"] "+bm.getUrl());
+
+            final long interTime = (System.currentTimeMillis() - bm.getTimeStamp()) % schedule;
+            if (interTime < 0 || interTime >= sleepTime) continue; // not due in this scheduler cycle
+
+            final String crawlingStart = bm.getUrl();
+            try {
+                // stack request: first delete old entries, if they exist
+                final yacyURL crawlingStartURL = new yacyURL(crawlingStart, null);
+                final String urlhash = crawlingStartURL.hash();
+                sb.webIndex.removeURL(urlhash);
+                sb.crawlQueues.noticeURL.removeByURLHash(urlhash);
+                sb.crawlQueues.errorURL.remove(urlhash);
+                sb.webIndex.profilesPassiveCrawls.removeEntry(urlhash);
+
+                // stack url
+                final CrawlProfile.entry pe = sb.webIndex.profilesActiveCrawls.newEntry(
+                        "autoReCrawl", crawlingStartURL, newcrawlingfilter, newcrawlingfilter,
+                        newcrawlingdepth, newcrawlingdepth,
+                        crawlingIfOlder, crawlingDomFilterDepth, crawlingDomMaxPages,
+                        crawlingQ,
+                        indexText, indexMedia,
+                        storeHTCache, true, crawlOrder, xsstopw, xsstopw, xsstopw);
+                final String reasonString = sb.crawlStacker.stackCrawl(crawlingStartURL, null, sb.webIndex.seedDB.mySeed().hash, "CRAWLING-ROOT", new Date(), 0, pe);
+
+                if (reasonString == null) {
+                    serverLog.logInfo("BOOKMARKS", "autoReCrawl - adding crawl profile for: " + crawlingStart);
+                    // generate a YaCyNews if the global flag was set
+                    if (crawlOrder) {
+                        final Map<String, String> m = new HashMap<String, String>(pe.map()); // must be cloned
+                        m.remove("specificDepth");
+                        m.remove("indexText");
+                        m.remove("indexMedia");
+                        m.remove("remoteIndexing");
+                        m.remove("xsstopw");
+                        m.remove("xpstopw");
+                        m.remove("xdstopw");
+                        m.remove("storeTXCache");
+                        m.remove("storeHTCache");
+                        m.remove("generalFilter");
+                        m.remove("specificFilter");
+                        m.put("intention", "Automatic ReCrawl!");
+                        sb.webIndex.newsPool.publishMyNews(yacyNewsRecord.newRecord(sb.webIndex.seedDB.mySeed(), yacyNewsPool.CATEGORY_CRAWL_START, m));
+                    }
+                } else {
+                    serverLog.logInfo("BOOKMARKS", "autoReCrawl error adding crawl profile: " + crawlingStart + "- " + reasonString);
+                    final ZURL.Entry ee = sb.crawlQueues.errorURL.newEntry(
+                            new CrawlEntry(sb.webIndex.seedDB.mySeed().hash, crawlingStartURL, "", "", new Date(), pe.handle(), 0, 0, 0),
+                            sb.webIndex.seedDB.mySeed().hash, new Date(), 1, reasonString);
+                    ee.store();
+                    sb.crawlQueues.errorURL.push(ee);
+                }
+            } catch (MalformedURLException e) {
+                serverLog.logSevere("BOOKMARKS", "autoReCrawl - skipping malformed bookmark url: " + crawlingStart, e);
+            }
+        } // while(bit.hasNext())
+    } // folderReCrawl()
+
     // -------------------------------------
 	// bookmarksDB's public helper functions
 	// -------------------------------------