From e1574fe02e57ad141054241af312a5678d63fe3d Mon Sep 17 00:00:00 2001 From: apfelmaennchen Date: Mon, 4 Aug 2008 20:43:36 +0000 Subject: [PATCH] - added autoReCrawl folders to bookmarks (DATA/SETTINGS/autoReCrawl.conf) - the serverBusyThread checks folders every 60 min. (==> autoReCrawl_idlesleep in yacy.conf) - added option to create bookmarks from CrawlStart URL git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5033 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- defaults/autoReCrawl.conf | 8 ++ htroot/Bookmarks.java | 2 +- htroot/CrawlStart_p.html | 19 +++ htroot/WatchCrawler_p.java | 19 ++- source/de/anomic/data/bookmarksDB.java | 173 ++++++++++++++++++++++++- 5 files changed, 217 insertions(+), 4 deletions(-) create mode 100644 defaults/autoReCrawl.conf diff --git a/defaults/autoReCrawl.conf b/defaults/autoReCrawl.conf new file mode 100644 index 000000000..cbe98f40d --- /dev/null +++ b/defaults/autoReCrawl.conf @@ -0,0 +1,8 @@ +# YaCy autoReCrawl configuration for bookmark folders +# +# schedule|folder|filter|crawlingdepth|crawlingIfOlder|DomFilterDepth|DomMaxPages|crawlingQ|indexText|indexMedia|crawlOrder|xsstopw|storeHTCache +3600000 /autoReCrawl/hourly .* 1 59 -1 -1 1 1 1 1 0 0 +86400000 /autoReCrawl/daily .* 3 1439 -1 -1 1 1 1 1 0 0 +604800000 /autoReCrawl/weekly .* 3 10079 -1 -1 1 1 1 1 0 0 +2678400000 /autoReCrawl/monthly .* 4 44639 -1 -1 1 1 1 1 0 0 +# eof diff --git a/htroot/Bookmarks.java b/htroot/Bookmarks.java index 018447012..d088cb505 100644 --- a/htroot/Bookmarks.java +++ b/htroot/Bookmarks.java @@ -134,7 +134,7 @@ public class Bookmarks { final String pathString = post.get("path"); tagsString=tagsString+","+pathString; if(tagsString.equals("")){ - tagsString="unsorted"; //default tag + tagsString="/unsorted"; //default tag } final Set tags=listManager.string2set(bookmarksDB.cleanTagsString(tagsString)); final bookmarksDB.Bookmark bookmark = sb.bookmarksDB.createBookmark(url, username); diff --git a/htroot/CrawlStart_p.html b/htroot/CrawlStart_p.html index 59a126570..bafd8f784 100644 --- a/htroot/CrawlStart_p.html +++ b/htroot/CrawlStart_p.html @@ -56,6 +56,25 @@ Existing start URLs are always re-crawled. Other already visited URLs are sorted out as "double", if they are not allowed using the re-crawl option. + + + Create Bookmark + + : +     + : +
+

This option works with "Starting Point: From URL" only! + + + This option lets you create a bookmark from your crawl start URL. For automatic re-crawling you can use the following default folders:
+ + : diff --git a/htroot/WatchCrawler_p.java b/htroot/WatchCrawler_p.java index 091310d1c..7d361e5b1 100644 --- a/htroot/WatchCrawler_p.java +++ b/htroot/WatchCrawler_p.java @@ -32,6 +32,7 @@ import java.util.Date; import java.util.HashMap; import java.util.Iterator; import java.util.Map; +import java.util.Set; import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; @@ -40,6 +41,8 @@ import de.anomic.crawler.CrawlEntry; import de.anomic.crawler.CrawlProfile; import de.anomic.crawler.SitemapImporter; import de.anomic.crawler.ZURL; +import de.anomic.data.bookmarksDB; +import de.anomic.data.listManager; import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.htmlFilter.htmlFilterWriter; import de.anomic.http.httpHeader; @@ -207,6 +210,19 @@ public class WatchCrawler_p { final String reasonString = sb.crawlStacker.stackCrawl(url, null, sb.webIndex.seedDB.mySeed().hash, "CRAWLING-ROOT", new Date(), 0, pe); if (reasonString == null) { + // create a bookmark from crawl start url + Set tags=listManager.string2set(bookmarksDB.cleanTagsString(post.get("bookmarkFolder","/crawlStart"))); + tags.add("crawlStart"); + if (post.get("createBookmark","off").equals("on")) { + bookmarksDB.Bookmark bookmark = sb.bookmarksDB.createBookmark(crawlingStart, "admin"); + if(bookmark != null){ + bookmark.setProperty(bookmarksDB.Bookmark.BOOKMARK_TITLE, crawlingStart); + bookmark.setOwner("admin"); + bookmark.setPublic(false); + bookmark.setTags(tags, true); + sb.bookmarksDB.saveBookmark(bookmark); + } + } // liftoff! prop.put("info", "8");//start msg prop.putHTML("info_crawlingURL", (post.get("crawlingURL"))); @@ -227,8 +243,7 @@ public class WatchCrawler_p { m.remove("specificFilter"); m.put("intention", post.get("intention", "").replace(',', '/')); sb.webIndex.newsPool.publishMyNews(yacyNewsRecord.newRecord(sb.webIndex.seedDB.mySeed(), yacyNewsPool.CATEGORY_CRAWL_START, m)); - } - + } } else { prop.put("info", "5"); //Crawling failed prop.putHTML("info_crawlingURL", (post.get("crawlingURL"))); diff --git a/source/de/anomic/data/bookmarksDB.java b/source/de/anomic/data/bookmarksDB.java index cdcb17f55..d4d1e13b0 100644 --- a/source/de/anomic/data/bookmarksDB.java +++ b/source/de/anomic/data/bookmarksDB.java @@ -23,8 +23,13 @@ package de.anomic.data; +import java.io.BufferedReader; import java.io.ByteArrayInputStream; import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.FileWriter; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; @@ -54,6 +59,9 @@ import org.w3c.dom.Node; import org.w3c.dom.NodeList; import org.xml.sax.SAXException; +import de.anomic.crawler.CrawlEntry; +import de.anomic.crawler.CrawlProfile; +import de.anomic.crawler.ZURL; import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.htmlFilter.htmlFilterWriter; import de.anomic.index.indexWord; @@ -62,9 +70,14 @@ import de.anomic.kelondro.kelondroCloneableIterator; import de.anomic.kelondro.kelondroException; import de.anomic.kelondro.kelondroMap; import de.anomic.kelondro.kelondroNaturalOrder; +import de.anomic.plasma.plasmaSwitchboard; +import de.anomic.server.serverBusyThread; import de.anomic.server.serverDate; import de.anomic.server.serverFileUtils; +import de.anomic.server.serverInstantBusyThread; import de.anomic.server.logging.serverLog; +import de.anomic.yacy.yacyNewsPool; +import de.anomic.yacy.yacyNewsRecord; import de.anomic.yacy.yacyURL; public class bookmarksDB { @@ -75,6 +88,7 @@ public class bookmarksDB { final static int SORT_ALPHA = 1; final static int SORT_SIZE = 2; final static int SHOW_ALL = -1; + final static String SLEEP_TIME = "3600000"; // default sleepTime: check for recrawls every hour // bookmarks kelondroMap bookmarksTable; // kelondroMap bookmarksTable; @@ -85,7 +99,9 @@ public class bookmarksDB { // dates kelondroMap datesTable; - + + // autoReCrawl + private serverBusyThread autoReCrawl; // ------------------------------------ // bookmarksDB's class constructor @@ -109,6 +125,14 @@ public class bookmarksDB { this.datesTable = new kelondroMap(new kelondroBLOBTree(datesFile, true, true, 20, 256, '_', kelondroNaturalOrder.naturalOrder, true, false, false), 500); if (!datesExisted) rebuildDates(); + // autoReCrawl + plasmaSwitchboard sb = plasmaSwitchboard.getSwitchboard(); + this.autoReCrawl = new serverInstantBusyThread(this, "autoReCrawl", null, null); + long sleepTime = Long.parseLong(sb.getConfig("autoReCrawl_idlesleep" , SLEEP_TIME)); + sb.deployThread("autoReCrawl", "autoReCrawl Scheduler", "simple scheduler for automatic re-crawls of bookmarked urls", null, autoReCrawl, -1, + sleepTime, sleepTime, Long.parseLong(sb.getConfig("autoReCrawl_memprereq" , "-1")) + ); + serverLog.logInfo("BOOKMARKS", "autoReCrawl - thread initialized checking every "+(sleepTime/1000/60)+" minutes for recrawls"); } // ----------------------------------------------------- @@ -122,6 +146,153 @@ public class bookmarksDB { datesTable.close(); } + // ----------------------------------------------------- + // bookmarksDB's functions for autoReCrawl + // ----------------------------------------------------- + + public boolean autoReCrawl() { + + // read crontab + File f = new File (plasmaSwitchboard.getSwitchboard().getRootPath(),"DATA/SETTINGS/autoReCrawl.conf"); + String s; + try { + BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(f))); + serverLog.logInfo("BOOKMARKS", "autoReCrawl - reading schedules from " + f); + while( null != (s = in.readLine()) ) { + if (!s.startsWith("#") && s.length()>0) { + String parser[] = s.split("\t"); + if (parser.length == 13) { + folderReCrawl(Long.parseLong(parser[0]), parser[1], parser[2], Integer.parseInt(parser[3]), Integer.parseInt(parser[4]), + Integer.parseInt(parser[5]), Integer.parseInt(parser[6]), Boolean.parseBoolean(parser[7]), + Boolean.parseBoolean(parser[8]), Boolean.parseBoolean(parser[9]), + Boolean.parseBoolean(parser[10]), Boolean.parseBoolean(parser[11]), + Boolean.parseBoolean(parser[12]) + ); + } + } + } + in.close(); + } catch( FileNotFoundException ex ) { + try { + serverLog.logInfo("BOOKMARKS", "autoReCrawl - creating new autoReCrawl.conf"); + File inputFile = new File(plasmaSwitchboard.getSwitchboard().getRootPath(),"defaults/autoReCrawl.conf"); + File outputFile = new File(plasmaSwitchboard.getSwitchboard().getRootPath(),"DATA/SETTINGS/autoReCrawl.conf"); + FileReader i = new FileReader(inputFile); + FileWriter o = new FileWriter(outputFile); + int c; + while ((c = i.read()) != -1) + o.write(c); + i.close(); + o.close(); + autoReCrawl(); + return true; + } catch( FileNotFoundException e ) { + serverLog.logSevere("BOOKMARKS", "autoReCrawl - file not found error: defaults/autoReCrawl.conf", e); + return false; + } catch (IOException e) { + serverLog.logSevere("BOOKMARKS", "autoReCrawl - IOException: defaults/autoReCrawl.conf", e); + return false; + } + } catch( Exception ex ) { + serverLog.logSevere("BOOKMARKS", "autoReCrawl - error reading " + f, ex); + return false; + } + return true; + } + + public void folderReCrawl (long schedule, String folder, String newcrawlingfilter, int newcrawlingdepth, int crawlingIfOlder, + int crawlingDomFilterDepth, int crawlingDomMaxPages, boolean crawlingQ, boolean indexText, boolean indexMedia, + boolean crawlOrder, boolean xsstopw, boolean storeHTCache) { + + plasmaSwitchboard sb = plasmaSwitchboard.getSwitchboard(); + Iterator bit=getBookmarksIterator(folder, true); + serverLog.logInfo("BOOKMARKS", "autoReCrawl - processing: "+folder); + + boolean xdstopw = xsstopw; + boolean xpstopw = xsstopw; + + while(bit.hasNext()) { + + Bookmark bm = getBookmark(bit.next()); + long sleepTime = Long.parseLong(sb.getConfig("autoReCrawl_idlesleep" , SLEEP_TIME)); + long interTime = (System.currentTimeMillis()-bm.getTimeStamp())%schedule; + + Date date=new Date(bm.getTimeStamp()); + serverLog.logInfo("BOOKMARKS", "autoReCrawl - checking schedule for: "+"["+serverDate.formatISO8601(date)+"] "+bm.getUrl()); + + if (interTime >= 0 && interTime < sleepTime) { + try { + // check if the crawl filter works correctly + Pattern.compile(newcrawlingfilter); + + // set crawlingStart to BookmarkUrl + String crawlingStart = bm.getUrl(); + + // stack request + // first delete old entry, if exists + yacyURL crawlingStartURL = new yacyURL(crawlingStart, null); + String urlhash = crawlingStartURL.hash(); + sb.webIndex.removeURL(urlhash); + sb.crawlQueues.noticeURL.removeByURLHash(urlhash); + sb.crawlQueues.errorURL.remove(urlhash); + + // stack url + sb.webIndex.profilesPassiveCrawls.removeEntry(crawlingStartURL.hash()); // if there is an old entry, delete it + CrawlProfile.entry pe = sb.webIndex.profilesActiveCrawls.newEntry( + "autoReCrawl", crawlingStartURL, newcrawlingfilter, newcrawlingfilter, + newcrawlingdepth, newcrawlingdepth, + crawlingIfOlder, crawlingDomFilterDepth, crawlingDomMaxPages, + crawlingQ, + indexText, indexMedia, + storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw); + String reasonString = sb.crawlStacker.stackCrawl(crawlingStartURL, null, sb.webIndex.seedDB.mySeed().hash, "CRAWLING-ROOT", new Date(), 0, pe); + + if (reasonString == null) { + serverLog.logInfo("BOOKMARKS", "autoReCrawl - adding crawl profile for: " + crawlingStart); + // generate a YaCyNews if the global flag was set + if (crawlOrder) { + Map m = new HashMap(pe.map()); // must be cloned + m.remove("specificDepth"); + m.remove("indexText"); + m.remove("indexMedia"); + m.remove("remoteIndexing"); + m.remove("xsstopw"); + m.remove("xpstopw"); + m.remove("xdstopw"); + m.remove("storeTXCache"); + m.remove("storeHTCache"); + m.remove("generalFilter"); + m.remove("specificFilter"); + m.put("intention", "Automatic ReCrawl!"); + sb.webIndex.newsPool.publishMyNews(yacyNewsRecord.newRecord(sb.webIndex.seedDB.mySeed(), yacyNewsPool.CATEGORY_CRAWL_START, m)); + } + } else { + serverLog.logInfo("BOOKMARKS", "autoReCrawl error adding crawl profile: " + crawlingStart + "- " + reasonString); + ZURL.Entry ee = sb.crawlQueues.errorURL.newEntry( + new CrawlEntry( + sb.webIndex.seedDB.mySeed().hash, + crawlingStartURL, + "", + "", + new Date(), + pe.handle(), + 0, + 0, + 0), + sb.webIndex.seedDB.mySeed().hash, + new Date(), + 1, + reasonString); + + ee.store(); + sb.crawlQueues.errorURL.push(ee); + } + } catch (MalformedURLException e1) {} + } // if + } // while(bit.hasNext()) + return; + } // } autoReCrawl() + // ------------------------------------- // bookmarksDB's public helper functions // -------------------------------------