From e4a82ddd8b4753a25550a27d5f0172faa44c5728 Mon Sep 17 00:00:00 2001
From: orbiter
Date: Mon, 21 Nov 2011 23:10:29 +0000
Subject: [PATCH] produce a bookmark entry from every crawl start. these
 bookmarks are always private. these bookmarks will be used to get a source
 reference for the search in case of intranet or portal searches.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@8062 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
 defaults/yacy.init                      |  2 +-
 htroot/CrawlStartExpert_p.html          | 28 +++++++++-------------------
 htroot/CrawlStartSite_p.html            |  1 -
 htroot/Crawler_p.java                   | 25 ++++++++++++++++++++-----
 source/de/anomic/crawler/RobotsTxt.java |  3 +--
 5 files changed, 31 insertions(+), 28 deletions(-)

diff --git a/defaults/yacy.init b/defaults/yacy.init
index 16127bda2..ffac6aebe 100644
--- a/defaults/yacy.init
+++ b/defaults/yacy.init
@@ -955,7 +955,7 @@ routing.deleteOldSeeds.time = 30
 
 # options to remember the default search engines when using the search compare features
 compare_yacy.left = YaCy
-compare_yacy.right = YaCy
+compare_yacy.right = google.com
 
 # minimum free disk space for crawling (MiB)
 disk.free = 3000

diff --git a/htroot/CrawlStartExpert_p.html b/htroot/CrawlStartExpert_p.html
index 5d0d55fcc..b166c3733 100644
--- a/htroot/CrawlStartExpert_p.html
+++ b/htroot/CrawlStartExpert_p.html
@@ -39,12 +39,19 @@ Starting Point:
 [markup of this hunk lost in extraction: 2 lines removed, 9 lines added among the "Starting Point" form rows]
@@ -307,23 +314,6 @@
 -->
-			<tr valign="top" class="TableCellLight">
-				<td><label for="createBookmark">Create Bookmark</label>:</td>
-				<td>
-					<input type="checkbox" name="createBookmark" id="createBookmark" />
-					&nbsp;&nbsp;&nbsp;(works with "Starting Point: From URL" only)
-					<br />
-					<label for="bookmarkTitle">Title</label>:&nbsp;&nbsp;&nbsp;
-					<input name="bookmarkTitle" id="bookmarkTitle" type="text" size="50" />
-					<br />
-					<label for="bookmarkFolder">Folder</label>:&nbsp;&nbsp;&nbsp;
-					<input name="bookmarkFolder" id="bookmarkFolder" type="text" size="50" value="/crawlStart" />
-					<br />
-				</td>
-				<td>
-					This option lets you create a bookmark from your crawl start URL.
-				</td>
-			</tr>

diff --git a/htroot/CrawlStartSite_p.html b/htroot/CrawlStartSite_p.html
index bdcfc89c7..a6fce7777 100644
--- a/htroot/CrawlStartSite_p.html
+++ b/htroot/CrawlStartSite_p.html
@@ -104,7 +104,6 @@
-			<input type="hidden" name="createBookmark" value="on" />
diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java
index 05b66ee71..b91376d2e 100644
--- a/htroot/Crawler_p.java
+++ b/htroot/Crawler_p.java
@@ -303,6 +303,7 @@ public class Crawler_p {
                 prop.putHTML("info_crawlingStart", crawlingStart);
             } else try {
+                // check if the crawl filter works correctly
                 Pattern.compile(newcrawlingMustMatch);
@@ -314,6 +315,11 @@ public class Crawler_p {
                     sb.crawlQueues.noticeURL.removeByURLHash(urlhash);
                     sb.crawlQueues.errorURL.remove(urlhash);
 
+                    // get a scraper to get the title
+                    final ContentScraper scraper = sb.loader.parseResource(url, CacheStrategy.IFFRESH);
+                    final String title = scraper == null ? url.toNormalform(true, true) : scraper.getTitle();
+                    //final String description = scraper.getDescription();
+
                     // stack url
                     sb.crawler.removePassive(crawlingStartURL.hash()); // if there is an old entry, delete it
                     final CrawlProfile pe = new CrawlProfile(
@@ -352,21 +358,30 @@ public class Crawler_p {
 
                     if (reasonString == null) {
                         // create a bookmark from crawl start url
-                        final Set<String> tags=ListManager.string2set(BookmarkHelper.cleanTagsString(post.get("bookmarkFolder","/crawlStart")));
+                        //final Set<String> tags=ListManager.string2set(BookmarkHelper.cleanTagsString(post.get("bookmarkFolder","/crawlStart")));
+                        final Set<String> tags=ListManager.string2set(BookmarkHelper.cleanTagsString("/crawlStart"));
                         tags.add("crawlStart");
-                        if ("on".equals(post.get("createBookmark","off"))) {
+                        final String[] keywords = scraper.getKeywords();
+                        if (keywords != null) {
+                            for (final String k: keywords) {
+                                final String kk = BookmarkHelper.cleanTagsString(k);
+                                if (kk.length() > 0) tags.add(kk);
+                            }
+                        }
+                        //if ("on".equals(post.get("createBookmark","off"))) {
+                        // we will create always a bookmark to use this to track crawled hosts
                         final BookmarksDB.Bookmark bookmark = sb.bookmarksDB.createBookmark(crawlingStart, "admin");
                         if (bookmark != null) {
-                            bookmark.setProperty(BookmarksDB.Bookmark.BOOKMARK_TITLE, post.get("bookmarkTitle", crawlingStart));
+                            bookmark.setProperty(BookmarksDB.Bookmark.BOOKMARK_TITLE, title /*post.get("bookmarkTitle", crawlingStart)*/);
                             bookmark.setOwner("admin");
                             bookmark.setPublic(false);
                             bookmark.setTags(tags, true);
                             sb.bookmarksDB.saveBookmark(bookmark);
                         }
-                    }
+                        //}
                         // liftoff!
                         prop.put("info", "8");//start msg
-                        prop.putHTML("info_crawlingURL", (post.get("crawlingURL")));
+                        prop.putHTML("info_crawlingURL", post.get("crawlingURL"));
 
                         // generate a YaCyNews if the global flag was set
                         if (!sb.isRobinsonMode() && crawlOrder) {

diff --git a/source/de/anomic/crawler/RobotsTxt.java b/source/de/anomic/crawler/RobotsTxt.java
index f56ea0e4e..b71666907 100644
--- a/source/de/anomic/crawler/RobotsTxt.java
+++ b/source/de/anomic/crawler/RobotsTxt.java
@@ -45,7 +45,6 @@ import net.yacy.cora.protocol.http.HTTPClient;
 import net.yacy.kelondro.blob.BEncodedHeap;
 import net.yacy.kelondro.index.RowSpaceExceededException;
 import net.yacy.kelondro.io.ByteCount;
-import net.yacy.kelondro.logging.Log;
 
 import org.apache.log4j.Logger;
 
@@ -197,7 +196,7 @@ public class RobotsTxt {
             }
         } else {
             final byte[] robotsTxt = (byte[]) result[DOWNLOAD_ROBOTS_TXT];
-            Log.logInfo("RobotsTxt", "robots of " + robotsURL.toNormalform(true, true) + ":\n" + ((robotsTxt == null) ? "null" : UTF8.String(robotsTxt))); // debug TODO remove
+            //Log.logInfo("RobotsTxt", "robots of " + robotsURL.toNormalform(true, true) + ":\n" + ((robotsTxt == null) ? "null" : UTF8.String(robotsTxt))); // debug TODO remove
             RobotsTxtParser parserResult;
             ArrayList<String> denyPath;
             if (((Boolean) result[DOWNLOAD_ACCESS_RESTRICTED]).booleanValue()) {
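
For reference, the new Crawler_p.java logic reduces to: scrape the crawl-start page for a title and keywords, then always create a private bookmark tagged "crawlStart" in the /crawlStart folder, instead of only when the createBookmark checkbox was set. The standalone sketch below shows just the tag-building step; cleanTag and crawlStartTags are hypothetical simplifications of BookmarkHelper.cleanTagsString and the inline keyword loop, and unlike the patch (which appears to call scraper.getKeywords() even though scraper can be null after a failed fetch) the sketch guards the null case.

    import java.util.LinkedHashSet;
    import java.util.Set;

    public class CrawlStartBookmarkSketch {

        // hypothetical stand-in for BookmarkHelper.cleanTagsString:
        // strip characters that would break tag parsing, then trim
        static String cleanTag(final String tag) {
            return tag.replaceAll("[\"',]", " ").trim();
        }

        // build the tag set for a crawl-start bookmark: the fixed
        // /crawlStart folder tag, the crawlStart marker tag, plus any
        // keywords scraped from the start page
        static Set<String> crawlStartTags(final String[] scrapedKeywords) {
            final Set<String> tags = new LinkedHashSet<String>();
            tags.add("/crawlStart"); // formerly taken from the bookmarkFolder form field
            tags.add("crawlStart");
            if (scrapedKeywords != null) { // guard: the page scrape may have failed
                for (final String k : scrapedKeywords) {
                    final String kk = cleanTag(k);
                    if (kk.length() > 0) tags.add(kk);
                }
            }
            return tags;
        }

        public static void main(final String[] args) {
            // prints [/crawlStart, crawlStart, search, p2p]
            System.out.println(crawlStartTags(new String[] {"search", " p2p ", ""}));
        }
    }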