From e4a82ddd8b4753a25550a27d5f0172faa44c5728 Mon Sep 17 00:00:00 2001
From: orbiter
Date: Mon, 21 Nov 2011 23:10:29 +0000
Subject: [PATCH] produce a bookmark entry from every crawl start. these
 bookmarks are always private. these bookmarks will be used to get a source
 reference for the search in case of intranet or portal searches.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@8062 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
 defaults/yacy.init                      |  2 +-
 htroot/CrawlStartExpert_p.html          | 28 +++++++++-------------------
 htroot/CrawlStartSite_p.html            |  1 -
 htroot/Crawler_p.java                   | 25 ++++++++++++++++++++-----
 source/de/anomic/crawler/RobotsTxt.java |  3 +--
 5 files changed, 31 insertions(+), 28 deletions(-)

diff --git a/defaults/yacy.init b/defaults/yacy.init
index 16127bda2..ffac6aebe 100644
--- a/defaults/yacy.init
+++ b/defaults/yacy.init
@@ -955,7 +955,7 @@ routing.deleteOldSeeds.time = 30
 
 # options to remember the default search engines when using the search compare features
 compare_yacy.left = YaCy
-compare_yacy.right = YaCy
+compare_yacy.right = google.com
 
 # minimum free disk space for crawling (MiB)
 disk.free = 3000

diff --git a/htroot/CrawlStartExpert_p.html b/htroot/CrawlStartExpert_p.html
index 5d0d55fcc..b166c3733 100644
--- a/htroot/CrawlStartExpert_p.html
+++ b/htroot/CrawlStartExpert_p.html
@@ -39,12 +39,19 @@ Starting Point:
 [markup of this hunk lost in extraction: 2 lines removed, 9 lines added among the "Starting Point" form rows]
@@ -307,23 +314,6 @@
 -->
-			<tr valign="top" class="TableCellLight">
-				<td><label for="createBookmark">Create Bookmark</label>:</td>
-				<td>
-					<input type="checkbox" name="createBookmark" id="createBookmark" />
-					&nbsp;&nbsp;&nbsp;(works with "Starting Point: From URL" only)
-					<br />
-					<label for="bookmarkTitle">Title</label>:&nbsp;&nbsp;&nbsp;
-					<input name="bookmarkTitle" id="bookmarkTitle" type="text" size="50" />
-					<br />
-					<label for="bookmarkFolder">Folder</label>:&nbsp;&nbsp;&nbsp;
-					<input name="bookmarkFolder" id="bookmarkFolder" type="text" size="50" value="/crawlStart" />
-					<br />
-				</td>
-				<td>
-					This option lets you create a bookmark from your crawl start URL.
-				</td>
-			</tr>

diff --git a/htroot/CrawlStartSite_p.html b/htroot/CrawlStartSite_p.html
index bdcfc89c7..a6fce7777 100644
--- a/htroot/CrawlStartSite_p.html
+++ b/htroot/CrawlStartSite_p.html
@@ -104,7 +104,6 @@
-			<input type="hidden" name="createBookmark" value="on" />
diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java
index 05b66ee71..b91376d2e 100644
--- a/htroot/Crawler_p.java
+++ b/htroot/Crawler_p.java
@@ -303,6 +303,7 @@ public class Crawler_p {
                 prop.putHTML("info_crawlingStart", crawlingStart);
             } else try {
+                // check if the crawl filter works correctly
                 Pattern.compile(newcrawlingMustMatch);
@@ -314,6 +315,11 @@ public class Crawler_p {
                     sb.crawlQueues.noticeURL.removeByURLHash(urlhash);
                     sb.crawlQueues.errorURL.remove(urlhash);
 
+                    // get a scraper to get the title
+                    final ContentScraper scraper = sb.loader.parseResource(url, CacheStrategy.IFFRESH);
+                    final String title = scraper == null ? url.toNormalform(true, true) : scraper.getTitle();
+                    //final String description = scraper.getDescription();
+
                     // stack url
                     sb.crawler.removePassive(crawlingStartURL.hash()); // if there is an old entry, delete it
                     final CrawlProfile pe = new CrawlProfile(
@@ -352,21 +358,30 @@ public class Crawler_p {
 
                     if (reasonString == null) {
                         // create a bookmark from crawl start url
-                        final Set<String> tags=ListManager.string2set(BookmarkHelper.cleanTagsString(post.get("bookmarkFolder","/crawlStart")));
+                        //final Set<String> tags=ListManager.string2set(BookmarkHelper.cleanTagsString(post.get("bookmarkFolder","/crawlStart")));
+                        final Set<String> tags=ListManager.string2set(BookmarkHelper.cleanTagsString("/crawlStart"));
                         tags.add("crawlStart");
-                        if ("on".equals(post.get("createBookmark","off"))) {
+                        final String[] keywords = scraper.getKeywords();
+                        if (keywords != null) {
+                            for (final String k: keywords) {
+                                final String kk = BookmarkHelper.cleanTagsString(k);
+                                if (kk.length() > 0) tags.add(kk);
+                            }
+                        }
+                        //if ("on".equals(post.get("createBookmark","off"))) {
+                        // we will create always a bookmark to use this to track crawled hosts
                         final BookmarksDB.Bookmark bookmark = sb.bookmarksDB.createBookmark(crawlingStart, "admin");
                         if (bookmark != null) {
-                            bookmark.setProperty(BookmarksDB.Bookmark.BOOKMARK_TITLE, post.get("bookmarkTitle", crawlingStart));
+                            bookmark.setProperty(BookmarksDB.Bookmark.BOOKMARK_TITLE, title /*post.get("bookmarkTitle", crawlingStart)*/);
                             bookmark.setOwner("admin");
                             bookmark.setPublic(false);
                             bookmark.setTags(tags, true);
                             sb.bookmarksDB.saveBookmark(bookmark);
                         }
-                    }
+                        //}
                         // liftoff!
                         prop.put("info", "8");//start msg
-                        prop.putHTML("info_crawlingURL", (post.get("crawlingURL")));
+                        prop.putHTML("info_crawlingURL", post.get("crawlingURL"));
 
                         // generate a YaCyNews if the global flag was set
                         if (!sb.isRobinsonMode() && crawlOrder) {

diff --git a/source/de/anomic/crawler/RobotsTxt.java b/source/de/anomic/crawler/RobotsTxt.java
index f56ea0e4e..b71666907 100644
--- a/source/de/anomic/crawler/RobotsTxt.java
+++ b/source/de/anomic/crawler/RobotsTxt.java
@@ -45,7 +45,6 @@ import net.yacy.cora.protocol.http.HTTPClient;
 import net.yacy.kelondro.blob.BEncodedHeap;
 import net.yacy.kelondro.index.RowSpaceExceededException;
 import net.yacy.kelondro.io.ByteCount;
-import net.yacy.kelondro.logging.Log;
 
 import org.apache.log4j.Logger;
 
@@ -197,7 +196,7 @@ public class RobotsTxt {
             }
         } else {
             final byte[] robotsTxt = (byte[]) result[DOWNLOAD_ROBOTS_TXT];
-            Log.logInfo("RobotsTxt", "robots of " + robotsURL.toNormalform(true, true) + ":\n" + ((robotsTxt == null) ? "null" : UTF8.String(robotsTxt))); // debug TODO remove
+            //Log.logInfo("RobotsTxt", "robots of " + robotsURL.toNormalform(true, true) + ":\n" + ((robotsTxt == null) ? "null" : UTF8.String(robotsTxt))); // debug TODO remove
             RobotsTxtParser parserResult;
             ArrayList<String> denyPath;
             if (((Boolean) result[DOWNLOAD_ACCESS_RESTRICTED]).booleanValue()) {
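
For reference, the new Crawler_p.java logic reduces to: scrape the crawl-start page for a title and keywords, then always create a private bookmark tagged "crawlStart" in the /crawlStart folder, instead of only when the createBookmark checkbox was set. The standalone sketch below shows just the tag-building step; cleanTag and crawlStartTags are hypothetical simplifications of BookmarkHelper.cleanTagsString and the inline keyword loop, and unlike the patch (which appears to call scraper.getKeywords() even though scraper can be null after a failed fetch) the sketch guards the null case.

    import java.util.LinkedHashSet;
    import java.util.Set;

    public class CrawlStartBookmarkSketch {

        // hypothetical stand-in for BookmarkHelper.cleanTagsString:
        // strip characters that would break tag parsing, then trim
        static String cleanTag(final String tag) {
            return tag.replaceAll("[\"',]", " ").trim();
        }

        // build the tag set for a crawl-start bookmark: the fixed
        // /crawlStart folder tag, the crawlStart marker tag, plus any
        // keywords scraped from the start page
        static Set<String> crawlStartTags(final String[] scrapedKeywords) {
            final Set<String> tags = new LinkedHashSet<String>();
            tags.add("/crawlStart"); // formerly taken from the bookmarkFolder form field
            tags.add("crawlStart");
            if (scrapedKeywords != null) { // guard: the page scrape may have failed
                for (final String k : scrapedKeywords) {
                    final String kk = cleanTag(k);
                    if (kk.length() > 0) tags.add(kk);
                }
            }
            return tags;
        }

        public static void main(final String[] args) {
            // prints [/crawlStart, crawlStart, search, p2p]
            System.out.println(crawlStartTags(new String[] {"search", " p2p ", ""}));
        }
    }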