Produce a bookmark entry from every crawl start. These bookmarks are always private.

These bookmarks will be used to provide a source reference for the search in the case of intranet or portal searches.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@8062 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 13 years ago
parent 257c399d29
commit e4a82ddd8b

@ -955,7 +955,7 @@ routing.deleteOldSeeds.time = 30
# options to remember the default search engines when using the search compare features
compare_yacy.left = YaCy
compare_yacy.right = YaCy
compare_yacy.right = google.com
# minimum free disk space for crawling (MiB)
disk.free = 3000

@ -39,12 +39,19 @@
<tr valign="top" class="TableCellSummary">
<td>Starting Point:</td>
<td>
<table cellpadding="0" cellspacing="0">
<table cellpadding="0" cellspacing="0">
<tr>
<td><label for="url"><span class="nobr">From URL</span></label>:</td>
<td><input type="radio" name="crawlingMode" id="url" value="url" checked="checked" /></td>
<td>
<input name="crawlingURL" id="crawlingURL" type="text" size="41" maxlength="256" value="#[starturl]#" onkeypress="changed()" onfocus="check('url')" />
<input name="crawlingURL" id="crawlingURL" type="text" size="41" maxlength="256" value="#[starturl]#" onkeypress="changed()" onfocus="check('url')" />
</td>
</tr>
<tr>
<td></td>
<td></td>
<td>
<input name="bookmarkTitle" id="bookmarkTitle" type="text" size="50" maxlength="256" value="" readonly="readonly" style="background:transparent; border:0px"/>
</td>
</tr>
<tr>
@ -307,23 +314,6 @@
</td>
</tr>
-->
<tr valign="top" class="TableCellLight">
<td>Create Bookmark</td>
<td>
<label for="createBookmark">Use</label>:
<input type="checkbox" name="createBookmark" id="createBookmark" />
&nbsp;&nbsp;&nbsp;(works with "Starting Point: From URL" only)
<br /><br />
<label for="bookmarkTitle"> Title</label>:&nbsp;&nbsp;&nbsp;
<input name="bookmarkTitle" id="bookmarkTitle" type="text" size="50" maxlength="100" /><br /><br />
<label for="bookmarkFolder"> Folder</label>:
<input name="bookmarkFolder" id="bookmarkFolder" type="text" size="50" maxlength="100" value="/crawlStart" />
<br />&nbsp;
</td>
<td>
This option lets you create a bookmark from your crawl start URL.
</td>
</tr>
<tr valign="top" class="TableCellLight">
<td colspan="5"><input type="submit" name="crawlingstart" value="Start New Crawl" /></td>
</tr>

@ -104,7 +104,6 @@
<input type="hidden" name="xsstopw" id="xsstopw" value="off" />
<input type="hidden" name="xdstopw" id="xdstopw" value="off" />
<input type="hidden" name="xpstopw" id="xpstopw" value="off" />
<input type="hidden" name="createBookmark" id="createBookmark" value="off" />
</dd>
<!-- <dt>&nbsp;</dt><dd>&nbsp;</dd><dt>&nbsp;</dt><dd>&nbsp;</dd> -->
<dt><label>Start</label></dt>

@ -303,6 +303,7 @@ public class Crawler_p {
prop.putHTML("info_crawlingStart", crawlingStart);
} else try {
// check if the crawl filter works correctly
Pattern.compile(newcrawlingMustMatch);
@ -314,6 +315,11 @@ public class Crawler_p {
sb.crawlQueues.noticeURL.removeByURLHash(urlhash);
sb.crawlQueues.errorURL.remove(urlhash);
// get a scraper to get the title
final ContentScraper scraper = sb.loader.parseResource(url, CacheStrategy.IFFRESH);
final String title = scraper == null ? url.toNormalform(true, true) : scraper.getTitle();
//final String description = scraper.getDescription();
// stack url
sb.crawler.removePassive(crawlingStartURL.hash()); // if there is an old entry, delete it
final CrawlProfile pe = new CrawlProfile(
@ -352,21 +358,30 @@ public class Crawler_p {
if (reasonString == null) {
// create a bookmark from crawl start url
final Set<String> tags=ListManager.string2set(BookmarkHelper.cleanTagsString(post.get("bookmarkFolder","/crawlStart")));
//final Set<String> tags=ListManager.string2set(BookmarkHelper.cleanTagsString(post.get("bookmarkFolder","/crawlStart")));
final Set<String> tags=ListManager.string2set(BookmarkHelper.cleanTagsString("/crawlStart"));
tags.add("crawlStart");
if ("on".equals(post.get("createBookmark","off"))) {
final String[] keywords = scraper.getKeywords();
if (keywords != null) {
for (final String k: keywords) {
final String kk = BookmarkHelper.cleanTagsString(k);
if (kk.length() > 0) tags.add(kk);
}
}
//if ("on".equals(post.get("createBookmark","off"))) {
// we will create always a bookmark to use this to track crawled hosts
final BookmarksDB.Bookmark bookmark = sb.bookmarksDB.createBookmark(crawlingStart, "admin");
if (bookmark != null) {
bookmark.setProperty(BookmarksDB.Bookmark.BOOKMARK_TITLE, post.get("bookmarkTitle", crawlingStart));
bookmark.setProperty(BookmarksDB.Bookmark.BOOKMARK_TITLE, title /*post.get("bookmarkTitle", crawlingStart)*/);
bookmark.setOwner("admin");
bookmark.setPublic(false);
bookmark.setTags(tags, true);
sb.bookmarksDB.saveBookmark(bookmark);
}
}
//}
// liftoff!
prop.put("info", "8");//start msg
prop.putHTML("info_crawlingURL", (post.get("crawlingURL")));
prop.putHTML("info_crawlingURL", post.get("crawlingURL"));
// generate a YaCyNews if the global flag was set
if (!sb.isRobinsonMode() && crawlOrder) {

@ -45,7 +45,6 @@ import net.yacy.cora.protocol.http.HTTPClient;
import net.yacy.kelondro.blob.BEncodedHeap;
import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.io.ByteCount;
import net.yacy.kelondro.logging.Log;
import org.apache.log4j.Logger;
@ -197,7 +196,7 @@ public class RobotsTxt {
}
} else {
final byte[] robotsTxt = (byte[]) result[DOWNLOAD_ROBOTS_TXT];
Log.logInfo("RobotsTxt", "robots of " + robotsURL.toNormalform(true, true) + ":\n" + ((robotsTxt == null) ? "null" : UTF8.String(robotsTxt))); // debug TODO remove
//Log.logInfo("RobotsTxt", "robots of " + robotsURL.toNormalform(true, true) + ":\n" + ((robotsTxt == null) ? "null" : UTF8.String(robotsTxt))); // debug TODO remove
RobotsTxtParser parserResult;
ArrayList<String> denyPath;
if (((Boolean) result[DOWNLOAD_ACCESS_RESTRICTED]).booleanValue()) {

Loading…
Cancel
Save