Produce a bookmark entry from every crawl start. These bookmarks are always private.

These bookmarks will be used to provide a source reference for the search in the case of intranet or portal searches.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@8062 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 13 years ago
parent 257c399d29
commit e4a82ddd8b

@ -955,7 +955,7 @@ routing.deleteOldSeeds.time = 30
# options to remember the default search engines when using the search compare features
compare_yacy.left = YaCy
compare_yacy.right = YaCy
compare_yacy.right = google.com
# minimum free disk space for crawling (MiB)
disk.free = 3000

@ -39,12 +39,19 @@
<tr valign="top" class="TableCellSummary">
<td>Starting Point:</td>
<td>
<table cellpadding="0" cellspacing="0">
<table cellpadding="0" cellspacing="0">
<tr>
<td><label for="url"><span class="nobr">From URL</span></label>:</td>
<td><input type="radio" name="crawlingMode" id="url" value="url" checked="checked" /></td>
<td>
<input name="crawlingURL" id="crawlingURL" type="text" size="41" maxlength="256" value="#[starturl]#" onkeypress="changed()" onfocus="check('url')" />
<input name="crawlingURL" id="crawlingURL" type="text" size="41" maxlength="256" value="#[starturl]#" onkeypress="changed()" onfocus="check('url')" />
</td>
</tr>
<tr>
<td></td>
<td></td>
<td>
<input name="bookmarkTitle" id="bookmarkTitle" type="text" size="50" maxlength="256" value="" readonly="readonly" style="background:transparent; border:0px"/>
</td>
</tr>
<tr>
@ -307,23 +314,6 @@
</td>
</tr>
-->
<tr valign="top" class="TableCellLight">
<td>Create Bookmark</td>
<td>
<label for="createBookmark">Use</label>:
<input type="checkbox" name="createBookmark" id="createBookmark" />
&nbsp;&nbsp;&nbsp;(works with "Starting Point: From URL" only)
<br /><br />
<label for="bookmarkTitle"> Title</label>:&nbsp;&nbsp;&nbsp;
<input name="bookmarkTitle" id="bookmarkTitle" type="text" size="50" maxlength="100" /><br /><br />
<label for="bookmarkFolder"> Folder</label>:
<input name="bookmarkFolder" id="bookmarkFolder" type="text" size="50" maxlength="100" value="/crawlStart" />
<br />&nbsp;
</td>
<td>
This option lets you create a bookmark from your crawl start URL.
</td>
</tr>
<tr valign="top" class="TableCellLight">
<td colspan="5"><input type="submit" name="crawlingstart" value="Start New Crawl" /></td>
</tr>

@ -104,7 +104,6 @@
<input type="hidden" name="xsstopw" id="xsstopw" value="off" />
<input type="hidden" name="xdstopw" id="xdstopw" value="off" />
<input type="hidden" name="xpstopw" id="xpstopw" value="off" />
<input type="hidden" name="createBookmark" id="createBookmark" value="off" />
</dd>
<!-- <dt>&nbsp;</dt><dd>&nbsp;</dd><dt>&nbsp;</dt><dd>&nbsp;</dd> -->
<dt><label>Start</label></dt>

@ -303,6 +303,7 @@ public class Crawler_p {
prop.putHTML("info_crawlingStart", crawlingStart);
} else try {
// check if the crawl filter works correctly
Pattern.compile(newcrawlingMustMatch);
@ -314,6 +315,11 @@ public class Crawler_p {
sb.crawlQueues.noticeURL.removeByURLHash(urlhash);
sb.crawlQueues.errorURL.remove(urlhash);
// get a scraper to get the title
final ContentScraper scraper = sb.loader.parseResource(url, CacheStrategy.IFFRESH);
final String title = scraper == null ? url.toNormalform(true, true) : scraper.getTitle();
//final String description = scraper.getDescription();
// stack url
sb.crawler.removePassive(crawlingStartURL.hash()); // if there is an old entry, delete it
final CrawlProfile pe = new CrawlProfile(
@ -352,21 +358,30 @@ public class Crawler_p {
if (reasonString == null) {
// create a bookmark from crawl start url
final Set<String> tags=ListManager.string2set(BookmarkHelper.cleanTagsString(post.get("bookmarkFolder","/crawlStart")));
//final Set<String> tags=ListManager.string2set(BookmarkHelper.cleanTagsString(post.get("bookmarkFolder","/crawlStart")));
final Set<String> tags=ListManager.string2set(BookmarkHelper.cleanTagsString("/crawlStart"));
tags.add("crawlStart");
if ("on".equals(post.get("createBookmark","off"))) {
final String[] keywords = scraper.getKeywords();
if (keywords != null) {
for (final String k: keywords) {
final String kk = BookmarkHelper.cleanTagsString(k);
if (kk.length() > 0) tags.add(kk);
}
}
//if ("on".equals(post.get("createBookmark","off"))) {
// we will create always a bookmark to use this to track crawled hosts
final BookmarksDB.Bookmark bookmark = sb.bookmarksDB.createBookmark(crawlingStart, "admin");
if (bookmark != null) {
bookmark.setProperty(BookmarksDB.Bookmark.BOOKMARK_TITLE, post.get("bookmarkTitle", crawlingStart));
bookmark.setProperty(BookmarksDB.Bookmark.BOOKMARK_TITLE, title /*post.get("bookmarkTitle", crawlingStart)*/);
bookmark.setOwner("admin");
bookmark.setPublic(false);
bookmark.setTags(tags, true);
sb.bookmarksDB.saveBookmark(bookmark);
}
}
//}
// liftoff!
prop.put("info", "8");//start msg
prop.putHTML("info_crawlingURL", (post.get("crawlingURL")));
prop.putHTML("info_crawlingURL", post.get("crawlingURL"));
// generate a YaCyNews if the global flag was set
if (!sb.isRobinsonMode() && crawlOrder) {

@ -45,7 +45,6 @@ import net.yacy.cora.protocol.http.HTTPClient;
import net.yacy.kelondro.blob.BEncodedHeap;
import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.io.ByteCount;
import net.yacy.kelondro.logging.Log;
import org.apache.log4j.Logger;
@ -197,7 +196,7 @@ public class RobotsTxt {
}
} else {
final byte[] robotsTxt = (byte[]) result[DOWNLOAD_ROBOTS_TXT];
Log.logInfo("RobotsTxt", "robots of " + robotsURL.toNormalform(true, true) + ":\n" + ((robotsTxt == null) ? "null" : UTF8.String(robotsTxt))); // debug TODO remove
//Log.logInfo("RobotsTxt", "robots of " + robotsURL.toNormalform(true, true) + ":\n" + ((robotsTxt == null) ? "null" : UTF8.String(robotsTxt))); // debug TODO remove
RobotsTxtParser parserResult;
ArrayList<String> denyPath;
if (((Boolean) result[DOWNLOAD_ACCESS_RESTRICTED]).booleanValue()) {

Loading…
Cancel
Save