From 11729061f2a47ce387ad458dc0ec0d680166b5f2 Mon Sep 17 00:00:00 2001
From: orbiter
Date: Sat, 3 Dec 2011 00:27:01 +0000
Subject: [PATCH] added an option in the bookmark import process to put
 everything into the crawler

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@8134 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
 htroot/Crawler_p.java                      | 19 +------
 htroot/YMarks.html                         | 21 ++++++--
 htroot/api/ymarks/import_ymark.java        | 63 +++++++++++++++++++---
 source/de/anomic/crawler/CrawlProfile.java | 19 +++++++
 4 files changed, 93 insertions(+), 29 deletions(-)

diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java
index 36a4de9f9..dff58b49b 100644
--- a/htroot/Crawler_p.java
+++ b/htroot/Crawler_p.java
@@ -167,23 +167,8 @@ public class Crawler_p {
 
         // special cases:
         if (crawlingStartURL != null && fullDomain) {
-            if (crawlingStartURL.isFile()) {
-                newcrawlingMustMatch = "file://" + crawlingStartURL.getPath();
-            } else if (crawlingStartURL.isSMB()) {
-                newcrawlingMustMatch = "smb://" + crawlingStartURL.getHost();
-            } else if (crawlingStartURL.isFTP()) {
-                newcrawlingMustMatch = "ftp://" + crawlingStartURL.getHost();
-            } else {
-                final String host = crawlingStartURL.getHost();
-                if (host.startsWith("www.")) {
-                    newcrawlingMustMatch = "https?://" + crawlingStartURL.getHost();
-                } else {
-                    // if the www is not given we accept that also
-                    newcrawlingMustMatch = "https?://(www.)?" + crawlingStartURL.getHost();
-                }
-            }
-            if (subPath) newcrawlingMustMatch += crawlingStartURL.getPath();
-            newcrawlingMustMatch += ".*";
+            newcrawlingMustMatch = CrawlProfile.mustMatchFilterFullDomain(crawlingStartURL);
+            if (subPath) newcrawlingMustMatch = newcrawlingMustMatch.substring(0, newcrawlingMustMatch.length() - 2) + crawlingStartURL.getPath() + ".*";
         }
         if (crawlingStart != null && subPath && (pos = crawlingStart.lastIndexOf('/')) > 0) {
             newcrawlingMustMatch = crawlingStart.substring(0, pos + 1) + ".*";
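Note on the subPath branch above: the filter returned by the new helper CrawlProfile.mustMatchFilterFullDomain() always ends in ".*", so the substring call strips those two characters before appending the start path. A minimal sketch of that string surgery, using a hypothetical host:

```java
// Sketch of the subPath rewrite; "example.org" and "/forum/" are hypothetical.
String filter = "https?://(www.)?example.org.*";   // from mustMatchFilterFullDomain()
String path   = "/forum/";                         // crawlingStartURL.getPath()
String scoped = filter.substring(0, filter.length() - 2) + path + ".*";
// scoped == "https?://(www.)?example.org/forum/.*"
```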
diff --git a/htroot/YMarks.html b/htroot/YMarks.html
index 852c94740..757b8de9a 100644
--- a/htroot/YMarks.html
+++ b/htroot/YMarks.html
@@ -141,9 +141,9 @@ To see a list of all APIs, please visit the
-
 
 	Bookmark Importer help
 
+
 
 	Bookmark Importer help
 
-	Netscape HTML
+	Netscape HTML
 	Firefox JSON
 	XBEL
 	Surrogate XML
@@ -157,7 +157,7 @@ To see a list of all APIs, please visit the
 
 
-
 
 	Folder settings help
 
+
 
 	Folder settings help
 
 	Source folder
@@ -167,7 +167,7 @@ To see a list of all APIs, please visit the
 
 
-
 
 	Automatic tagging help
 
+
 
 	Automatic tagging help
 
 	Off
@@ -178,6 +178,19 @@ To see a list of all APIs, please visit the
 	Merging with existing tags
 
 
+
+	Automatic Indexing help
+
+	No indexing
+	Index every bookmark entry
+	Index every bookmark entry plus all directly linked pages
+	Index all domains from all bookmarks completely
+
+	also all media (image/movie/document) links
+
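The four new radio buttons feed the `indexing` request parameter that import_ymark.java reads in the next file diff. A hedged summary of how the values are wired up there (the comments describe the dispatch in putBookmark() further below):

```java
// How the new form fields arrive on the server side, per the
// import_ymark.java hunks below; value-to-behaviour mapping:
//   "off"        - no crawl is started
//   "single"     - index only the bookmarked page (crawl depth 0)
//   "onelink"    - bookmarked page plus directly linked pages (depth 1)
//   "fulldomain" - whole domain of the bookmark (depth 99, domain filter)
final String indexing   = post.get("indexing", "off");
final boolean medialink = post.getBoolean("medialink", false); // also media links
```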

diff --git a/htroot/api/ymarks/import_ymark.java b/htroot/api/ymarks/import_ymark.java
index 9097c27d7..40a647483 100644
--- a/htroot/api/ymarks/import_ymark.java
+++ b/htroot/api/ymarks/import_ymark.java
@@ -3,21 +3,27 @@ import java.io.IOException;
 import java.io.InputStreamReader;
 import java.io.UnsupportedEncodingException;
 import java.net.MalformedURLException;
+import java.util.Date;
 import java.util.Iterator;
 import java.util.concurrent.ArrayBlockingQueue;
 import java.util.regex.Pattern;
 
 import net.yacy.cora.document.UTF8;
 import net.yacy.cora.protocol.RequestHeader;
+import net.yacy.cora.services.federated.yacy.CacheStrategy;
 import net.yacy.document.Parser.Failure;
 import net.yacy.document.content.SurrogateReader;
 import net.yacy.kelondro.blob.Tables;
+import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.index.RowSpaceExceededException;
 import net.yacy.kelondro.logging.Log;
 import net.yacy.search.Switchboard;
 
 import org.xml.sax.SAXException;
 
+import de.anomic.crawler.CrawlProfile;
+import de.anomic.crawler.CrawlSwitchboard;
+import de.anomic.crawler.retrieval.Request;
 import de.anomic.data.BookmarksDB;
 import de.anomic.data.UserDB;
 import de.anomic.data.WorkTables;
@@ -54,6 +60,8 @@ public class import_ymark {
         boolean autotag = false;
         boolean merge = false;
         boolean empty = false;
+        final String indexing = post.get("indexing", "off");
+        final boolean medialink = post.getBoolean("medialink", false);
 
         if(post.containsKey("autotag") && !post.get("autotag", "off").equals("off")) {
             autotag = true;
@@ -67,7 +75,7 @@ public class import_ymark {
             t.start();
         }
 
-        if(isAdmin && post.containsKey("table") && post.get("table").length() > 0) {
+        if(isAdmin && post.containsKey("table") && post.get("table").length() > 0) {
             bmk_user = post.get("table").substring(0, post.get("table").indexOf('_',0));
         }
         if(post.containsKey("redirect") && post.get("redirect").length() > 0) {
@@ -92,7 +100,7 @@ public class import_ymark {
                 t = new Thread(surrogateReader, "YMarks - Surrogate Reader");
                 t.start();
                 while ((bmk = new YMarkEntry(surrogateReader.take())) != YMarkEntry.POISON) {
-                    putBookmark(sb.tables.bookmarks, bmk_user, bmk, autoTaggingQueue, autotag, empty);
+                    putBookmark(sb, bmk_user, bmk, autoTaggingQueue, autotag, empty, indexing, medialink);
                 }
                 prop.put("status", "1");
             } else {
@@ -110,7 +118,7 @@ public class import_ymark {
                 t = new Thread(htmlImporter, "YMarks - HTML Importer");
                 t.start();
                 while ((bmk = htmlImporter.take()) != YMarkEntry.POISON) {
-                    putBookmark(sb.tables.bookmarks, bmk_user, bmk, autoTaggingQueue, autotag, empty);
+                    putBookmark(sb, bmk_user, bmk, autoTaggingQueue, autotag, empty, indexing, medialink);
                 }
                 prop.put("status", "1");
             } else if(post.get("importer").equals("xbel") && reader != null) {
@@ -127,7 +135,7 @@ public class import_ymark {
                 t = new Thread(xbelImporter, "YMarks - XBEL Importer");
                 t.start();
                 while ((bmk = xbelImporter.take()) != YMarkEntry.POISON) {
-                    putBookmark(sb.tables.bookmarks, bmk_user, bmk, autoTaggingQueue, autotag, empty);
+                    putBookmark(sb, bmk_user, bmk, autoTaggingQueue, autotag, empty, indexing, medialink);
                 }
                 prop.put("status", "1");
             } else if(post.get("importer").equals("json") && reader != null) {
@@ -136,7 +144,7 @@ public class import_ymark {
                 t = new Thread(jsonImporter, "YMarks - JSON Importer");
                 t.start();
                 while ((bmk = jsonImporter.take()) != YMarkEntry.POISON) {
-                    putBookmark(sb.tables.bookmarks, bmk_user, bmk, autoTaggingQueue, autotag, empty);
+                    putBookmark(sb, bmk_user, bmk, autoTaggingQueue, autotag, empty, indexing, medialink);
                 }
                 prop.put("status", "1");
             }
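All four importer loops now hand the whole Switchboard plus the two new parameters through to putBookmark(). For orientation, a minimal hedged call for one prepared entry would look like this (variable setup as in the loops above):

```java
// Hedged usage sketch: store one imported bookmark for "bmk_user" and,
// because indexing == "single", also stack exactly that URL for indexing.
import_ymark.putBookmark(sb, bmk_user, bmk, autoTaggingQueue,
        autotag, empty, "single", /* medialink */ false);
```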
@@ -219,13 +227,13 @@ public class import_ymark {
         return prop;
     }
 
-    public static void putBookmark(final YMarkTables ymarks, final String bmk_user, final YMarkEntry bmk,
-            final ArrayBlockingQueue<String> autoTaggingQueue, final boolean autotag, final boolean empty) {
+    public static void putBookmark(final Switchboard sb, final String bmk_user, final YMarkEntry bmk,
+            final ArrayBlockingQueue<String> autoTaggingQueue, final boolean autotag, final boolean empty, final String indexing, final boolean medialink) {
         try {
             final String url = bmk.get(YMarkEntry.BOOKMARK.URL.key());
             // other protocols could cause problems
             if(url != null && url.startsWith("http")) {
-                ymarks.addBookmark(bmk_user, bmk, true, true);
+                sb.tables.bookmarks.addBookmark(bmk_user, bmk, true, true);
                 if(autotag) {
                     if(!empty) {
                         autoTaggingQueue.put(url);
@@ -233,6 +241,16 @@ public class import_ymark {
                         autoTaggingQueue.put(url);
                     }
                 }
+
+                // fill crawler
+                if (indexing.equals("single")) {
+                    crawlStart(sb, new DigestURI(url), CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, 0, true, medialink);
+                } else if (indexing.equals("onelink")) {
+                    crawlStart(sb, new DigestURI(url), CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, 1, true, medialink);
+                } else if (indexing.equals("fulldomain")) {
+                    final DigestURI u = new DigestURI(url);
+                    crawlStart(sb, u, CrawlProfile.mustMatchFilterFullDomain(u), CrawlProfile.MATCH_NEVER_STRING, 99, false, medialink);
+                }
             }
         } catch (final IOException e) {
             Log.logException(e);
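The crawlStart() helper added in the next hunk creates a one-off CrawlProfile, registers it as active and stacks the start URL on the crawl stacker. Assuming stackCrawl() follows YaCy's usual convention of returning a rejection reason or null on success, a hedged usage sketch matching the "onelink" branch above:

```java
// Hedged sketch; "http://example.org/post" is a hypothetical bookmark URL.
// (DigestURI(String) throws MalformedURLException; handling omitted here.)
final DigestURI u = new DigestURI("http://example.org/post");
final String reason = import_ymark.crawlStart(
        sb, u,
        CrawlProfile.MATCH_ALL_STRING,    // urlMustMatch: follow everything...
        CrawlProfile.MATCH_NEVER_STRING,  // ...and exclude nothing
        1,      // depth: the bookmark plus directly linked pages
        true,   // crawlingQ: also follow URLs with query strings
        false); // medialink: skip media links
if (reason != null) Log.logWarning("YMarks", "crawl rejected: " + reason);
```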
@@ -242,6 +260,35 @@ public class import_ymark {
             Log.logException(e);
         }
     }
+
+    public static String crawlStart(
+            final Switchboard sb,
+            final DigestURI startURL,
+            final String urlMustMatch,
+            final String urlMustNotMatch,
+            final int depth,
+            final boolean crawlingQ, final boolean medialink) {
+        final CrawlProfile pe = new CrawlProfile(
+                (startURL.getHost() == null) ? startURL.toNormalform(true, false) : startURL.getHost(), null,
+                urlMustMatch,
+                urlMustNotMatch,
+                CrawlProfile.MATCH_ALL_STRING,
+                CrawlProfile.MATCH_NEVER_STRING,
+                "", depth, medialink,
+                CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_PROXY_RECRAWL_CYCLE), -1, crawlingQ,
+                true, true, true, false, true, true, true,
+                CacheStrategy.IFFRESH);
+        sb.crawler.putActive(pe.handle().getBytes(), pe);
+        return sb.crawlStacker.stackCrawl(new Request(
+                sb.peers.mySeed().hash.getBytes(),
+                startURL,
+                null,
+                "CRAWLING-ROOT",
+                new Date(),
+                pe.handle(), 0, 0, 0, 0
+                ));
+    }
+
 }
diff --git a/source/de/anomic/crawler/CrawlProfile.java b/source/de/anomic/crawler/CrawlProfile.java
index 4c095ae67..5b667dd6c 100644
--- a/source/de/anomic/crawler/CrawlProfile.java
+++ b/source/de/anomic/crawler/CrawlProfile.java
@@ -31,6 +31,7 @@ import java.util.concurrent.ConcurrentHashMap;
 import java.util.regex.Pattern;
 
 import net.yacy.cora.document.ASCII;
+import net.yacy.cora.document.MultiProtocolURI;
 import net.yacy.cora.services.federated.yacy.CacheStrategy;
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.data.word.Word;
@@ -476,4 +477,22 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String>
     public static long getRecrawlDate(final long oldTimeMinutes) {
         return System.currentTimeMillis() - (60000L * oldTimeMinutes);
     }
+
+    public static String mustMatchFilterFullDomain(final MultiProtocolURI crawlingStartURL) {
+        if (crawlingStartURL.isFile()) {
+            return "file://" + crawlingStartURL.getPath() + ".*";
+        } else if (crawlingStartURL.isSMB()) {
+            return "smb://" + crawlingStartURL.getHost() + ".*";
+        } else if (crawlingStartURL.isFTP()) {
+            return "ftp://" + crawlingStartURL.getHost() + ".*";
+        } else {
+            final String host = crawlingStartURL.getHost();
+            if (host.startsWith("www.")) {
+                return "https?://" + crawlingStartURL.getHost() + ".*";
+            } else {
+                // if the www is not given we accept that also
+                return "https?://(www.)?" + crawlingStartURL.getHost() + ".*";
+            }
+        }
+    }
 }
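For reference, the patterns this helper yields for a few hypothetical start URLs. Note the dots (including the one in "(www.)?") are unescaped regex metacharacters, so the filters are marginally more permissive than literal host matches:

```java
// Hedged examples of mustMatchFilterFullDomain() results (hypothetical URLs):
//   http://www.example.org/x -> "https?://www.example.org.*"
//   http://example.org/x     -> "https?://(www.)?example.org.*"
//   ftp://example.org/pub    -> "ftp://example.org.*"
//   smb://fileserver/share   -> "smb://fileserver.*"
//   file:///home/me/doc.html -> "file:///home/me/doc.html.*"
// A tiny self-check (MultiProtocolURI(String) throws MalformedURLException):
final MultiProtocolURI u = new MultiProtocolURI("http://example.org/x");
assert CrawlProfile.mustMatchFilterFullDomain(u).equals("https?://(www.)?example.org.*");
```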