added an option to the bookmark import process to push all imported bookmark URLs into the crawler

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@8134 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 13 years ago
parent 77317a88e0
commit 11729061f2

@@ -167,23 +167,8 @@ public class Crawler_p {
// special cases:
if (crawlingStartURL != null && fullDomain) {
if (crawlingStartURL.isFile()) {
newcrawlingMustMatch = "file://" + crawlingStartURL.getPath();
} else if (crawlingStartURL.isSMB()) {
newcrawlingMustMatch = "smb://" + crawlingStartURL.getHost();
} else if (crawlingStartURL.isFTP()) {
newcrawlingMustMatch = "ftp://" + crawlingStartURL.getHost();
} else {
final String host = crawlingStartURL.getHost();
if (host.startsWith("www.")) {
newcrawlingMustMatch = "https?://" + crawlingStartURL.getHost();
} else {
// if the www is not given we accept that also
newcrawlingMustMatch = "https?://(www.)?" + crawlingStartURL.getHost();
}
}
if (subPath) newcrawlingMustMatch += crawlingStartURL.getPath();
newcrawlingMustMatch += ".*";
newcrawlingMustMatch = CrawlProfile.mustMatchFilterFullDomain(crawlingStartURL);
if (subPath) newcrawlingMustMatch = newcrawlingMustMatch.substring(0, newcrawlingMustMatch.length() - 2) + crawlingStartURL.getPath() + ".*";
}
if (crawlingStart != null && subPath && (pos = crawlingStart.lastIndexOf('/')) > 0) {
newcrawlingMustMatch = crawlingStart.substring(0, pos + 1) + ".*";
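
For reference, a minimal sketch (not part of the commit) of the filter strings the refactored branch is expected to build. It assumes the DigestURI and CrawlProfile classes from this tree; the class name, the example URL and the expected values in the comments are illustrative only.

import java.net.MalformedURLException;
import net.yacy.kelondro.data.meta.DigestURI;
import de.anomic.crawler.CrawlProfile;

public class FullDomainFilterSketch {
    public static void main(final String[] args) throws MalformedURLException {
        final DigestURI start = new DigestURI("http://www.example.net/docs/index.html");
        // the full-domain filter now comes from the shared helper instead of inline string concatenation
        final String fullDomain = CrawlProfile.mustMatchFilterFullDomain(start);
        // expected: "https?://www.example.net.*"
        // sub-path variant: cut the trailing ".*", re-append the start path, then add ".*" again
        final String subPath = fullDomain.substring(0, fullDomain.length() - 2) + start.getPath() + ".*";
        // expected: "https?://www.example.net/docs/index.html.*"
        System.out.println(fullDomain);
        System.out.println(subPath);
    }
}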

@@ -141,9 +141,9 @@ To see a list of all APIs, please visit the <a href="http://www.yacy-websuche.de
<div id="ymarks_import_tab">
<form action="/api/ymarks/import_ymark.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
<input type="hidden" value="/YMarks.html" name="redirect">
<h4>Bookmark Importer<img title="help" alt="help" class="help" src="/yacy/ui/img-2/question_blue.png"></h4>
<h4>Bookmark Importer<img alt="help" title="If you put in your bookmarks here, you can access them anywhere where you have access to your YaCy peer. Think of it as your 'personal cloud' for bookmarking." class="help" src="/yacy/ui/img-2/question_blue.png"></h4>
<p>
<input type="radio" name="importer" value="html" /> Netscape HTML<br />
<input type="radio" name="importer" value="html" checked="checked" /> Netscape HTML<br />
<input type="radio" name="importer" value="json" /> Firefox JSON<br />
<input type="radio" name="importer" value="xbel" /> XBEL<br />
<input type="radio" name="importer" value="surro" /> Surrogate XML<br />
@@ -157,7 +157,7 @@ To see a list of all APIs, please visit the <a href="http://www.yacy-websuche.de
<input class="input" type="file" name="bmkfile" id="bmkfile" size="8" /><br />
</p>
<hr />
<h4>Folder settings<img title="help" alt="help" class="help" src="/yacy/ui/img-2/question_blue.png"></h4>
<h4>Folder settings<img alt="help" title="A folder structure is helpful to organize your bookmarks in a hierarchical way." class="help" src="/yacy/ui/img-2/question_blue.png"></h4>
<p>
<small>Source folder</small>
<input class="input" type="text" name="source" id="source" value="" disabled="disabled" />
@@ -167,7 +167,7 @@ To see a list of all APIs, please visit the <a href="http://www.yacy-websuche.de
<input class="input" type="text" name="root" id="root" value="/Imported Bookmarks" />
</p>
<hr />
<h4>Automatic tagging<img title="help" alt="help" class="help" src="/yacy/ui/img-2/question_blue.png"></h4>
<h4>Automatic tagging<img alt="help" title="Tags are words that are attached to documents as metadata. It is possible to read all the documents and find the attached tags automatically." class="help" src="/yacy/ui/img-2/question_blue.png"></h4>
<p>
<input type="radio" name="autotag" value="off" checked="checked" /> Off
<br />
@@ -178,6 +178,19 @@ To see a list of all APIs, please visit the <a href="http://www.yacy-websuche.de
<input type="radio" name="autotag" value="merge" /> Merging with existing tags
</p>
<hr />
<h4>Automatic Indexing<img alt="help" title="While doing the bookmark import, YaCy can push all URLs to the indexing process" class="help" src="/yacy/ui/img-2/question_blue.png"></h4>
<p>
<input type="radio" name="indexing" value="off" checked="checked" /> No indexing
<br />
<input type="radio" name="indexing" value="single" /> Index every bookmark entry
<br />
<input type="radio" name="indexing" value="onelink" /> Index every bookmark entry plus all directly linked pages
<br />
<input type="radio" name="indexing" value="fulldomain" /> Index all domains from all bookmarks completely
<br /><br />
<input type="checkbox" name="medialink" /> also all media (image/movie/document) links
</p>
<hr />
<p style="text-align: right">
<input type="submit" name="importbookmarks" value="Import" />
</p>

@@ -3,21 +3,27 @@ import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.util.Date;
import java.util.Iterator;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.regex.Pattern;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.services.federated.yacy.CacheStrategy;
import net.yacy.document.Parser.Failure;
import net.yacy.document.content.SurrogateReader;
import net.yacy.kelondro.blob.Tables;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.logging.Log;
import net.yacy.search.Switchboard;
import org.xml.sax.SAXException;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.CrawlSwitchboard;
import de.anomic.crawler.retrieval.Request;
import de.anomic.data.BookmarksDB;
import de.anomic.data.UserDB;
import de.anomic.data.WorkTables;
@@ -54,6 +60,8 @@ public class import_ymark {
boolean autotag = false;
boolean merge = false;
boolean empty = false;
final String indexing = post.get("indexing", "off");
final boolean medialink = post.getBoolean("medialink", false);
if(post.containsKey("autotag") && !post.get("autotag", "off").equals("off")) {
autotag = true;
@@ -67,7 +75,7 @@ public class import_ymark {
t.start();
}
if(isAdmin && post.containsKey("table") && post.get("table").length() > 0) {
if(isAdmin && post.containsKey("table") && post.get("table").length() > 0) {
bmk_user = post.get("table").substring(0, post.get("table").indexOf('_',0));
}
if(post.containsKey("redirect") && post.get("redirect").length() > 0) {
@@ -92,7 +100,7 @@ public class import_ymark {
t = new Thread(surrogateReader, "YMarks - Surrogate Reader");
t.start();
while ((bmk = new YMarkEntry(surrogateReader.take())) != YMarkEntry.POISON) {
putBookmark(sb.tables.bookmarks, bmk_user, bmk, autoTaggingQueue, autotag, empty);
putBookmark(sb, bmk_user, bmk, autoTaggingQueue, autotag, empty, indexing, medialink);
}
prop.put("status", "1");
} else {
@@ -110,7 +118,7 @@ public class import_ymark {
t = new Thread(htmlImporter, "YMarks - HTML Importer");
t.start();
while ((bmk = htmlImporter.take()) != YMarkEntry.POISON) {
putBookmark(sb.tables.bookmarks, bmk_user, bmk, autoTaggingQueue, autotag, empty);
putBookmark(sb, bmk_user, bmk, autoTaggingQueue, autotag, empty, indexing, medialink);
}
prop.put("status", "1");
} else if(post.get("importer").equals("xbel") && reader != null) {
@@ -127,7 +135,7 @@ public class import_ymark {
t = new Thread(xbelImporter, "YMarks - XBEL Importer");
t.start();
while ((bmk = xbelImporter.take()) != YMarkEntry.POISON) {
putBookmark(sb.tables.bookmarks, bmk_user, bmk, autoTaggingQueue, autotag, empty);
putBookmark(sb, bmk_user, bmk, autoTaggingQueue, autotag, empty, indexing, medialink);
}
prop.put("status", "1");
} else if(post.get("importer").equals("json") && reader != null) {
@@ -136,7 +144,7 @@ public class import_ymark {
t = new Thread(jsonImporter, "YMarks - JSON Importer");
t.start();
while ((bmk = jsonImporter.take()) != YMarkEntry.POISON) {
putBookmark(sb.tables.bookmarks, bmk_user, bmk, autoTaggingQueue, autotag, empty);
putBookmark(sb, bmk_user, bmk, autoTaggingQueue, autotag, empty, indexing, medialink);
}
prop.put("status", "1");
}
@@ -219,13 +227,13 @@ public class import_ymark {
return prop;
}
public static void putBookmark(final YMarkTables ymarks, final String bmk_user, final YMarkEntry bmk,
final ArrayBlockingQueue<String> autoTaggingQueue, final boolean autotag, final boolean empty) {
public static void putBookmark(final Switchboard sb, final String bmk_user, final YMarkEntry bmk,
final ArrayBlockingQueue<String> autoTaggingQueue, final boolean autotag, final boolean empty, final String indexing, final boolean medialink) {
try {
final String url = bmk.get(YMarkEntry.BOOKMARK.URL.key());
// other protocols could cause problems
if(url != null && url.startsWith("http")) {
ymarks.addBookmark(bmk_user, bmk, true, true);
sb.tables.bookmarks.addBookmark(bmk_user, bmk, true, true);
if(autotag) {
if(!empty) {
autoTaggingQueue.put(url);
@@ -233,6 +241,16 @@ public class import_ymark {
autoTaggingQueue.put(url);
}
}
// fill crawler
if (indexing.equals("single")) {
crawlStart(sb, new DigestURI(url), CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, 0, true, medialink);
} else if (indexing.equals("onelink")) {
crawlStart(sb, new DigestURI(url), CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, 1, true, medialink);
} else if (indexing.equals("fulldomain")) {
final DigestURI u = new DigestURI(url);
crawlStart(sb, u, CrawlProfile.mustMatchFilterFullDomain(u), CrawlProfile.MATCH_NEVER_STRING, 99, false, medialink);
}
}
} catch (final IOException e) {
Log.logException(e);
@@ -242,6 +260,35 @@ public class import_ymark {
Log.logException(e);
}
}
public static String crawlStart(
final Switchboard sb,
final DigestURI startURL,
final String urlMustMatch,
final String urlMustNotMatch,
final int depth,
final boolean crawlingQ, final boolean medialink) {
final CrawlProfile pe = new CrawlProfile(
(startURL.getHost() == null) ? startURL.toNormalform(true, false) : startURL.getHost(), null,
urlMustMatch,
urlMustNotMatch,
CrawlProfile.MATCH_ALL_STRING,
CrawlProfile.MATCH_NEVER_STRING,
"", depth, medialink,
CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_PROXY_RECRAWL_CYCLE), -1, crawlingQ,
true, true, true, false, true, true, true,
CacheStrategy.IFFRESH);
sb.crawler.putActive(pe.handle().getBytes(), pe);
return sb.crawlStacker.stackCrawl(new Request(
sb.peers.mySeed().hash.getBytes(),
startURL,
null,
"CRAWLING-ROOT",
new Date(),
pe.handle(), 0, 0, 0, 0
));
}
}
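
As a usage illustration only (the standalone class, the Switchboard lookup via Switchboard.getSwitchboard(), the example URL and the log message are assumptions, not part of the commit), this sketch shows how a single bookmark URL would be handed to the new crawlStart helper with the "fulldomain" setting; the "single" branch uses depth 0 and "onelink" depth 1, both with crawlingQ enabled.

import java.net.MalformedURLException;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import net.yacy.search.Switchboard;
import de.anomic.crawler.CrawlProfile;

public class YMarkCrawlStartSketch {
    public static void main(final String[] args) throws MalformedURLException {
        final Switchboard sb = Switchboard.getSwitchboard(); // assumes a running peer
        final DigestURI u = new DigestURI("http://www.example.org/");
        // mirrors the "fulldomain" branch above: depth 99, crawlingQ off, media links included
        final String rejectReason = import_ymark.crawlStart(
                sb, u,
                CrawlProfile.mustMatchFilterFullDomain(u),
                CrawlProfile.MATCH_NEVER_STRING,
                99, false, true);
        // stackCrawl() returns null on success, otherwise a textual reject reason
        if (rejectReason != null) Log.logWarning("YMarks", "crawl start rejected: " + rejectReason);
    }
}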

@@ -31,6 +31,7 @@ import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Pattern;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.services.federated.yacy.CacheStrategy;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.word.Word;
@@ -476,4 +477,22 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
public static long getRecrawlDate(final long oldTimeMinutes) {
return System.currentTimeMillis() - (60000L * oldTimeMinutes);
}
public static String mustMatchFilterFullDomain(final MultiProtocolURI crawlingStartURL) {
if (crawlingStartURL.isFile()) {
return "file://" + crawlingStartURL.getPath() + ".*";
} else if (crawlingStartURL.isSMB()) {
return "smb://" + crawlingStartURL.getHost() + ".*";
} else if (crawlingStartURL.isFTP()) {
return "ftp://" + crawlingStartURL.getHost() + ".*";
} else {
final String host = crawlingStartURL.getHost();
if (host.startsWith("www.")) {
return "https?://" + crawlingStartURL.getHost() + ".*";
} else {
// if the www is not given we accept that also
return "https?://(www.)?" + crawlingStartURL.getHost() + ".*";
}
}
}
}
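
A short illustrative check (not part of the commit; the class name and host names are made up) of how the patterns returned by the new helper treat the www. prefix:

import java.net.MalformedURLException;
import java.util.regex.Pattern;
import net.yacy.kelondro.data.meta.DigestURI;
import de.anomic.crawler.CrawlProfile;

public class FullDomainWwwSketch {
    public static void main(final String[] args) throws MalformedURLException {
        // start URL without "www." -> the pattern accepts both spellings of the host
        final String f = CrawlProfile.mustMatchFilterFullDomain(new DigestURI("http://example.net/"));
        // f is "https?://(www.)?example.net.*"
        System.out.println(Pattern.matches(f, "http://example.net/page.html"));      // true
        System.out.println(Pattern.matches(f, "https://www.example.net/page.html")); // true
        // start URL with "www." -> the pattern is pinned to that host spelling
        final String g = CrawlProfile.mustMatchFilterFullDomain(new DigestURI("http://www.example.net/"));
        // g is "https?://www.example.net.*"
        System.out.println(Pattern.matches(g, "http://example.net/page.html"));      // false
    }
}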
